diff --git a/.github/actions/setup-claude-code-action/action.yml b/.github/actions/setup-claude-code-action/action.yml index dbc832aba..de25bab99 100644 --- a/.github/actions/setup-claude-code-action/action.yml +++ b/.github/actions/setup-claude-code-action/action.yml @@ -41,7 +41,7 @@ runs: - name: Setup Python uses: actions/setup-python@v6 with: - python-version: "3.14" + python-version: "3.13" - name: Install LiteLLM dependencies shell: bash diff --git a/.github/renovate.json b/.github/renovate.json index 761bff78a..b864ff7c3 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -11,7 +11,8 @@ ], "commitMessageAction": "Renovate: Update", "constraints": { - "go": "1.26" + "go": "1.26", + "python": "3.13" }, "dependencyDashboardOSVVulnerabilitySummary": "all", "osvVulnerabilityAlerts": true, diff --git a/.github/workflows/claude-assistant.yaml b/.github/workflows/claude-assistant.yaml index bbac074ef..1ce4b1234 100644 --- a/.github/workflows/claude-assistant.yaml +++ b/.github/workflows/claude-assistant.yaml @@ -1,9 +1,3 @@ -# NOTE: This workflow is temporarily disabled. -# We need to determine a compliant approach for passing AI Core credentials -# and whether utilizing the SAP AI Core Proxy is a viable and compliant option. -# Until this is resolved, no AI Core credentials are configured and no -# communication with AI Core takes place. - name: Claude Code Assistant on Issues and PRs on: @@ -18,7 +12,6 @@ on: jobs: check-allowlist: - if: false # Temporarily disabled runs-on: ubuntu-latest outputs: allowed: ${{ steps.check.outputs.allowed }} @@ -38,7 +31,6 @@ jobs: claude: needs: check-allowlist - if: false # Temporarily disabled runs-on: ubuntu-latest permissions: contents: write @@ -52,9 +44,13 @@ jobs: uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.CORTEX_AI_AGENTS_APP_ID }} + private-key: ${{ secrets.CORTEX_AI_AGENTS_CLIENT_PKEY }} - uses: ./.github/actions/setup-claude-code-action - - uses: ./.github/actions/start-litellm-proxy env: AICORE_RESOURCE_GROUP: ${{ secrets.AICORE_RESOURCE_GROUP }} @@ -62,19 +58,18 @@ jobs: AICORE_AUTH_URL: ${{ secrets.AICORE_AUTH_URL }} AICORE_CLIENT_ID: ${{ secrets.AICORE_CLIENT_ID }} AICORE_CLIENT_SECRET: ${{ secrets.AICORE_CLIENT_SECRET }} - - uses: ./.claude-code-action with: claude_args: | --max-turns 1000 --permission-mode auto + --effort-level max --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch" trigger_phrase: "@claude" include_comments_by_actor: "auhlig,umswmayj,juliusclausnitzer,mblos,PhilippMatthes,Varsius,henrichter,SoWieMarkus,*[bot]" use_litellm: "true" litellm_model: "sap/anthropic--claude-4.6-opus" - github_token: ${{ secrets.GITHUB_TOKEN }} + github_token: ${{ steps.app-token.outputs.token }} show_full_output: "true" - - uses: ./.github/actions/stop-litellm-proxy if: always() diff --git a/.github/workflows/claude-release.yaml b/.github/workflows/claude-release.yaml deleted file mode 100644 index 216716809..000000000 --- a/.github/workflows/claude-release.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: Claude Code Release Orchestrator - -on: - pull_request: - types: [opened, synchronize, reopened] - branches: - - release - -jobs: - release: - if: false # Temporarily disabled - runs-on: ubuntu-latest - concurrency: - group: changelog-release - cancel-in-progress: false - permissions: - contents: write - pull-requests: write - id-token: write - steps: - - name: Checkout code - uses: 
actions/checkout@v6 - - - name: Set up Go - uses: actions/setup-go@v6 - with: - go-version-file: 'go.mod' - - - uses: ./.github/actions/setup-claude-code-action - - - uses: ./.github/actions/start-litellm-proxy - env: - AICORE_RESOURCE_GROUP: ${{ secrets.AICORE_RESOURCE_GROUP }} - AICORE_BASE_URL: ${{ secrets.AICORE_BASE_URL }} - AICORE_AUTH_URL: ${{ secrets.AICORE_AUTH_URL }} - AICORE_CLIENT_ID: ${{ secrets.AICORE_CLIENT_ID }} - AICORE_CLIENT_SECRET: ${{ secrets.AICORE_CLIENT_SECRET }} - - - uses: ./.claude-code-action - with: - claude_args: | - --max-turns 1000 - --permission-mode auto - --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch,Agent" - use_litellm: "true" - litellm_model: "sap/anthropic--claude-4.6-opus" - github_token: ${{ secrets.GITHUB_TOKEN }} - show_full_output: "true" - prompt: "/release ${{ github.event.pull_request.number }}" - - - uses: ./.github/actions/stop-litellm-proxy - if: always() diff --git a/.github/workflows/claude-review.yaml b/.github/workflows/claude-review.yaml deleted file mode 100644 index c16b9d72c..000000000 --- a/.github/workflows/claude-review.yaml +++ /dev/null @@ -1,74 +0,0 @@ -# NOTE: This workflow is temporarily disabled. -# We need to determine a compliant approach for passing AI Core credentials -# and whether utilizing the SAP AI Core Proxy is a viable and compliant option. -# Until this is resolved, no AI Core credentials are configured and no -# communication with AI Core takes place. - -name: Claude Code reviewer on PRs - -on: - pull_request: - types: [opened, synchronize, reopened] - -jobs: - check-allowlist: - if: false # Temporarily disabled - runs-on: ubuntu-latest - outputs: - allowed: ${{ steps.check.outputs.allowed }} - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 1 - - name: Check sender against allowlist - id: check - run: | - if grep -qxF "${{ github.event.sender.login }}" \ - <(grep -v '^#' .github/claude-allowed-users | sed '/^[[:space:]]*$/d'); then - echo "allowed=true" >> $GITHUB_OUTPUT - else - echo "allowed=false" >> $GITHUB_OUTPUT - fi - - claude: - needs: check-allowlist - if: false # Temporarily disabled - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - issues: write - id-token: write - steps: - - name: Checkout code - uses: actions/checkout@v6 - - name: Set up Go - uses: actions/setup-go@v6 - with: - go-version-file: 'go.mod' - - - uses: ./.github/actions/setup-claude-code-action - - - uses: ./.github/actions/start-litellm-proxy - env: - AICORE_RESOURCE_GROUP: ${{ secrets.AICORE_RESOURCE_GROUP }} - AICORE_BASE_URL: ${{ secrets.AICORE_BASE_URL }} - AICORE_AUTH_URL: ${{ secrets.AICORE_AUTH_URL }} - AICORE_CLIENT_ID: ${{ secrets.AICORE_CLIENT_ID }} - AICORE_CLIENT_SECRET: ${{ secrets.AICORE_CLIENT_SECRET }} - - - uses: ./.claude-code-action - with: - claude_args: | - --max-turns 1000 - --permission-mode auto - --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch" - prompt: "/review-pr REPO: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }}" - include_comments_by_actor: "auhlig,umswmayj,juliusclausnitzer,mblos,PhilippMatthes,Varsius,henrichter,SoWieMarkus,*[bot]" - use_litellm: "true" - litellm_model: "sap/anthropic--claude-4.6-opus" - github_token: ${{ secrets.GITHUB_TOKEN }} - show_full_output: "true" - - - uses: ./.github/actions/stop-litellm-proxy - if: always() diff --git a/.github/workflows/claude-weekly.yaml b/.github/workflows/claude-weekly.yaml index a42f157fd..de4e940df 100644 --- a/.github/workflows/claude-weekly.yaml +++ 
b/.github/workflows/claude-weekly.yaml @@ -1,9 +1,3 @@ -# NOTE: This workflow is temporarily disabled. -# We need to determine a compliant approach for passing AI Core credentials -# and whether utilizing the SAP AI Core Proxy is a viable and compliant option. -# Until this is resolved, no AI Core credentials are configured and no -# communication with AI Core takes place. - name: Weekly Claude Code Repo Analysis and Grooming on: @@ -13,7 +7,6 @@ on: jobs: claude: - if: false # Temporarily disabled runs-on: ubuntu-latest permissions: contents: write @@ -26,9 +19,13 @@ jobs: uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - + - name: Generate GitHub App token + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.CORTEX_AI_AGENTS_APP_ID }} + private-key: ${{ secrets.CORTEX_AI_AGENTS_CLIENT_PKEY }} - uses: ./.github/actions/setup-claude-code-action - - uses: ./.github/actions/start-litellm-proxy env: AICORE_RESOURCE_GROUP: ${{ secrets.AICORE_RESOURCE_GROUP }} @@ -36,18 +33,17 @@ jobs: AICORE_AUTH_URL: ${{ secrets.AICORE_AUTH_URL }} AICORE_CLIENT_ID: ${{ secrets.AICORE_CLIENT_ID }} AICORE_CLIENT_SECRET: ${{ secrets.AICORE_CLIENT_SECRET }} - - uses: ./.claude-code-action with: prompt: "/weekly" claude_args: | --max-turns 1000 --permission-mode auto + --effort-level max --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch" use_litellm: "true" litellm_model: "sap/anthropic--claude-4.6-opus" - github_token: ${{ secrets.GITHUB_TOKEN }} + github_token: ${{ steps.app-token.outputs.token }} show_full_output: "true" - - uses: ./.github/actions/stop-litellm-proxy if: always() diff --git a/AGENTS.md b/AGENTS.md index 59747bd8c..6699e8449 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -35,7 +35,6 @@ General: - You can use `maps.Copy` instead of iteratively copying a map - You can use `strings.Contains` to check if some string is in another - You can use `slices.Contains` to check if an element is part of a slice -- And definitely use `testlib.Ptr` for test cases that require pointer values Testing: - Ideally test files should be short and contain only the necessary cases diff --git a/CHANGELOG.md b/CHANGELOG.md index a7ff9901a..19febfd60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,39 @@ # Changelog +## 2026-05-07 — [#814](https://github.com/cobaltcore-dev/cortex/pull/814) + +### cortex v0.0.47 (sha-b8cecd0c) + +Non-breaking changes: +- Add `ProjectQuota` CRD with per-resource, per-AZ quota breakdown and PAYG (pay-as-you-go) calculation support ([#796](https://github.com/cobaltcore-dev/cortex/pull/796)) +- Add `FlavorGroupCapacity` CRD and background capacity controller that pre-computes per-flavor VM slot capacity for each (flavor group × AZ) pair on a configurable interval ([#728](https://github.com/cobaltcore-dev/cortex/pull/728)) +- Report capacity from `FlavorGroupCapacity` CRDs in `POST /commitments/v1/report-capacity` — replaces placeholder zeros with real values; stale CRDs report last-known capacity +- Move CommittedResource usage computation from the API handler into a dedicated reconciler that persists results in CRD status, making usage data available to both the LIQUID API and quota controller ([#800](https://github.com/cobaltcore-dev/cortex/pull/800)) +- Add KVM OS version as a label to KVM host capacity metrics ([#810](https://github.com/cobaltcore-dev/cortex/pull/810)) +- Add KVM project usage metrics (running VMs and resource usage per project/flavor) ([#803](https://github.com/cobaltcore-dev/cortex/pull/803)) +- Add `domain_id` and 
name to vmware project capacity metrics ([#802](https://github.com/cobaltcore-dev/cortex/pull/802)) +- Include `domain_id` in vmware project commitment KPI ([#806](https://github.com/cobaltcore-dev/cortex/pull/806)) +- Add weighing explainer for scheduling decisions, surfacing per-host scoring rationale ([#808](https://github.com/cobaltcore-dev/cortex/pull/808)) +- Move KVM host capacity metric into infrastructure plugins package ([#809](https://github.com/cobaltcore-dev/cortex/pull/809)) +- Remove deprecated per-compute infrastructure KPIs (`flavor_running_vms`, `host_running_vms`, `resource_capacity_kvm`) ([#807](https://github.com/cobaltcore-dev/cortex/pull/807)) +- Rename hypervisor `ClusterRoleBinding` objects to avoid `roleRef` conflicts on redeploy ([#804](https://github.com/cobaltcore-dev/cortex/pull/804)) +- Move bundle-specific RBAC templates from the library chart into individual bundle charts (`cortex-ironcore`, `cortex-pods`) ([#797](https://github.com/cobaltcore-dev/cortex/pull/797)) +- Move webhook templates from library chart back into `cortex-nova` bundle (reverts earlier move) ([#805](https://github.com/cobaltcore-dev/cortex/pull/805)) +- Fix: add `identity-domains` as a KPI dependency +- Fix: remove `ignoreAllocations` from kvm-report-capacity pipeline to unblock deployment against older admission webhook ([#812](https://github.com/cobaltcore-dev/cortex/pull/812)) +- Fix: suppress nova scheduling alerts on transient `no such host` DNS errors +- Replace `testlib.Ptr` helper with native `new()` across test files ([#801](https://github.com/cobaltcore-dev/cortex/pull/801)) + +### cortex-nova v0.0.60 (sha-b8cecd0c) + +Includes updated chart cortex v0.0.47. + +Non-breaking changes: +- Add Prometheus datasource for KVM project usage metrics +- Add KVM project usage KPI CRD templates +- Add KVM project utilization KPI CRD templates +- Update `cortex-nova` RBAC to grant permissions for `FlavorGroupCapacity` and `ProjectQuota` CRDs + ## 2026-05-04 — [#793](https://github.com/cobaltcore-dev/cortex/pull/793) ### cortex v0.0.46 (sha-ab6eb45d) diff --git a/Tiltfile b/Tiltfile index ef1ee3b02..ebe7430bb 100644 --- a/Tiltfile +++ b/Tiltfile @@ -196,7 +196,7 @@ k8s_yaml(helm('./helm/bundles/cortex-crds', name='cortex-crds', set=crd_extra_va if 'nova' in ACTIVE_DEPLOYMENTS: print("Activating Cortex Nova bundle") k8s_yaml(helm('./helm/bundles/cortex-nova', name='cortex-nova', values=tilt_values, set=env_set_overrides)) - k8s_resource('cortex-nova-postgresql', labels=['Cortex-Nova'], port_forwards=[ + k8s_resource('cortex-nova-postgresql-v18', labels=['Cortex-Nova'], port_forwards=[ port_forward(8000, 5432), ]) k8s_resource('cortex-nova-scheduling-controller-manager', labels=['Cortex-Nova'], port_forwards=[ @@ -221,7 +221,7 @@ if 'nova' in ACTIVE_DEPLOYMENTS: if 'manila' in ACTIVE_DEPLOYMENTS: print("Activating Cortex Manila bundle") k8s_yaml(helm('./helm/bundles/cortex-manila', name='cortex-manila', values=tilt_values, set=env_set_overrides)) - k8s_resource('cortex-manila-postgresql', labels=['Cortex-Manila'], port_forwards=[ + k8s_resource('cortex-manila-postgresql-v18', labels=['Cortex-Manila'], port_forwards=[ port_forward(8002, 5432), ]) k8s_resource('cortex-manila-scheduling-controller-manager', labels=['Cortex-Manila'], port_forwards=[ @@ -238,7 +238,7 @@ if 'manila' in ACTIVE_DEPLOYMENTS: if 'cinder' in ACTIVE_DEPLOYMENTS: k8s_yaml(helm('./helm/bundles/cortex-cinder', name='cortex-cinder', values=tilt_values, set=env_set_overrides)) - k8s_resource('cortex-cinder-postgresql', 
labels=['Cortex-Cinder'], port_forwards=[ + k8s_resource('cortex-cinder-postgresql-v18', labels=['Cortex-Cinder'], port_forwards=[ port_forward(8004, 5432), ]) k8s_resource('cortex-cinder-scheduling-controller-manager', labels=['Cortex-Cinder'], port_forwards=[ diff --git a/api/v1alpha1/committed_resource_types.go b/api/v1alpha1/committed_resource_types.go index 31365887f..d49a9be90 100644 --- a/api/v1alpha1/committed_resource_types.go +++ b/api/v1alpha1/committed_resource_types.go @@ -106,38 +106,42 @@ type CommittedResourceSpec struct { // CommittedResourceStatus defines the observed state of CommittedResource. type CommittedResourceStatus struct { - // AcceptedAmount is the quantity the controller last successfully provisioned as Reservation slots. - // Nil if the spec has never been successfully reconciled. + // AcceptedSpec is a snapshot of Spec from the last successful reconcile. + // Used by rollbackToAccepted to restore the exact previously-accepted placement (AZ, amount, + // project, domain, flavor group) even when the current spec has already been mutated to a new value. // +kubebuilder:validation:Optional - AcceptedAmount *resource.Quantity `json:"acceptedAmount,omitempty"` + AcceptedSpec *CommittedResourceSpec `json:"acceptedSpec,omitempty"` // AcceptedAt is when the controller last successfully reconciled the spec into Reservation slots. // +kubebuilder:validation:Optional AcceptedAt *metav1.Time `json:"acceptedAt,omitempty"` - // LastChanged is when the spec was last written by the syncer. - // When AcceptedAt is older than LastChanged, the controller has pending work. - // +kubebuilder:validation:Optional - LastChanged *metav1.Time `json:"lastChanged,omitempty"` - // LastReconcileAt is when the controller last ran its reconcile loop for this resource. // +kubebuilder:validation:Optional LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"` - // AssignedVMs holds the UUIDs of VMs deterministically assigned to this committed resource. - // Populated by the usage reconciler; used to compute UsedAmount and drive the quota controller. + // AssignedInstances holds the UUIDs of VM instances deterministically assigned to this committed resource. + // Populated by the usage reconciler; used to compute UsedResources and drive the quota controller. // +kubebuilder:validation:Optional - AssignedVMs []string `json:"assignedVMs,omitempty"` + AssignedInstances []string `json:"assignedInstances,omitempty"` - // UsedAmount is the sum of assigned VM resources expressed in the same units as Spec.Amount. - // Populated by the usage reconciler. + // UsedResources is the total resource consumption of assigned VM instances, keyed by resource type + // (e.g. "memory" in MiB binary SI, "cpu" as core count). Populated by the usage reconciler. // +kubebuilder:validation:Optional - UsedAmount *resource.Quantity `json:"usedAmount,omitempty"` + UsedResources map[string]resource.Quantity `json:"usedResources,omitempty"` - // LastUsageReconcileAt is when the usage reconciler last updated AssignedVMs and UsedAmount. + // LastUsageReconcileAt is when the usage reconciler last updated AssignedInstances and UsedResources. // +kubebuilder:validation:Optional LastUsageReconcileAt *metav1.Time `json:"lastUsageReconcileAt,omitempty"` + // UsageObservedGeneration is the CR generation that the usage reconciler last processed. + // Follows the Kubernetes observedGeneration pattern: when this differs from + // metadata.generation the cooldown is bypassed so spec changes (e.g. 
shrink) are reflected + // immediately rather than waiting for the next cooldown interval. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=0 + UsageObservedGeneration *int64 `json:"usageObservedGeneration,omitempty"` + // Conditions holds the current status conditions. // +kubebuilder:validation:Optional Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` @@ -163,8 +167,8 @@ const ( // +kubebuilder:printcolumn:name="ResourceType",type="string",JSONPath=".spec.resourceType" // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" // +kubebuilder:printcolumn:name="Amount",type="string",JSONPath=".spec.amount" -// +kubebuilder:printcolumn:name="AcceptedAmount",type="string",JSONPath=".status.acceptedAmount" -// +kubebuilder:printcolumn:name="UsedAmount",type="string",JSONPath=".status.usedAmount" +// +kubebuilder:printcolumn:name="UsedMemory",type="string",JSONPath=".status.usedResources.memory",priority=1 +// +kubebuilder:printcolumn:name="UsedCPU",type="string",JSONPath=".status.usedResources.cpu",priority=1 // +kubebuilder:printcolumn:name="State",type="string",JSONPath=".spec.state" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" // +kubebuilder:printcolumn:name="StartTime",type="date",JSONPath=".spec.startTime",priority=1 diff --git a/api/v1alpha1/datasource_types.go b/api/v1alpha1/datasource_types.go index fff321c48..f9963a35c 100644 --- a/api/v1alpha1/datasource_types.go +++ b/api/v1alpha1/datasource_types.go @@ -52,6 +52,7 @@ const ( NovaDatasourceTypeFlavors NovaDatasourceType = "flavors" NovaDatasourceTypeMigrations NovaDatasourceType = "migrations" NovaDatasourceTypeAggregates NovaDatasourceType = "aggregates" + NovaDatasourceTypeImages NovaDatasourceType = "images" ) type NovaDatasource struct { diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go new file mode 100644 index 000000000..a7339dce2 --- /dev/null +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -0,0 +1,112 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + // FlavorGroupCapacityConditionReady indicates the status data is up-to-date. + FlavorGroupCapacityConditionReady = "Ready" +) + +// FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. +type FlavorGroupCapacitySpec struct { + // FlavorGroup is the name of the flavor group (e.g. "hana-v2"). + // +kubebuilder:validation:Required + FlavorGroup string `json:"flavorGroup"` + + // AvailabilityZone is the OpenStack AZ this capacity data covers (e.g. "qa-de-1a"). + // +kubebuilder:validation:Required + AvailabilityZone string `json:"availabilityZone"` +} + +// FlavorCapacityStatus holds per-flavor capacity numbers for one (flavor group × AZ) pair. +type FlavorCapacityStatus struct { + // FlavorName is the OpenStack flavor name (e.g. "hana-v2-small"). + FlavorName string `json:"flavorName"` + + // PlaceableHosts is the number of hosts that can still fit this flavor given current allocations. + // +kubebuilder:validation:Optional + PlaceableHosts int64 `json:"placeableHosts,omitempty"` + + // PlaceableVMs is the number of VM slots remaining for this flavor given current allocations. 
+	// +kubebuilder:validation:Optional
+	PlaceableVMs int64 `json:"placeableVms,omitempty"`
+
+	// TotalCapacityHosts is the number of eligible hosts in an empty-datacenter scenario.
+	// +kubebuilder:validation:Optional
+	TotalCapacityHosts int64 `json:"totalCapacityHosts,omitempty"`
+
+	// TotalCapacityVMSlots is the maximum number of VM slots in an empty-datacenter scenario.
+	// +kubebuilder:validation:Optional
+	TotalCapacityVMSlots int64 `json:"totalCapacityVmSlots,omitempty"`
+}
+
+// FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity.
+type FlavorGroupCapacityStatus struct {
+	// Flavors holds per-flavor capacity data for all flavors in the group.
+	// +kubebuilder:validation:Optional
+	Flavors []FlavorCapacityStatus `json:"flavors,omitempty"`
+
+	// CommittedCapacity is the sum of the accepted amounts (Status.AcceptedSpec.Amount) across
+	// active CommittedResource CRDs, expressed in multiples of the smallest flavor's memory.
+	// +kubebuilder:validation:Optional
+	CommittedCapacity int64 `json:"committedCapacity,omitempty"`
+
+	// TotalInstances is the total number of VM instances running on hypervisors in this AZ,
+	// derived from Hypervisor CRD Status.Instances (not filtered by flavor group).
+	// +kubebuilder:validation:Optional
+	TotalInstances int64 `json:"totalInstances,omitempty"`
+
+	// LastReconcileAt is the timestamp of the last successful reconcile.
+	// +kubebuilder:validation:Optional
+	LastReconcileAt metav1.Time `json:"lastReconcileAt,omitempty"`
+
+	// The current status conditions of the FlavorGroupCapacity.
+	// +kubebuilder:validation:Optional
+	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
+}
+
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +kubebuilder:resource:scope=Cluster
+// +kubebuilder:printcolumn:name="FlavorGroup",type="string",JSONPath=".spec.flavorGroup"
+// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
+// +kubebuilder:printcolumn:name="TotalInstances",type="integer",JSONPath=".status.totalInstances"
+// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt"
+// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
+
+// FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ.
+// One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval.
+// The capacity API reads these CRDs instead of probing the scheduler on each request.
+type FlavorGroupCapacity struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// metadata is a standard object metadata
+	// +optional
+	metav1.ObjectMeta `json:"metadata,omitempty,omitzero"`
+
+	// spec defines the desired state of FlavorGroupCapacity
+	// +required
+	Spec FlavorGroupCapacitySpec `json:"spec"`
+
+	// status defines the observed state of FlavorGroupCapacity
+	// +optional
+	Status FlavorGroupCapacityStatus `json:"status,omitempty,omitzero"`
+}
+
+// +kubebuilder:object:root=true
+
+// FlavorGroupCapacityList contains a list of FlavorGroupCapacity.
+type FlavorGroupCapacityList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []FlavorGroupCapacity `json:"items"` +} + +func init() { + SchemeBuilder.Register(&FlavorGroupCapacity{}, &FlavorGroupCapacityList{}) +} diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go new file mode 100644 index 000000000..fecac57cc --- /dev/null +++ b/api/v1alpha1/project_quota_types.go @@ -0,0 +1,141 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ResourceQuota holds the quota for a single resource with per-AZ breakdown. +// Maps to liquid.ResourceQuotaRequest from the LIQUID API. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest +type ResourceQuota struct { + // Quota is the total quota across all AZs (for compatibility). + // Corresponds to liquid.ResourceQuotaRequest.Quota. + // +kubebuilder:validation:Required + Quota int64 `json:"quota"` + + // PerAZ holds the per-availability-zone quota breakdown. + // Key: availability zone name, Value: quota for that AZ. + // Only populated for AZSeparatedTopology resources. + // Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest + // +kubebuilder:validation:Optional + PerAZ map[string]int64 `json:"perAZ,omitempty"` +} + +// ResourceQuotaUsage holds per-AZ PAYG usage for a single resource. +type ResourceQuotaUsage struct { + // PerAZ holds per-availability-zone PAYG usage values. + // Key: availability zone name, Value: PAYG usage in that AZ. + // +kubebuilder:validation:Optional + PerAZ map[string]int64 `json:"perAZ,omitempty"` +} + +// ProjectQuotaSpec defines the desired state of ProjectQuota. +// Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest +type ProjectQuotaSpec struct { + // ProjectID of the OpenStack project this quota belongs to. + // Corresponds to the :uuid in the PUT URL path. + // +kubebuilder:validation:Required + ProjectID string `json:"projectID"` + + // ProjectName is the human-readable name of the OpenStack project. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Name. + // +kubebuilder:validation:Optional + ProjectName string `json:"projectName,omitempty"` + + // DomainID of the OpenStack domain this project belongs to. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.UUID. + // +kubebuilder:validation:Required + DomainID string `json:"domainID"` + + // DomainName is the human-readable name of the OpenStack domain. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.Name. + // +kubebuilder:validation:Optional + DomainName string `json:"domainName,omitempty"` + + // Quota maps LIQUID resource names to their per-AZ quota. + // Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") + // Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + // +kubebuilder:validation:Optional + Quota map[string]ResourceQuota `json:"quota,omitempty"` +} + +// ProjectQuotaStatus defines the observed state of ProjectQuota. +// Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. 
+// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport +type ProjectQuotaStatus struct { + // ObservedGeneration is the most recent spec generation that the controller has processed. + // Used to distinguish spec changes (which require TotalUsage recompute) from + // CommittedResource changes (which only need PaygUsage recompute). + // +kubebuilder:validation:Optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + // Persisted by the quota controller; updated by full reconcile and HV instance diffs. + // Key: liquid.ResourceName + // +kubebuilder:validation:Optional + TotalUsage map[string]ResourceQuotaUsage `json:"totalUsage,omitempty"` + + // PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + // Derived as TotalUsage - CRUsage (clamped >= 0). + // Key: liquid.ResourceName + // +kubebuilder:validation:Optional + PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` + + // LastReconcileAt is when the controller last reconciled this project's quota (any path). + // +kubebuilder:validation:Optional + LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"` + + // LastFullReconcileAt is when the periodic full reconcile last completed for this project. + // Used as the watermark for isVMNewSinceLastReconcile (incremental add detection). + // Only updated by ReconcilePeriodic, NOT by PaygUsage recomputes or incremental deltas. + // +kubebuilder:validation:Optional + LastFullReconcileAt *metav1.Time `json:"lastFullReconcileAt,omitempty"` + + // Conditions holds the current status conditions. + // +kubebuilder:validation:Optional + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.projectID" +// +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.domainID" +// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" +// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" + +// ProjectQuota is the Schema for the projectquotas API. +// It persists quota values pushed by Limes via the LIQUID quota endpoint +// (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). 
+// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest +type ProjectQuota struct { + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + + // +required + Spec ProjectQuotaSpec `json:"spec"` + + // +optional + Status ProjectQuotaStatus `json:"status,omitempty,omitzero"` +} + +// +kubebuilder:object:root=true + +// ProjectQuotaList contains a list of ProjectQuota +type ProjectQuotaList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ProjectQuota `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ProjectQuota{}, &ProjectQuotaList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d9daa7aab..06f075e1e 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -187,37 +187,40 @@ func (in *CommittedResourceSpec) DeepCopy() *CommittedResourceSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CommittedResourceStatus) DeepCopyInto(out *CommittedResourceStatus) { *out = *in - if in.AcceptedAmount != nil { - in, out := &in.AcceptedAmount, &out.AcceptedAmount - x := (*in).DeepCopy() - *out = &x + if in.AcceptedSpec != nil { + in, out := &in.AcceptedSpec, &out.AcceptedSpec + *out = new(CommittedResourceSpec) + (*in).DeepCopyInto(*out) } if in.AcceptedAt != nil { in, out := &in.AcceptedAt, &out.AcceptedAt *out = (*in).DeepCopy() } - if in.LastChanged != nil { - in, out := &in.LastChanged, &out.LastChanged - *out = (*in).DeepCopy() - } if in.LastReconcileAt != nil { in, out := &in.LastReconcileAt, &out.LastReconcileAt *out = (*in).DeepCopy() } - if in.AssignedVMs != nil { - in, out := &in.AssignedVMs, &out.AssignedVMs + if in.AssignedInstances != nil { + in, out := &in.AssignedInstances, &out.AssignedInstances *out = make([]string, len(*in)) copy(*out, *in) } - if in.UsedAmount != nil { - in, out := &in.UsedAmount, &out.UsedAmount - x := (*in).DeepCopy() - *out = &x + if in.UsedResources != nil { + in, out := &in.UsedResources, &out.UsedResources + *out = make(map[string]resource.Quantity, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } } if in.LastUsageReconcileAt != nil { in, out := &in.LastUsageReconcileAt, &out.LastUsageReconcileAt *out = (*in).DeepCopy() } + if in.UsageObservedGeneration != nil { + in, out := &in.UsageObservedGeneration, &out.UsageObservedGeneration + *out = new(int64) + **out = **in + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]v1.Condition, len(*in)) @@ -749,6 +752,123 @@ func (in *FilterSpec) DeepCopy() *FilterSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorCapacityStatus) DeepCopyInto(out *FlavorCapacityStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorCapacityStatus. +func (in *FlavorCapacityStatus) DeepCopy() *FlavorCapacityStatus { + if in == nil { + return nil + } + out := new(FlavorCapacityStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *FlavorGroupCapacity) DeepCopyInto(out *FlavorGroupCapacity) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacity. +func (in *FlavorGroupCapacity) DeepCopy() *FlavorGroupCapacity { + if in == nil { + return nil + } + out := new(FlavorGroupCapacity) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FlavorGroupCapacity) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacityList) DeepCopyInto(out *FlavorGroupCapacityList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]FlavorGroupCapacity, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacityList. +func (in *FlavorGroupCapacityList) DeepCopy() *FlavorGroupCapacityList { + if in == nil { + return nil + } + out := new(FlavorGroupCapacityList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FlavorGroupCapacityList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacitySpec) DeepCopyInto(out *FlavorGroupCapacitySpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacitySpec. +func (in *FlavorGroupCapacitySpec) DeepCopy() *FlavorGroupCapacitySpec { + if in == nil { + return nil + } + out := new(FlavorGroupCapacitySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FlavorGroupCapacityStatus) DeepCopyInto(out *FlavorGroupCapacityStatus) { + *out = *in + if in.Flavors != nil { + in, out := &in.Flavors, &out.Flavors + *out = make([]FlavorCapacityStatus, len(*in)) + copy(*out, *in) + } + in.LastReconcileAt.DeepCopyInto(&out.LastReconcileAt) + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlavorGroupCapacityStatus. +func (in *FlavorGroupCapacityStatus) DeepCopy() *FlavorGroupCapacityStatus { + if in == nil { + return nil + } + out := new(FlavorGroupCapacityStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *History) DeepCopyInto(out *History) { *out = *in @@ -1420,6 +1540,131 @@ func (in *PlacementDatasource) DeepCopy() *PlacementDatasource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. +func (in *ProjectQuota) DeepCopyInto(out *ProjectQuota) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuota. +func (in *ProjectQuota) DeepCopy() *ProjectQuota { + if in == nil { + return nil + } + out := new(ProjectQuota) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ProjectQuota) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaList) DeepCopyInto(out *ProjectQuotaList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ProjectQuota, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaList. +func (in *ProjectQuotaList) DeepCopy() *ProjectQuotaList { + if in == nil { + return nil + } + out := new(ProjectQuotaList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ProjectQuotaList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaSpec) DeepCopyInto(out *ProjectQuotaSpec) { + *out = *in + if in.Quota != nil { + in, out := &in.Quota, &out.Quota + *out = make(map[string]ResourceQuota, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaSpec. +func (in *ProjectQuotaSpec) DeepCopy() *ProjectQuotaSpec { + if in == nil { + return nil + } + out := new(ProjectQuotaSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { + *out = *in + if in.TotalUsage != nil { + in, out := &in.TotalUsage, &out.TotalUsage + *out = make(map[string]ResourceQuotaUsage, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } + if in.PaygUsage != nil { + in, out := &in.PaygUsage, &out.PaygUsage + *out = make(map[string]ResourceQuotaUsage, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } + if in.LastReconcileAt != nil { + in, out := &in.LastReconcileAt, &out.LastReconcileAt + *out = (*in).DeepCopy() + } + if in.LastFullReconcileAt != nil { + in, out := &in.LastFullReconcileAt, &out.LastFullReconcileAt + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaStatus. 
+func (in *ProjectQuotaStatus) DeepCopy() *ProjectQuotaStatus { + if in == nil { + return nil + } + out := new(ProjectQuotaStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PrometheusDatasource) DeepCopyInto(out *PrometheusDatasource) { *out = *in @@ -1570,6 +1815,50 @@ func (in *ReservationStatus) DeepCopy() *ReservationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceQuota) DeepCopyInto(out *ResourceQuota) { + *out = *in + if in.PerAZ != nil { + in, out := &in.PerAZ, &out.PerAZ + *out = make(map[string]int64, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuota. +func (in *ResourceQuota) DeepCopy() *ResourceQuota { + if in == nil { + return nil + } + out := new(ResourceQuota) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceQuotaUsage) DeepCopyInto(out *ResourceQuotaUsage) { + *out = *in + if in.PerAZ != nil { + in, out := &in.PerAZ, &out.PerAZ + *out = make(map[string]int64, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuotaUsage. +func (in *ResourceQuotaUsage) DeepCopy() *ResourceQuotaUsage { + if in == nil { + return nil + } + out := new(ResourceQuotaUsage) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingHistoryEntry) DeepCopyInto(out *SchedulingHistoryEntry) { *out = *in diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 4a09323a4..6c1096512 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -6,6 +6,7 @@ package main import ( "context" "crypto/tls" + "encoding/json" "flag" "fmt" "log/slog" @@ -56,9 +57,11 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/pods" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/capacity" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" commitmentsapi "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments/api" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/quota" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/monitoring" "github.com/cobaltcore-dev/cortex/pkg/multicluster" @@ -101,7 +104,10 @@ func main() { restConfig := ctrl.GetConfigOrDie() // Custom entrypoint for scheduler e2e tests. 
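The hunk below relaxes the argv check so e2e subcommands can take an optional JSON override that is merged on top of the ConfigMap config. A minimal sketch of the merge semantics: `json.Unmarshal` into an already-populated struct only overwrites fields present in the payload. The `ChecksConfig` stand-in below is illustrative, not the real `commitments.E2EChecksConfig`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// ChecksConfig is a hypothetical stand-in for commitments.E2EChecksConfig.
type ChecksConfig struct {
	NoCleanup bool     `json:"noCleanup"`
	AZs       []string `json:"azs"`
	Timeout   string   `json:"timeout"`
}

func main() {
	// Base config as it would come from the ConfigMap.
	cfg := ChecksConfig{AZs: []string{"qa-de-1a", "qa-de-1b"}, Timeout: "10m"}

	// json.Unmarshal into an existing struct only touches fields present
	// in the payload, so the override merges on top of the base values.
	override := `{"noCleanup":true,"azs":["qa-de-1a"]}`
	if err := json.Unmarshal([]byte(override), &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg) // {NoCleanup:true AZs:[qa-de-1a] Timeout:10m}
}
```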
-	if len(os.Args) == 2 {
+	// Usage: /main <subcommand> [json-override]
+	// The optional json-override is merged on top of the ConfigMap config, e.g.:
+	//   /main e2e-commitments '{"noCleanup":true,"azs":["qa-de-1a"]}'
+	if len(os.Args) >= 2 {
 		copts := client.Options{Scheme: scheme}
 		client := must.Return(client.New(restConfig, copts))
 		switch os.Args[1] {
@@ -118,7 +124,21 @@ func main() {
 			return
 		case "e2e-commitments":
 			commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]()
-			commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
+			if len(os.Args) >= 3 {
+				if err := json.Unmarshal([]byte(os.Args[2]), &commitmentsChecksConfig); err != nil {
+					slog.Error("invalid json override for e2e-commitments", "err", err)
+					os.Exit(1)
+				}
+			}
+			func() {
+				defer func() {
+					if r := recover(); r != nil {
+						slog.Error("e2e check failed", "reason", r)
+						os.Exit(1)
+					}
+				}()
+				commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
+			}()
 			return
 		}
 	}
@@ -548,14 +568,34 @@ func main() {
 			os.Exit(1)
 		}
+		crControllerConf := commitmentsConfig.CommittedResourceController
+		crControllerConf.ApplyDefaults()
 		if err := (&commitments.CommittedResourceController{
 			Client: multiclusterClient,
 			Scheme: mgr.GetScheme(),
-			Conf:   commitmentsConfig.CommittedResourceController,
+			Conf:   crControllerConf,
 		}).SetupWithManager(mgr, multiclusterClient); err != nil {
 			setupLog.Error(err, "unable to create controller", "controller", "CommittedResource")
 			os.Exit(1)
 		}
+
+		usageReconcilerMonitor := commitments.NewUsageReconcilerMonitor()
+		metrics.Registry.MustRegister(&usageReconcilerMonitor)
+		if commitmentsUsageDB == nil {
+			setupLog.Error(nil, "UsageReconciler requires a datasource but commitments.datasourceName is not configured — skipping")
+		} else {
+			usageReconcilerConf := commitmentsConfig.UsageReconciler
+			usageReconcilerConf.ApplyDefaults()
+			if err := (&commitments.UsageReconciler{
+				Client:  multiclusterClient,
+				Conf:    usageReconcilerConf,
+				UsageDB: commitmentsUsageDB,
+				Monitor: usageReconcilerMonitor,
+			}).SetupWithManager(mgr, multiclusterClient); err != nil {
+				setupLog.Error(err, "unable to create controller", "controller", "CommittedResourceUsage")
+				os.Exit(1)
+			}
+		}
 	}
 	if slices.Contains(mainConfig.EnabledControllers, "datasource-controllers") {
 		setupLog.Info("enabling controller", "controller", "datasource-controllers")
@@ -686,6 +726,92 @@ func main() {
 			"maxVMsToProcess", failoverConfig.MaxVMsToProcess,
 			"vmSelectionRotationInterval", failoverConfig.VMSelectionRotationInterval)
 	}
+	if slices.Contains(mainConfig.EnabledControllers, "capacity-controller") {
+		setupLog.Info("enabling controller", "controller", "capacity-controller")
+		capacityConfig := conf.GetConfigOrDie[capacity.Config]()
+		capacityConfig.ApplyDefaults()
+
+		capacityMonitor := capacity.NewMonitor(multiclusterClient)
+		if err := metrics.Registry.Register(&capacityMonitor); err != nil {
+			setupLog.Error(err, "failed to register capacity monitor metrics, continuing without metrics")
+		}
+
+		capacityController := capacity.NewController(multiclusterClient, capacityConfig)
+		if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
+			return capacityController.Start(ctx)
+		})); err != nil {
+			setupLog.Error(err, "unable to add capacity controller to manager")
+			os.Exit(1)
+		}
+		setupLog.Info("capacity-controller registered",
+			"schedulerURL", capacityConfig.SchedulerURL,
+			"reconcileInterval", capacityConfig.ReconcileInterval,
+			"totalPipeline", capacityConfig.TotalPipeline,
+			"placeablePipeline", 
capacityConfig.PlaceablePipeline) + } + + if slices.Contains(mainConfig.EnabledControllers, "quota-controller") { + setupLog.Info("enabling controller", "controller", "quota-controller") + quotaConfig := conf.GetConfigOrDie[quota.QuotaControllerConfig]() + quotaConfig.ApplyDefaults() + + // Get datasource name from the failover/commitments config (shared dependency) + failoverCfg := conf.GetConfigOrDie[failover.FailoverConfig]() + failoverCfg.ApplyDefaults() + datasourceName := failoverCfg.DatasourceName + if datasourceName == "" { + setupLog.Error(nil, "quota-controller requires datasourceName to be configured") + os.Exit(1) + } + + quotaMetrics := quota.NewQuotaMetrics(metrics.Registry) + + // Defer initialization until the manager starts (cache must be ready for postgres reader) + if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + // Create PostgresReader from the configured Datasource CRD + postgresReader, err := external.NewPostgresReader(ctx, multiclusterClient, datasourceName) + if err != nil { + setupLog.Error(err, "unable to create postgres reader for quota controller", + "datasourceName", datasourceName) + return err + } + + // Create NovaReader and DBVMSource + novaReader := external.NewNovaReader(postgresReader) + vmSource := failover.NewDBVMSource(novaReader) + + // Create the quota controller + quotaController := quota.NewQuotaController( + multiclusterClient, + vmSource, + quotaConfig, + quotaMetrics, + ) + + // Set up the watch-based reconciler (ProjectQuota spec changes, CR changes) + if err := quotaController.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to set up quota controller") + return err + } + + // Set up the HV watcher for incremental TotalUsage updates + if err := quotaController.SetupHVWatcher(mgr); err != nil { + setupLog.Error(err, "unable to set up quota HV watcher") + return err + } + + setupLog.Info("quota-controller starting", + "fullReconcileInterval", quotaConfig.FullReconcileInterval.Duration, + "crStateFilter", quotaConfig.CRStateFilter) + + // Start the periodic full reconciliation loop + return quotaController.Start(ctx) + })); err != nil { + setupLog.Error(err, "unable to add quota controller to manager") + os.Exit(1) + } + setupLog.Info("quota-controller registered") + } // +kubebuilder:scaffold:builder @@ -722,7 +848,7 @@ func main() { syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]() if err := (&task.Runner{ Client: multiclusterClient, - Interval: syncerConfig.SyncInterval, + Interval: syncerConfig.SyncInterval.Duration, Name: "commitments-sync-task", Run: func(ctx context.Context) error { return syncer.SyncReservations(ctx) }, Init: func(ctx context.Context) error { return syncer.Init(ctx, syncerConfig) }, diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 9118e6f60..0ae840f15 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. 
type: application -version: 0.0.59 +version: 0.0.60 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 0fe152845..633fe00e7 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.59 +version: 0.0.60 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 079aed03e..571e1a243 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.59 +version: 0.0.60 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/library/cortex/templates/rbac/compute.ironcore.dev_role.yaml b/helm/bundles/cortex-ironcore/templates/rbac.yaml similarity index 55% rename from helm/library/cortex/templates/rbac/compute.ironcore.dev_role.yaml rename to helm/bundles/cortex-ironcore/templates/rbac.yaml index d9f4402ef..0ff654d34 100644 --- a/helm/library/cortex/templates/rbac/compute.ironcore.dev_role.yaml +++ b/helm/bundles/cortex-ironcore/templates/rbac.yaml @@ -1,11 +1,7 @@ -{{- if .Values.rbac.ironcore.enable }} ---- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - {{- include "chart.labels" . 
| nindent 4 }} - name: {{ .Values.namePrefix }}-manager-role-ironcore + name: {{ .Values.cortex.namePrefix }}-manager-role-ironcore rules: - apiGroups: - compute.ironcore.dev @@ -39,4 +35,16 @@ rules: - get - patch - update -{{- end -}} \ No newline at end of file +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ .Values.cortex.namePrefix }}-manager-rolebinding-ironcore +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.cortex.namePrefix }}-manager-role-ironcore +subjects: +- kind: ServiceAccount + name: {{ .Values.cortex.namePrefix }}-controller-manager + namespace: {{ .Release.Namespace }} diff --git a/helm/bundles/cortex-ironcore/values.iiab.yaml b/helm/bundles/cortex-ironcore/values.iiab.yaml index d013f9f2e..e2082e841 100644 --- a/helm/bundles/cortex-ironcore/values.iiab.yaml +++ b/helm/bundles/cortex-ironcore/values.iiab.yaml @@ -8,9 +8,6 @@ cortex: # Grant cortex access to ironcore resources. crd: enable: true - rbac: - ironcore: - enable: true # Use our locally built image. controllerManager: container: diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 25e81f47e..93ab402d2 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.59 +version: 0.0.60 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index 1d838afa0..feff1ae83 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. type: application -version: 0.0.59 +version: 0.0.60 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index b1a8570f4..47b337968 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -288,7 +288,7 @@ groups: pipeline status and logs for more details. 
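The alert change below adds a second `faultmsg` matcher so that scheduling faults caused by transient DNS failures (messages containing "No such host") no longer count toward `CortexNovaDoesntFindValidKVMHosts`. A small Go sketch of the combined matcher logic; the sample fault strings are invented for illustration:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Mirrors the two PromQL label matchers on faultmsg:
	//   faultmsg=~".*No valid host was found.*"  (must match)
	//   faultmsg!~".*No such host.*"             (must not match)
	mustMatch := regexp.MustCompile(`.*No valid host was found.*`)
	mustNotMatch := regexp.MustCompile(`.*No such host.*`)

	faults := []string{
		"No valid host was found. There are not enough hosts available.",
		"No valid host was found: lookup nova-api: No such host", // transient DNS
	}
	for _, msg := range faults {
		// Only faults matching the first pattern but not the DNS pattern fire.
		fires := mustMatch.MatchString(msg) && !mustNotMatch.MatchString(msg)
		fmt.Printf("fires=%v  %q\n", fires, msg)
	}
}
```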
- alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}[5m])) > 0 + expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 for: 5m labels: context: scheduling diff --git a/helm/bundles/cortex-nova/templates/datasources.yaml b/helm/bundles/cortex-nova/templates/datasources.yaml index f9160602f..582effac2 100644 --- a/helm/bundles/cortex-nova/templates/datasources.yaml +++ b/helm/bundles/cortex-nova/templates/datasources.yaml @@ -337,6 +337,30 @@ spec: --- apiVersion: cortex.cloud/v1alpha1 kind: Datasource +metadata: + name: nova-images +spec: + schedulingDomain: nova + databaseSecretRef: + name: cortex-nova-postgres + namespace: {{ .Release.Namespace }} + {{- if .Values.openstack.sso.enabled }} + ssoSecretRef: + name: cortex-nova-openstack-sso + namespace: {{ .Release.Namespace }} + {{- end }} + type: openstack + openstack: + syncInterval: 3600s + secretRef: + name: cortex-nova-openstack-keystone + namespace: {{ .Release.Namespace }} + type: nova + nova: + type: images +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Datasource metadata: name: limes-project-commitments spec: diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index 6979b0e29..5717cd62e 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -29,33 +29,6 @@ spec: --- apiVersion: cortex.cloud/v1alpha1 kind: KPI -metadata: - name: host-running-vms -spec: - schedulingDomain: nova - impl: host_running_vms_kpi - dependencies: - knowledges: - - name: host-details - - name: host-utilization - description: | - This KPI tracks the total number of running VMs on hosts. ---- -apiVersion: cortex.cloud/v1alpha1 -kind: KPI -metadata: - name: flavors-running-vms -spec: - schedulingDomain: nova - impl: flavor_running_vms_kpi - dependencies: - datasources: - - name: nova-servers - description: | - This KPI tracks the total number of running VMs per flavor and availability zone. ---- -apiVersion: cortex.cloud/v1alpha1 -kind: KPI metadata: name: vm-migration-statistics spec: @@ -183,6 +156,7 @@ spec: - name: nova-servers - name: nova-flavors - name: identity-projects + - name: identity-domains knowledges: - name: host-details description: | @@ -191,15 +165,17 @@ spec: apiVersion: cortex.cloud/v1alpha1 kind: KPI metadata: - name: vmware-resource-commitments + name: vmware-project-commitments spec: schedulingDomain: nova - impl: vmware_resource_commitments_kpi + impl: vmware_project_commitments_kpi dependencies: datasources: - name: nova-servers - name: nova-flavors - name: limes-project-commitments + - name: identity-domains + - name: identity-projects description: | This KPI tracks the resource commitments of projects running VMs on VMware hosts. --- diff --git a/helm/bundles/cortex-nova/templates/kpis_kvm.yaml b/helm/bundles/cortex-nova/templates/kpis_kvm.yaml index e98c0a447..48b9eb155 100644 --- a/helm/bundles/cortex-nova/templates/kpis_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/kpis_kvm.yaml @@ -13,4 +13,17 @@ spec: - name: host-utilization description: | This KPI tracks the total, utilized, reserved and failover capacity of KVM hosts. 
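The kvm-report-capacity pipeline added further below (pipelines_kvm.yaml) is a filter-weigher pipeline with an empty weigher list: each filter narrows the host candidate set in order. A rough sketch of that composition; the Host shape and filter criteria here are illustrative, not the actual cortex pipeline types:

```go
package main

import "fmt"

// Host is an illustrative candidate shape, not the real cortex type.
type Host struct {
	Name    string
	AZ      string
	FreeRAM int64 // MiB
}

// Filter narrows a candidate set; order matters.
type Filter func(hosts []Host) []Host

// runFilters mimics a filter-weigher pipeline with no weighers:
// filters apply sequentially, and an empty result short-circuits.
func runFilters(hosts []Host, filters []Filter) []Host {
	for _, f := range filters {
		hosts = f(hosts)
		if len(hosts) == 0 {
			break
		}
	}
	return hosts
}

func main() {
	hosts := []Host{
		{"node001", "qa-de-1a", 512000},
		{"node002", "qa-de-1b", 1024000},
	}
	correctAZ := func(hs []Host) []Host { // cf. filter_correct_az
		var out []Host
		for _, h := range hs {
			if h.AZ == "qa-de-1a" {
				out = append(out, h)
			}
		}
		return out
	}
	enoughCapacity := func(hs []Host) []Host { // cf. filter_has_enough_capacity
		var out []Host
		for _, h := range hs {
			if h.FreeRAM >= 128000 { // flavor memory request
				out = append(out, h)
			}
		}
		return out
	}
	fmt.Println(runFilters(hosts, []Filter{correctAZ, enoughCapacity}))
}
```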
+--- +apiVersion: cortex.cloud/v1alpha1 +kind: KPI +metadata: + name: kvm-project-utilization +spec: + schedulingDomain: nova + impl: kvm_project_utilization_kpi + dependencies: + datasources: + - name: nova-servers + - name: nova-flavors + - name: identity-projects {{- end }} \ No newline at end of file diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 561d9fc3c..143c0488a 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -557,4 +557,43 @@ spec: VM is allocated get a higher weight, encouraging placement on pre-reserved failover capacity. For non-evacuation requests, this weigher has no effect. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Pipeline +metadata: + name: kvm-report-capacity +spec: + schedulingDomain: nova + description: | + This pipeline is used by the capacity controller to determine the + theoretical maximum capacity of each flavor group per availability zone, + as if all hosts were completely empty. It ignores current VM allocations + and all reservation blockings so that only raw hardware capacity is + considered. + type: filter-weigher + createDecisions: false + # Fetch all placement candidates, ignoring nova's preselection. + ignorePreselection: true + filters: + - name: filter_correct_az + description: | + Restricts host candidates to the requested availability zone. + - name: filter_has_enough_capacity + description: | + Filters hosts that cannot fit the flavor based on raw hardware capacity. + VM allocations and all reservation types are ignored to represent an + empty datacenter scenario. + params: + - {key: ignoredReservationTypes, stringListValue: ["CommittedResourceReservation", "FailoverReservation"]} + - name: filter_has_requested_traits + description: | + Ensures hosts have the hardware traits required by the flavor. + - name: filter_capabilities + description: | + Ensures hosts meet the compute capabilities required by the flavor + extra specs (e.g., architecture, maxphysaddr bits). + - name: filter_status_conditions + description: | + Excludes hosts that are not ready or are disabled. 
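To make the kvm-report-capacity description concrete: with allocations and reservations ignored, capacity reduces to how many instances of a flavor fit on each host's raw resources. A minimal sketch under the assumption that memory is the only dimension (the real pipeline also applies the trait, capability, and status filters listed above; the types here are illustrative, not the actual pipeline types):

package sketch

// host is an illustrative stand-in for a placement candidate.
type host struct {
	name      string
	memoryMiB int64
}

// emptyDatacenterCapacity counts how many instances of a flavor would fit if
// every host were completely empty, and on how many hosts it fits at all.
func emptyDatacenterCapacity(hosts []host, flavorMemMiB int64) (vmSlots, capacityHosts int64) {
	for _, h := range hosts {
		if fit := h.memoryMiB / flavorMemMiB; fit > 0 {
			vmSlots += fit
			capacityHosts++
		}
	}
	return vmSlots, capacityHosts
}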
+ weighers: [] {{- end }} diff --git a/helm/bundles/cortex-nova/templates/rbac.yaml b/helm/bundles/cortex-nova/templates/rbac.yaml index 36914f17c..c47a8f061 100644 --- a/helm/bundles/cortex-nova/templates/rbac.yaml +++ b/helm/bundles/cortex-nova/templates/rbac.yaml @@ -14,12 +14,60 @@ metadata: name: cortex-nova-secret-reader-binding subjects: - kind: ServiceAccount - name: cortex-nova-scheduling-controller-manager + name: {{ (index .Values "cortex-scheduling-controllers").namePrefix }}-controller-manager namespace: {{ .Release.Namespace }} - kind: ServiceAccount - name: cortex-nova-knowledge-controller-manager + name: {{ (index .Values "cortex-knowledge-controllers").namePrefix }}-controller-manager namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole name: cortex-nova-secret-reader apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cortex-nova-manager-role-hypervisor +rules: +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cortex-nova-manager-rolebinding-hypervisor-scheduling +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cortex-nova-manager-role-hypervisor +subjects: +- kind: ServiceAccount + name: {{ (index .Values "cortex-scheduling-controllers").namePrefix }}-controller-manager + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cortex-nova-manager-rolebinding-hypervisor-knowledge +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cortex-nova-manager-role-hypervisor +subjects: +- kind: ServiceAccount + name: {{ (index .Values "cortex-knowledge-controllers").namePrefix }}-controller-manager + namespace: {{ .Release.Namespace }} diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index a83a4f944..65ce2ddde 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -95,6 +95,10 @@ cortex: &cortex - cortex.cloud/v1alpha1/ReservationList - cortex.cloud/v1alpha1/CommittedResource - cortex.cloud/v1alpha1/CommittedResourceList + - cortex.cloud/v1alpha1/ProjectQuota + - cortex.cloud/v1alpha1/ProjectQuotaList + - cortex.cloud/v1alpha1/FlavorGroupCapacity + - cortex.cloud/v1alpha1/FlavorGroupCapacityList - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList - v1/Secret @@ -114,9 +118,6 @@ cortex: &cortex cortex-scheduling-controllers: <<: *cortex namePrefix: cortex-nova-scheduling - rbac: - # The cortex nova scheduling controllers need hypervisor crd access. - hypervisor: {enable: true} # Enable webhook that will validate CRDs for the scheduling controllers. webhook: {enable: true} certmanager: {enable: true} # Needed for the webhook TLS certificates. @@ -133,8 +134,17 @@ cortex-scheduling-controllers: - hypervisor-overcommit-controller - committed-resource-reservations-controller - failover-reservations-controller + - quota-controller + - capacity-controller + # Pipeline used for the empty-state capacity probe (ignores allocations and reservations). + capacityTotalPipeline: "kvm-report-capacity" + # Pipeline used for the current-state capacity probe (considers current VM allocations). 
+ capacityPlaceablePipeline: "kvm-general-purpose-load-balancing" + # How often the capacity controller re-runs its scheduler probes. + capacityReconcileInterval: 5m enabledTasks: - nova-history-cleanup-task + - commitments-sync-task # If true, the external scheduler API will limit the list of hosts in its # response to those included in the scheduling request. novaLimitHostsToRequest: true @@ -161,8 +171,10 @@ cortex-scheduling-controllers: # URL of the nova external scheduler API for placement decisions schedulerURL: "http://localhost:8080/scheduler/nova/external" committedResourceController: - # Back-off interval while CommittedResource placement is pending or failed + # Back-off interval while CommittedResource placement is pending or failed (base for exponential backoff) requeueIntervalRetry: "1m" + # Maximum back-off interval cap for the exponential retry delay + maxRequeueInterval: "30m" committedResourceAPI: # Timeout for watching CommittedResource CRDs before rolling back watchTimeout: "10s" @@ -174,6 +186,9 @@ cortex-scheduling-controllers: enableReportUsage: true # When false, the endpoint returns HTTP 503. enableReportCapacity: true + # Whether the quota API endpoint is active + # When false, the endpoint returns HTTP 503. + enableQuota: true # Maps flavor group IDs to resource flag configs; "*" acts as catch-all. # Controls handlesCommitments, hasCapacity, hasQuota per resource type for each group. flavorGroupResourceConfig: @@ -190,6 +205,11 @@ cortex-scheduling-controllers: handlesCommitments: false hasCapacity: true hasQuota: false + committedResourceUsageReconciler: + # Minimum time between usage reconcile runs for the same CommittedResource. + # Also acts as the periodic fallback interval: a successful reconcile schedules + # the next run after this duration, so this is also the maximum status staleness. + cooldownInterval: "5m" # OvercommitMappings is a list of mappings that map hypervisor traits to # overcommit ratios. Note that this list is applied in order, so if there # are multiple mappings applying to the same hypervisors, the last mapping @@ -231,9 +251,6 @@ cortex-scheduling-controllers: cortex-knowledge-controllers: <<: *cortex namePrefix: cortex-nova-knowledge - rbac: - # The cortex nova scheduling controllers need hypervisor crd access. - hypervisor: {enable: true} conf: <<: *cortexConf leaderElectionID: cortex-nova-knowledge diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index a64fe009c..b952d4322 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. type: application -version: 0.0.59 +version: 0.0.60 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.46 + version: 0.0.47 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. 
This makes it easier to find out who to contact in case diff --git a/helm/library/cortex/templates/rbac/cortex.dev_pods_role.yaml b/helm/bundles/cortex-pods/templates/rbac.yaml similarity index 53% rename from helm/library/cortex/templates/rbac/cortex.dev_pods_role.yaml rename to helm/bundles/cortex-pods/templates/rbac.yaml index 39ef1e93f..727865863 100644 --- a/helm/library/cortex/templates/rbac/cortex.dev_pods_role.yaml +++ b/helm/bundles/cortex-pods/templates/rbac.yaml @@ -1,11 +1,7 @@ -{{- if .Values.rbac.pods.enable }} ---- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - {{- include "chart.labels" . | nindent 4 }} - name: {{ .Values.namePrefix }}-manager-role-pods + name: {{ .Values.cortex.namePrefix }}-manager-role-pods rules: - apiGroups: - "" @@ -48,4 +44,16 @@ rules: - pods/binding verbs: - create -{{- end -}} \ No newline at end of file +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ .Values.cortex.namePrefix }}-manager-rolebinding-pods +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.cortex.namePrefix }}-manager-role-pods +subjects: +- kind: ServiceAccount + name: {{ .Values.cortex.namePrefix }}-controller-manager + namespace: {{ .Release.Namespace }} diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index d4519077d..1e1e9a4fa 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. type: application -version: 0.0.46 -appVersion: "sha-ab6eb45d" +version: 0.0.47 +appVersion: "sha-f9c27d07" icon: "https://example.com/icon.png" dependencies: [] diff --git a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml index 092827edd..bf63aea12 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml @@ -30,11 +30,13 @@ spec: - jsonPath: .spec.amount name: Amount type: string - - jsonPath: .status.acceptedAmount - name: AcceptedAmount + - jsonPath: .status.usedResources.memory + name: UsedMemory + priority: 1 type: string - - jsonPath: .status.usedAmount - name: UsedAmount + - jsonPath: .status.usedResources.cpu + name: UsedCPU + priority: 1 type: string - jsonPath: .spec.state name: State @@ -166,24 +168,110 @@ spec: status: description: CommittedResourceStatus defines the observed state of CommittedResource. properties: - acceptedAmount: - anyOf: - - type: integer - - type: string - description: |- - AcceptedAmount is the quantity the controller last successfully provisioned as Reservation slots. - Nil if the spec has never been successfully reconciled. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true acceptedAt: description: AcceptedAt is when the controller last successfully reconciled the spec into Reservation slots. format: date-time type: string - assignedVMs: + acceptedSpec: + description: |- + AcceptedSpec is a snapshot of Spec from the last successful reconcile. + Used by rollbackToAccepted to restore the exact previously-accepted placement (AZ, amount, + project, domain, flavor group) even when the current spec has already been mutated to a new value. 
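The acceptedSpec snapshot exists so that a rollback never depends on the live spec, which may already carry the next mutation. A hedged sketch of the restore step, using stand-in types rather than the real API structs:

package sketch

import "fmt"

// committedSpec and committedResource are stand-ins for the real CRD types.
type committedSpec struct {
	AvailabilityZone string
	Amount           string
	ProjectID        string
	DomainID         string
	FlavorGroupName  string
}

type committedResource struct {
	Spec   committedSpec
	Status struct{ AcceptedSpec *committedSpec }
}

// rollbackToAccepted restores the last successfully accepted placement,
// discarding whatever mutation is currently pending in Spec.
func rollbackToAccepted(cr *committedResource) error {
	if cr.Status.AcceptedSpec == nil {
		return fmt.Errorf("spec has never been successfully reconciled")
	}
	cr.Spec = *cr.Status.AcceptedSpec
	return nil
}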
+ properties: + allowRejection: + description: |- + AllowRejection controls what the CommittedResource controller does when placement fails + for a guaranteed or confirmed commitment. + true — controller may reject: on failure, child Reservations are rolled back and the CR + is marked Rejected. Use this when the caller is making a first-time placement + decision and a "no" answer is acceptable (e.g. the change-commitments API). + false — controller must retry: on failure, existing child Reservations are kept and the + CR is set to Reserving so the controller retries later. Use this when the caller + is restoring already-committed state that Cortex must honour (e.g. the syncer). + Only meaningful for state=guaranteed or state=confirmed; ignored for all other states. + type: boolean + amount: + anyOf: + - type: integer + - type: string + description: |- + Amount is the total committed quantity. + memory: MiB expressed in K8s binary SI notation (e.g. "1280Gi", "640Mi"). + cores: integer core count (e.g. "40"). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + availabilityZone: + description: AvailabilityZone specifies the availability zone + for this commitment. + type: string + commitmentUUID: + description: UUID of the commitment this resource corresponds + to. + type: string + confirmedAt: + description: ConfirmedAt is when the commitment was confirmed. + format: date-time + type: string + domainID: + description: DomainID of the OpenStack domain this commitment + belongs to. + type: string + endTime: + description: EndTime is when Reservation slots expire. Nil for + unbounded commitments with no expiry. + format: date-time + type: string + flavorGroupName: + description: FlavorGroupName identifies the flavor group this + commitment targets, e.g. "kvm_v2_hana_s". + type: string + projectID: + description: ProjectID of the OpenStack project this commitment + belongs to. + type: string + resourceType: + description: 'ResourceType identifies the kind of resource committed: + memory drives Reservation slots; cores uses an arithmetic check + only.' + enum: + - memory + - cores + type: string + schedulingDomain: + description: SchedulingDomain specifies the scheduling domain + for this committed resource (e.g., "nova", "ironcore"). + type: string + startTime: + description: |- + StartTime is the activation time for Reservation slots. + Nil for guaranteed commitments (slots are active from creation); set to ConfirmedAt for confirmed ones. + format: date-time + type: string + state: + description: State is the lifecycle state of the commitment. + enum: + - planned + - pending + - guaranteed + - confirmed + - superseded + - expired + type: string + required: + - amount + - availabilityZone + - commitmentUUID + - domainID + - flavorGroupName + - projectID + - resourceType + - state + type: object + assignedInstances: description: |- - AssignedVMs holds the UUIDs of VMs deterministically assigned to this committed resource. - Populated by the usage reconciler; used to compute UsedAmount and drive the quota controller. + AssignedInstances holds the UUIDs of VM instances deterministically assigned to this committed resource. + Populated by the usage reconciler; used to compute UsedResources and drive the quota controller. 
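The allowRejection flag described above boils down to one branch on placement failure. A sketch of that decision, with hypothetical state names:

package sketch

type crState string

const (
	stateRejected  crState = "Rejected"
	stateReserving crState = "Reserving"
)

// onPlacementFailure models the controller's choice for guaranteed/confirmed
// commitments whose placement failed, per the allowRejection description.
func onPlacementFailure(allowRejection bool) (next crState, rollbackChildReservations bool) {
	if allowRejection {
		// First-time placement decision (e.g. the change-commitments API):
		// a "no" answer is acceptable, so roll back and mark Rejected.
		return stateRejected, true
	}
	// Restoring already-committed state (e.g. the syncer): keep the child
	// Reservations and let the controller retry later.
	return stateReserving, false
}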
items: type: string type: array @@ -244,12 +332,6 @@ spec: - type type: object type: array - lastChanged: - description: |- - LastChanged is when the spec was last written by the syncer. - When AcceptedAt is older than LastChanged, the controller has pending work. - format: date-time - type: string lastReconcileAt: description: LastReconcileAt is when the controller last ran its reconcile loop for this resource. @@ -257,18 +339,29 @@ spec: type: string lastUsageReconcileAt: description: LastUsageReconcileAt is when the usage reconciler last - updated AssignedVMs and UsedAmount. + updated AssignedInstances and UsedResources. format: date-time type: string - usedAmount: - anyOf: - - type: integer - - type: string + usageObservedGeneration: description: |- - UsedAmount is the sum of assigned VM resources expressed in the same units as Spec.Amount. - Populated by the usage reconciler. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + UsageObservedGeneration is the CR generation that the usage reconciler last processed. + Follows the Kubernetes observedGeneration pattern: when this differs from + metadata.generation the cooldown is bypassed so spec changes (e.g. shrink) are reflected + immediately rather than waiting for the next cooldown interval. + format: int64 + minimum: 0 + type: integer + usedResources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + UsedResources is the total resource consumption of assigned VM instances, keyed by resource type + (e.g. "memory" in MiB binary SI, "cpu" as core count). Populated by the usage reconciler. + type: object type: object required: - spec diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml new file mode 100644 index 000000000..5f475689e --- /dev/null +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -0,0 +1,190 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: flavorgroupcapacities.cortex.cloud +spec: + group: cortex.cloud + names: + kind: FlavorGroupCapacity + listKind: FlavorGroupCapacityList + plural: flavorgroupcapacities + singular: flavorgroupcapacity + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.flavorGroup + name: FlavorGroup + type: string + - jsonPath: .spec.availabilityZone + name: AZ + type: string + - jsonPath: .status.totalInstances + name: TotalInstances + type: integer + - jsonPath: .status.lastReconcileAt + name: LastReconcile + type: date + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one AZ. + One CRD exists per (flavor group × AZ) pair, updated by the capacity controller on a fixed interval. + The capacity API reads these CRDs instead of probing the scheduler on each request. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of FlavorGroupCapacity + properties: + availabilityZone: + description: AvailabilityZone is the OpenStack AZ this capacity data + covers (e.g. "qa-de-1a"). + type: string + flavorGroup: + description: FlavorGroup is the name of the flavor group (e.g. "hana-v2"). + type: string + required: + - availabilityZone + - flavorGroup + type: object + status: + description: status defines the observed state of FlavorGroupCapacity + properties: + committedCapacity: + description: |- + CommittedCapacity is the sum of AcceptedAmount across active CommittedResource CRDs, + expressed in multiples of the smallest flavor's memory. + format: int64 + type: integer + conditions: + description: The current status conditions of the FlavorGroupCapacity. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + flavors: + description: Flavors holds per-flavor capacity data for all flavors + in the group. + items: + description: FlavorCapacityStatus holds per-flavor capacity numbers + for one (flavor group × AZ) pair. 
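The committedCapacity unit ("multiples of the smallest flavor's memory") reads as a plain integer division. A sketch under that assumption, with hypothetical numbers; how the accepted amount of each CommittedResource is obtained is left out here:

package sketch

// committedUnits expresses a committed memory total in multiples of the
// smallest flavor's memory, rounding down. Both arguments are in MiB.
func committedUnits(committedMiB, smallestFlavorMiB int64) int64 {
	if smallestFlavorMiB <= 0 {
		return 0
	}
	return committedMiB / smallestFlavorMiB
}

// Example: 1280 GiB committed against a 64 GiB smallest flavor yields 20 units.
var _ = committedUnits(1280<<10, 64<<10)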
+ properties: + flavorName: + description: FlavorName is the OpenStack flavor name (e.g. "hana-v2-small"). + type: string + placeableHosts: + description: PlaceableHosts is the number of hosts that can + still fit this flavor given current allocations. + format: int64 + type: integer + placeableVms: + description: PlaceableVMs is the number of VM slots remaining + for this flavor given current allocations. + format: int64 + type: integer + totalCapacityHosts: + description: TotalCapacityHosts is the number of eligible hosts + in an empty-datacenter scenario. + format: int64 + type: integer + totalCapacityVmSlots: + description: TotalCapacityVMSlots is the maximum number of VM + slots in an empty-datacenter scenario. + format: int64 + type: integer + required: + - flavorName + type: object + type: array + lastReconcileAt: + description: LastReconcileAt is the timestamp of the last successful + reconcile. + format: date-time + type: string + totalInstances: + description: |- + TotalInstances is the total number of VM instances running on hypervisors in this AZ, + derived from Hypervisor CRD Status.Instances (not filtered by flavor group). + format: int64 + type: integer + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml new file mode 100644 index 000000000..c9183638b --- /dev/null +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -0,0 +1,246 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: projectquotas.cortex.cloud +spec: + group: cortex.cloud + names: + kind: ProjectQuota + listKind: ProjectQuotaList + plural: projectquotas + singular: projectquota + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.projectID + name: Project + type: string + - jsonPath: .spec.domainID + name: Domain + type: string + - jsonPath: .status.lastReconcileAt + name: LastReconcile + type: date + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ProjectQuota is the Schema for the projectquotas API. + It persists quota values pushed by Limes via the LIQUID quota endpoint + (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + ProjectQuotaSpec defines the desired state of ProjectQuota. + Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). 
+ See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + properties: + domainID: + description: |- + DomainID of the OpenStack domain this project belongs to. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.UUID. + type: string + domainName: + description: |- + DomainName is the human-readable name of the OpenStack domain. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.Name. + type: string + projectID: + description: |- + ProjectID of the OpenStack project this quota belongs to. + Corresponds to the :uuid in the PUT URL path. + type: string + projectName: + description: |- + ProjectName is the human-readable name of the OpenStack project. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Name. + type: string + quota: + additionalProperties: + description: |- + ResourceQuota holds the quota for a single resource with per-AZ breakdown. + Maps to liquid.ResourceQuotaRequest from the LIQUID API. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds the per-availability-zone quota breakdown. + Key: availability zone name, Value: quota for that AZ. + Only populated for AZSeparatedTopology resources. + Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest + type: object + quota: + description: |- + Quota is the total quota across all AZs (for compatibility). + Corresponds to liquid.ResourceQuotaRequest.Quota. + format: int64 + type: integer + required: + - quota + type: object + description: |- + Quota maps LIQUID resource names to their per-AZ quota. + Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") + Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + type: object + required: + - domainID + - projectID + type: object + status: + description: |- + ProjectQuotaStatus defines the observed state of ProjectQuota. + Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport + properties: + conditions: + description: Conditions holds the current status conditions. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. 
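The quota map above mirrors liquid.ServiceQuotaRequest; flattening one resource entry into the stored shape is a small transformation. A hedged sketch with stand-in structs (field names follow the descriptions above; see the linked go-api-declarations docs for the authoritative types):

package sketch

// azResourceQuotaRequest and resourceQuotaRequest are stand-ins mirroring
// liquid.ResourceQuotaRequest.Quota and .PerAZ[az].Quota.
type azResourceQuotaRequest struct{ Quota int64 }

type resourceQuotaRequest struct {
	Quota int64
	PerAZ map[string]azResourceQuotaRequest
}

// toSpecQuota flattens one resource's LIQUID quota request into the total and
// per-AZ map that the ProjectQuota spec persists.
func toSpecQuota(req resourceQuotaRequest) (total int64, perAZ map[string]int64) {
	perAZ = make(map[string]int64, len(req.PerAZ))
	for az, q := range req.PerAZ {
		perAZ[az] = q.Quota
	}
	return req.Quota, perAZ
}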
+ format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastFullReconcileAt: + description: |- + LastFullReconcileAt is when the periodic full reconcile last completed for this project. + Used as the watermark for isVMNewSinceLastReconcile (incremental add detection). + Only updated by ReconcilePeriodic, NOT by PaygUsage recomputes or incremental deltas. + format: date-time + type: string + lastReconcileAt: + description: LastReconcileAt is when the controller last reconciled + this project's quota (any path). + format: date-time + type: string + observedGeneration: + description: |- + ObservedGeneration is the most recent spec generation that the controller has processed. + Used to distinguish spec changes (which require TotalUsage recompute) from + CommittedResource changes (which only need PaygUsage recompute). + format: int64 + type: integer + paygUsage: + additionalProperties: + description: ResourceQuotaUsage holds per-AZ PAYG usage for a single + resource. + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds per-availability-zone PAYG usage values. + Key: availability zone name, Value: PAYG usage in that AZ. + type: object + type: object + description: |- + PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + Derived as TotalUsage - CRUsage (clamped >= 0). + Key: liquid.ResourceName + type: object + totalUsage: + additionalProperties: + description: ResourceQuotaUsage holds per-AZ PAYG usage for a single + resource. + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds per-availability-zone PAYG usage values. + Key: availability zone name, Value: PAYG usage in that AZ. + type: object + type: object + description: |- + TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + Persisted by the quota controller; updated by full reconcile and HV instance diffs. + Key: liquid.ResourceName + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/helm/library/cortex/templates/rbac/compute.ironcore.dev_role_binding.yaml b/helm/library/cortex/templates/rbac/compute.ironcore.dev_role_binding.yaml deleted file mode 100644 index b28e869c7..000000000 --- a/helm/library/cortex/templates/rbac/compute.ironcore.dev_role_binding.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if .Values.rbac.ironcore.enable }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - {{- include "chart.labels" . 
| nindent 4 }} - name: {{ .Values.namePrefix }}-manager-rolebinding-ironcore -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ .Values.namePrefix }}-manager-role-ironcore -subjects: -- kind: ServiceAccount - name: {{ .Values.namePrefix }}-{{ .Values.controllerManager.serviceAccountName }} - namespace: {{ .Release.Namespace }} -{{- end -}} \ No newline at end of file diff --git a/helm/library/cortex/templates/rbac/cortex.dev_pods_role_binding.yaml b/helm/library/cortex/templates/rbac/cortex.dev_pods_role_binding.yaml deleted file mode 100644 index 74770ec3e..000000000 --- a/helm/library/cortex/templates/rbac/cortex.dev_pods_role_binding.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if .Values.rbac.pods.enable }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - {{- include "chart.labels" . | nindent 4 }} - name: {{ .Values.namePrefix }}-manager-rolebinding-pods -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ .Values.namePrefix }}-manager-role-pods -subjects: -- kind: ServiceAccount - name: {{ .Values.namePrefix }}-{{ .Values.controllerManager.serviceAccountName }} - namespace: {{ .Release.Namespace }} -{{- end -}} \ No newline at end of file diff --git a/helm/library/cortex/templates/rbac/hypervisor_role.yaml b/helm/library/cortex/templates/rbac/hypervisor_role.yaml deleted file mode 100644 index 0a2fefa00..000000000 --- a/helm/library/cortex/templates/rbac/hypervisor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -{{- if .Values.rbac.hypervisor.enable }} ---- -# TODO: Check if this role can be part of the nova bundle, not the core library -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - {{- include "chart.labels" . | nindent 4 }} - name: {{ .Values.namePrefix }}-manager-role-hypervisor -rules: -- apiGroups: - - kvm.cloud.sap - resources: - - hypervisors - verbs: - - get - - list - - patch - - update - - watch -- apiGroups: - - kvm.cloud.sap - resources: - - hypervisors/status - verbs: - - get -{{- end -}} \ No newline at end of file diff --git a/helm/library/cortex/templates/rbac/hypervisor_role_binding.yaml b/helm/library/cortex/templates/rbac/hypervisor_role_binding.yaml deleted file mode 100644 index 7c41c4513..000000000 --- a/helm/library/cortex/templates/rbac/hypervisor_role_binding.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if .Values.rbac.hypervisor.enable }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - {{- include "chart.labels" . 
| nindent 4 }} - name: {{ .Values.namePrefix }}-manager-rolebinding-hypervisor -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ .Values.namePrefix }}-manager-role-hypervisor -subjects: -- kind: ServiceAccount - name: {{ .Values.namePrefix }}-{{ .Values.controllerManager.serviceAccountName }} - namespace: {{ .Release.Namespace }} -{{- end -}} \ No newline at end of file diff --git a/helm/library/cortex/templates/rbac/role.yaml b/helm/library/cortex/templates/rbac/role.yaml index ea75c6897..661a3be71 100644 --- a/helm/library/cortex/templates/rbac/role.yaml +++ b/helm/library/cortex/templates/rbac/role.yaml @@ -14,6 +14,8 @@ rules: - datasources - reservations - committedresources + - projectquotas + - flavorgroupcapacities - decisions - deschedulings - pipelines @@ -34,6 +36,8 @@ rules: - datasources/finalizers - reservations/finalizers - committedresources/finalizers + - projectquotas/finalizers + - flavorgroupcapacities/finalizers - decisions/finalizers - deschedulings/finalizers - pipelines/finalizers @@ -48,6 +52,8 @@ rules: - datasources/status - reservations/status - committedresources/status + - projectquotas/status + - flavorgroupcapacities/status - decisions/status - deschedulings/status - pipelines/status diff --git a/helm/library/cortex/values.yaml b/helm/library/cortex/values.yaml index f6b94c9ef..49f6cb227 100644 --- a/helm/library/cortex/values.yaml +++ b/helm/library/cortex/values.yaml @@ -55,15 +55,6 @@ controllerManager: # [RBAC]: To enable RBAC (Permissions) configurations rbac: enable: true - # Whether the ironcore roles should be deployed as well. - ironcore: - enable: false - pods: - enable: false - # Whether hypervisor operator/crd related roles should be deployed. - # See: https://github.com/cobaltcore-dev/openstack-hypervisor-operator - hypervisor: - enable: false # [CRDs]: To enable the CRDs crd: diff --git a/internal/knowledge/datasources/plugins/openstack/controller_test.go b/internal/knowledge/datasources/plugins/openstack/controller_test.go index 899e83237..586238d54 100644 --- a/internal/knowledge/datasources/plugins/openstack/controller_test.go +++ b/internal/knowledge/datasources/plugins/openstack/controller_test.go @@ -104,6 +104,7 @@ func TestNovaDatasourceTypeConstants(t *testing.T) { {v1alpha1.NovaDatasourceTypeFlavors, "flavors"}, {v1alpha1.NovaDatasourceTypeMigrations, "migrations"}, {v1alpha1.NovaDatasourceTypeAggregates, "aggregates"}, + {v1alpha1.NovaDatasourceTypeImages, "images"}, } for _, test := range tests { diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go index ca25868f2..03298e9db 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go @@ -10,14 +10,17 @@ import ( "log/slog" "net/http" "net/url" + "strings" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources" "github.com/cobaltcore-dev/cortex/pkg/keystone" "github.com/gophercloud/gophercloud/v2" + "github.com/gophercloud/gophercloud/v2/openstack" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" + glanceimages "github.com/gophercloud/gophercloud/v2/openstack/image/v2/images" "github.com/gophercloud/gophercloud/v2/pagination" "github.com/prometheus/client_golang/prometheus" ) @@ -37,6 +40,8 @@ type NovaAPI interface { 
GetAllMigrations(ctx context.Context) ([]Migration, error) // Get all aggregates. GetAllAggregates(ctx context.Context) ([]Aggregate, error) + // Get all Glance images with pre-computed os_type. + GetAllImages(ctx context.Context) ([]Image, error) } // API for OpenStack Nova. @@ -47,8 +52,10 @@ type novaAPI struct { keystoneClient keystone.KeystoneClient // Nova configuration. conf v1alpha1.NovaDatasource - // Authenticated OpenStack service client to fetch the data. + // Authenticated OpenStack compute service client. sc *gophercloud.ServiceClient + // Authenticated Glance image service client (only used for NovaDatasourceTypeImages). + glance *gophercloud.ServiceClient } func NewNovaAPI(mon datasources.Monitor, k keystone.KeystoneClient, conf v1alpha1.NovaDatasource) NovaAPI { @@ -78,6 +85,16 @@ func (api *novaAPI) Init(ctx context.Context) error { // Since 2.61, the extra_specs are returned in the flavor details. Microversion: "2.61", } + // Initialize the Glance client only when this datasource is used for images. + if api.conf.Type == v1alpha1.NovaDatasourceTypeImages { + glanceClient, err := openstack.NewImageV2(provider, gophercloud.EndpointOpts{ + Availability: gophercloud.Availability(sameAsKeystone), + }) + if err != nil { + return fmt.Errorf("failed to create Glance client: %w", err) + } + api.glance = glanceClient + } return nil } @@ -436,3 +453,72 @@ func (api *novaAPI) GetAllAggregates(ctx context.Context) ([]Aggregate, error) { } return aggregates, nil } + +// GetAllImages fetches all Glance images and returns them with pre-computed os_type. +// See deriveOSType for the derivation logic. +func (api *novaAPI) GetAllImages(ctx context.Context) ([]Image, error) { + if api.glance == nil { + return nil, fmt.Errorf("glance client not initialized: datasource type must be %q", v1alpha1.NovaDatasourceTypeImages) + } + + label := Image{}.TableName() + slog.Info("fetching nova data", "label", label) + if api.mon.RequestTimer != nil { + hist := api.mon.RequestTimer.WithLabelValues(label) + timer := prometheus.NewTimer(hist) + defer timer.ObserveDuration() + } + + var result []Image + opts := glanceimages.ListOpts{Limit: 1000} + err := glanceimages.List(api.glance, opts).EachPage(ctx, func(_ context.Context, page pagination.Page) (bool, error) { + imgs, err := glanceimages.ExtractImages(page) + if err != nil { + return false, err + } + for _, img := range imgs { + result = append(result, Image{ + ID: img.ID, + OSType: deriveOSType(img.Properties, img.Tags), + }) + } + return true, nil + }) + if err != nil { + return nil, fmt.Errorf("failed to list Glance images: %w", err) + } + slog.Info("fetched", "label", label, "count", len(result)) + return result, nil +} + +// deriveOSType computes os_type from image properties and tags. +// Mirrors the logic of OSTypeProber.findFromImage in github.com/sapcc/go-bits/liquidapi, +// with two intentional simplifications: +// 1. No regex validation on vmware_ostype — Nova validates that field at VM boot time, +// so any value stored in Glance is already valid. +// 2. Volume-booted VMs are not yet supported — os_type will be "unknown" for them. +// Supporting them would require per-VM Cinder calls (volume_image_metadata.vmware_ostype) +// either at server sync time or via a dedicated datasource. 
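+//
+// A minimal usage sketch (hypothetical inputs):
+//
+//	deriveOSType(map[string]any{"vmware_ostype": "rhel8_64Guest"}, nil) // "rhel8_64Guest"
+//	deriveOSType(nil, []string{"ostype:ubuntu"})                        // "ubuntu"
+//	deriveOSType(nil, []string{"ostype:ubuntu", "ostype:debian"})       // "unknown"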
+func deriveOSType(properties map[string]any, tags []string) string { + if v, ok := properties["vmware_ostype"]; ok { + if s, ok := v.(string); ok && s != "" { + return s + } + } + var osType string + for _, tag := range tags { + if after, ok := strings.CutPrefix(tag, "ostype:"); ok { + if osType == "" { + osType = after + } else { + // multiple ostype: tags → ambiguous, fall through to unknown + osType = "" + break + } + } + } + if osType != "" { + return osType + } + return "unknown" +} diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_api_test.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_api_test.go index 49f0af4b4..63f83c176 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_api_test.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_api_test.go @@ -538,6 +538,66 @@ func TestNovaAPI_GetAllHypervisors_DeduplicatesHypervisors(t *testing.T) { } } +func TestDeriveOSType(t *testing.T) { + tests := []struct { + name string + properties map[string]any + tags []string + want string + }{ + { + name: "vmware_ostype property wins", + properties: map[string]any{"vmware_ostype": "windows8Server64Guest"}, + tags: []string{"ostype:linux"}, + want: "windows8Server64Guest", + }, + { + name: "vmware_ostype empty string falls through to tags", + properties: map[string]any{"vmware_ostype": ""}, + tags: []string{"ostype:debian"}, + want: "debian", + }, + { + name: "vmware_ostype not a string falls through", + properties: map[string]any{"vmware_ostype": 42}, + tags: []string{"ostype:centos"}, + want: "centos", + }, + { + name: "single ostype tag", + properties: map[string]any{}, + tags: []string{"ostype:ubuntu", "env:prod"}, + want: "ubuntu", + }, + { + name: "multiple ostype tags: ambiguous, returns unknown", + properties: map[string]any{}, + tags: []string{"ostype:ubuntu", "ostype:debian"}, + want: "unknown", + }, + { + name: "no properties, no tags", + properties: map[string]any{}, + tags: nil, + want: "unknown", + }, + { + name: "tags without ostype prefix", + properties: map[string]any{}, + tags: []string{"env:prod", "region:eu"}, + want: "unknown", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := deriveOSType(tt.properties, tt.tags); got != tt.want { + t.Errorf("deriveOSType() = %q, want %q", got, tt.want) + } + }) + } +} + func TestNovaAPI_GetAllMigrations_DeduplicatesMigrations(t *testing.T) { tests := []struct { name string diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go index b8b0bc35d..a2c466c42 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go @@ -45,6 +45,8 @@ func (s *NovaSyncer) Init(ctx context.Context) error { tables = append(tables, s.DB.AddTable(Migration{})) case v1alpha1.NovaDatasourceTypeAggregates: tables = append(tables, s.DB.AddTable(Aggregate{})) + case v1alpha1.NovaDatasourceTypeImages: + tables = append(tables, s.DB.AddTable(Image{})) } return s.DB.CreateTable(tables...) 
} @@ -67,6 +69,8 @@ func (s *NovaSyncer) Sync(ctx context.Context) (int64, error) { nResults, err = s.SyncAllMigrations(ctx) case v1alpha1.NovaDatasourceTypeAggregates: nResults, err = s.SyncAllAggregates(ctx) + case v1alpha1.NovaDatasourceTypeImages: + nResults, err = s.SyncAllImages(ctx) } return nResults, err } @@ -192,6 +196,26 @@ func (s *NovaSyncer) SyncAllMigrations(ctx context.Context) (int64, error) { return int64(len(allMigrations)), nil } +// Sync all Glance images into the database with pre-computed os_type. +func (s *NovaSyncer) SyncAllImages(ctx context.Context) (int64, error) { + allImages, err := s.API.GetAllImages(ctx) + if err != nil { + return 0, err + } + err = db.ReplaceAll(s.DB, allImages...) + if err != nil { + return 0, err + } + label := Image{}.TableName() + if s.Mon.ObjectsGauge != nil { + s.Mon.ObjectsGauge.WithLabelValues(label).Set(float64(len(allImages))) + } + if s.Mon.RequestProcessedCounter != nil { + s.Mon.RequestProcessedCounter.WithLabelValues(label).Inc() + } + return int64(len(allImages)), nil +} + // Sync the OpenStack aggregates into the database. func (s *NovaSyncer) SyncAllAggregates(ctx context.Context) (int64, error) { allAggregates, err := s.API.GetAllAggregates(ctx) diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_sync_test.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_sync_test.go index b88d25aa6..fdd826171 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_sync_test.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_sync_test.go @@ -12,7 +12,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" ) type mockNovaAPI struct{} @@ -56,6 +55,10 @@ func (m *mockNovaAPI) GetAllAggregates(ctx context.Context) ([]Aggregate, error) return []Aggregate{{Name: "aggregate1"}}, nil } +func (m *mockNovaAPI) GetAllImages(ctx context.Context) ([]Image, error) { + return []Image{{ID: "img-1", OSType: "windows8Server64Guest"}}, nil +} + func TestNovaSyncer_Init(t *testing.T) { dbEnv := testlibDB.SetupDBEnv(t) testDB := db.DB{DbMap: dbEnv.DbMap} @@ -133,7 +136,7 @@ func TestNovaSyncer_SyncDeletedServers(t *testing.T) { }, { Name: "custom time", - DeletedServersChangesSinceMinutes: testlib.Ptr(60), + DeletedServersChangesSinceMinutes: new(60), ExpectedAmountOfDeletedServers: 1, }, } @@ -268,3 +271,35 @@ func TestNovaSyncer_SyncAggregates(t *testing.T) { t.Fatalf("expected 1 aggregate, got %d", n) } } + +func TestNovaSyncer_SyncImages(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + mon := datasources.Monitor{} + syncer := &NovaSyncer{ + DB: testDB, + Mon: mon, + Conf: v1alpha1.NovaDatasource{Type: v1alpha1.NovaDatasourceTypeImages}, + API: &mockNovaAPI{}, + } + + ctx := t.Context() + if err := syncer.Init(ctx); err != nil { + t.Fatalf("failed to init images syncer: %v", err) + } + n, err := syncer.Sync(ctx) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if n != 1 { + t.Fatalf("expected 1 image, got %d", n) + } + var images []Image + if _, err := testDB.Select(&images, "SELECT * FROM "+Image{}.TableName()); err != nil { + t.Fatalf("select images: %v", err) + } + if len(images) != 1 || images[0].ID != "img-1" || images[0].OSType != "windows8Server64Guest" { + t.Errorf("unexpected images in DB: 
%+v", images) + } +} diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go index 1be2b7a29..5fef71d6e 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go @@ -5,6 +5,7 @@ package nova import ( "encoding/json" + "log/slog" ) // OpenStack server model as returned by the Nova API under /servers/detail. @@ -111,6 +112,10 @@ type Server struct { // From nested server.flavor JSON FlavorName string `json:"-" db:"flavor_name"` + // ImageRef is the Glance image UUID the server was booted from. + // Empty string for volume-booted servers. + ImageRef string `json:"-" db:"image_ref"` + // From nested server.fault JSON // The error response code. @@ -136,6 +141,8 @@ func (s *Server) UnmarshalJSON(data []byte) error { aux := &struct { Flavor json.RawMessage `json:"flavor"` Fault *json.RawMessage `json:"fault,omitempty"` + // Nova returns image as a map {"id": "..."} for image-booted or "" for volume-booted. + Image json.RawMessage `json:"image"` *Alias }{ Alias: (*Alias)(s), @@ -151,6 +158,17 @@ func (s *Server) UnmarshalJSON(data []byte) error { return err } s.FlavorName = flavor.Name + // Parse image ref: map → extract id; empty string → leave blank (volume-booted). + if len(aux.Image) > 0 && aux.Image[0] == '{' { + var imageMap struct { + ID string `json:"id"` + } + if err := json.Unmarshal(aux.Image, &imageMap); err != nil { + slog.Warn("failed to parse image ref from server response, leaving blank", "error", err, "serverID", s.ID) + } else { + s.ImageRef = imageMap.ID + } + } var fault struct { Code uint `json:"code"` Created string `json:"created"` @@ -194,20 +212,29 @@ func (s *Server) MarshalJSON() ([]byte, error) { Details: s.FaultDetails, } } + // Represent image as {"id": ""} for image-booted or "" for volume-booted. + var imageVal any + if s.ImageRef != "" { + imageVal = map[string]string{"id": s.ImageRef} + } else { + imageVal = "" + } aux := &struct { Flavor flavor `json:"flavor"` Fault *fault `json:"fault,omitempty"` + Image any `json:"image"` *Alias }{ Alias: (*Alias)(s), Flavor: flavorVal, Fault: faultVal, + Image: imageVal, } return json.Marshal(aux) } // Table in which the openstack model is stored. -func (Server) TableName() string { return "openstack_servers_v2" } +func (Server) TableName() string { return "openstack_servers_v3" } // Index for the openstack model. func (Server) Indexes() map[string][]string { return nil } @@ -481,3 +508,17 @@ func (Aggregate) TableName() string { return "openstack_aggregates_v2" } // Index for the openstack model. func (Aggregate) Indexes() map[string][]string { return nil } + +// Image stores pre-computed os_type for a Glance image UUID. +// Populated by the NovaDatasourceTypeImages syncer from the Glance API. +// Used by the CR usage API to include os_type in VM subresources without live API calls. +type Image struct { + ID string `json:"id" db:"id,primarykey"` + OSType string `json:"os_type" db:"os_type"` +} + +// Table in which the openstack model is stored. +func (Image) TableName() string { return "openstack_images" } + +// Index for the openstack model. 
+func (Image) Indexes() map[string][]string { return nil } diff --git a/internal/knowledge/extractor/plugins/compute/host_az_test.go b/internal/knowledge/extractor/plugins/compute/host_az_test.go index d3f47bec6..380674a90 100644 --- a/internal/knowledge/extractor/plugins/compute/host_az_test.go +++ b/internal/knowledge/extractor/plugins/compute/host_az_test.go @@ -12,7 +12,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" ) func TestHostAZExtractor_Init(t *testing.T) { @@ -52,13 +51,13 @@ func TestHostAZExtractor_Extract(t *testing.T) { aggregates := []any{ // Test to find the first aggregate for computeHost1 with availability_zone != null - &nova.Aggregate{UUID: "agg1", Name: "something_else", AvailabilityZone: nil, ComputeHost: testlib.Ptr("host1"), Metadata: "{}"}, - &nova.Aggregate{UUID: "agg2", Name: "az1", AvailabilityZone: testlib.Ptr("az1"), ComputeHost: testlib.Ptr("host1"), Metadata: "{}"}, + &nova.Aggregate{UUID: "agg1", Name: "something_else", AvailabilityZone: nil, ComputeHost: new("host1"), Metadata: "{}"}, + &nova.Aggregate{UUID: "agg2", Name: "az1", AvailabilityZone: new("az1"), ComputeHost: new("host1"), Metadata: "{}"}, // Test to check that we get null when there is an aggregate for computeHost2 but without availability_zone - &nova.Aggregate{UUID: "agg3", Name: "something_else_again", AvailabilityZone: nil, ComputeHost: testlib.Ptr("host2"), Metadata: "{}"}, + &nova.Aggregate{UUID: "agg3", Name: "something_else_again", AvailabilityZone: nil, ComputeHost: new("host2"), Metadata: "{}"}, // No aggregate for computeHost3 // Should find an availability zone for computeHost4 - &nova.Aggregate{UUID: "agg4", Name: "az2", AvailabilityZone: testlib.Ptr("az2"), ComputeHost: testlib.Ptr("host4"), Metadata: "{}"}, + &nova.Aggregate{UUID: "agg4", Name: "az2", AvailabilityZone: new("az2"), ComputeHost: new("host4"), Metadata: "{}"}, } if err := testDB.Insert(aggregates...); err != nil { @@ -78,7 +77,7 @@ func TestHostAZExtractor_Extract(t *testing.T) { expectedHostAZs := []HostAZ{ { ComputeHost: "host1", - AvailabilityZone: testlib.Ptr("az1"), + AvailabilityZone: new("az1"), }, // Aggregate without availability_zone provided for host { @@ -92,7 +91,7 @@ func TestHostAZExtractor_Extract(t *testing.T) { }, { ComputeHost: "host4", - AvailabilityZone: testlib.Ptr("az2"), + AvailabilityZone: new("az2"), }, } diff --git a/internal/knowledge/extractor/plugins/compute/host_details_test.go b/internal/knowledge/extractor/plugins/compute/host_details_test.go index 5c701e225..68a3010d9 100644 --- a/internal/knowledge/extractor/plugins/compute/host_details_test.go +++ b/internal/knowledge/extractor/plugins/compute/host_details_test.go @@ -12,7 +12,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/placement" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -44,13 +43,13 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { } hostPinnedProjects, err := v1alpha1.BoxFeatureList([]any{ - &HostPinnedProjects{ComputeHost: testlib.Ptr("nova-compute-bb01"), Label: 
testlib.Ptr("project-123")}, - &HostPinnedProjects{ComputeHost: testlib.Ptr("nova-compute-bb01"), Label: testlib.Ptr("project-456")}, - &HostPinnedProjects{ComputeHost: testlib.Ptr("node001-bb02"), Label: nil}, + &HostPinnedProjects{ComputeHost: new("nova-compute-bb01"), Label: new("project-123")}, + &HostPinnedProjects{ComputeHost: new("nova-compute-bb01"), Label: new("project-456")}, + &HostPinnedProjects{ComputeHost: new("node001-bb02"), Label: nil}, // No entry for ironic-host-1 since it is excluded in the feature host pinned projects - &HostPinnedProjects{ComputeHost: testlib.Ptr("node002-bb03"), Label: nil}, - &HostPinnedProjects{ComputeHost: testlib.Ptr("node003-bb03"), Label: nil}, - &HostPinnedProjects{ComputeHost: testlib.Ptr("node004-bb03"), Label: nil}, + &HostPinnedProjects{ComputeHost: new("node002-bb03"), Label: nil}, + &HostPinnedProjects{ComputeHost: new("node003-bb03"), Label: nil}, + &HostPinnedProjects{ComputeHost: new("node004-bb03"), Label: nil}, }) if err != nil { t.Fatalf("expected no error, got %v", err) @@ -67,9 +66,9 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { // Host with no special traits &nova.Hypervisor{ID: "uuid4", ServiceHost: "node002-bb03", HypervisorType: "test", RunningVMs: 2, State: "up", Status: "enabled"}, // Host with disabled status, no entry in the resource providers - &nova.Hypervisor{ID: "uuid5", ServiceHost: "node003-bb03", HypervisorType: "test", RunningVMs: 2, State: "up", Status: "disabled", ServiceDisabledReason: testlib.Ptr("example reason")}, + &nova.Hypervisor{ID: "uuid5", ServiceHost: "node003-bb03", HypervisorType: "test", RunningVMs: 2, State: "up", Status: "disabled", ServiceDisabledReason: new("example reason")}, // Host with disabled trait - &nova.Hypervisor{ID: "uuid6", ServiceHost: "node004-bb03", HypervisorType: "test", RunningVMs: 2, State: "up", Status: "enabled", ServiceDisabledReason: testlib.Ptr("example reason")}, + &nova.Hypervisor{ID: "uuid6", ServiceHost: "node004-bb03", HypervisorType: "test", RunningVMs: 2, State: "up", Status: "enabled", ServiceDisabledReason: new("example reason")}, } if err := testDB.Insert(hypervisors...); err != nil { @@ -96,12 +95,12 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { } hostAvailabilityZones, err := v1alpha1.BoxFeatureList([]any{ - &HostAZ{AvailabilityZone: testlib.Ptr("az1"), ComputeHost: "nova-compute-bb01"}, + &HostAZ{AvailabilityZone: new("az1"), ComputeHost: "nova-compute-bb01"}, &HostAZ{AvailabilityZone: nil, ComputeHost: "node001-bb02"}, - &HostAZ{AvailabilityZone: testlib.Ptr("az2"), ComputeHost: "node002-bb03"}, - &HostAZ{AvailabilityZone: testlib.Ptr("az2"), ComputeHost: "ironic-host-01"}, - &HostAZ{AvailabilityZone: testlib.Ptr("az2"), ComputeHost: "node003-bb03"}, - &HostAZ{AvailabilityZone: testlib.Ptr("az2"), ComputeHost: "node004-bb03"}, + &HostAZ{AvailabilityZone: new("az2"), ComputeHost: "node002-bb03"}, + &HostAZ{AvailabilityZone: new("az2"), ComputeHost: "ironic-host-01"}, + &HostAZ{AvailabilityZone: new("az2"), ComputeHost: "node003-bb03"}, + &HostAZ{AvailabilityZone: new("az2"), ComputeHost: "node004-bb03"}, }) if err != nil { t.Fatalf("expected no error, got %v", err) @@ -153,7 +152,7 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { Enabled: false, Decommissioned: false, ExternalCustomer: false, - DisabledReason: testlib.Ptr("[down] --"), + DisabledReason: new("[down] --"), RunningVMs: 3, PinnedProjects: nil, }, @@ -181,7 +180,7 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { Enabled: false, Decommissioned: false, 
ExternalCustomer: false, - DisabledReason: testlib.Ptr("[disabled] example reason"), + DisabledReason: new("[disabled] example reason"), RunningVMs: 2, PinnedProjects: nil, }, @@ -195,7 +194,7 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { Enabled: false, Decommissioned: false, ExternalCustomer: false, - DisabledReason: testlib.Ptr("[disabled] example reason"), + DisabledReason: new("[disabled] example reason"), RunningVMs: 2, PinnedProjects: nil, }, @@ -211,7 +210,7 @@ func TestHostDetailsExtractor_Extract(t *testing.T) { ExternalCustomer: true, DisabledReason: nil, RunningVMs: 5, - PinnedProjects: testlib.Ptr("project-123,project-456"), + PinnedProjects: new("project-123,project-456"), }, } diff --git a/internal/knowledge/extractor/plugins/compute/host_pinned_project_test.go b/internal/knowledge/extractor/plugins/compute/host_pinned_project_test.go index df569d566..7ff5375b9 100644 --- a/internal/knowledge/extractor/plugins/compute/host_pinned_project_test.go +++ b/internal/knowledge/extractor/plugins/compute/host_pinned_project_test.go @@ -14,7 +14,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" ) func TestHostPinnedProjectsExtractor_Init(t *testing.T) { @@ -40,13 +39,13 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1, project_id_2"}`, }, &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host2"), + ComputeHost: new("host2"), Metadata: `{"filter_tenant_id":"project_id_1, project_id_2"}`, }, @@ -71,36 +70,36 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { }, expected: []HostPinnedProjects{ { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_2"), - DomainID: testlib.Ptr("domain_id_2"), - Label: testlib.Ptr("project_name_2 (domain_name_2)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_2"), + DomainID: new("domain_id_2"), + Label: new("project_name_2 (domain_name_2)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host2"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host2"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host2"), - ProjectID: testlib.Ptr("project_id_2"), - DomainID: testlib.Ptr("domain_id_2"), 
- Label: testlib.Ptr("project_name_2 (domain_name_2)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host2"), + ProjectID: new("project_id_2"), + DomainID: new("domain_id_2"), + Label: new("project_name_2 (domain_name_2)"), }, }, }, @@ -110,7 +109,7 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "ignore-no-filter-tenant", UUID: "ignore", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"something_different":"project_id_1, project_id_2"}`, }, }, @@ -130,7 +129,7 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "agg2", UUID: "agg2", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1, project_id_2"}`, }, @@ -157,36 +156,36 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { }, expected: []HostPinnedProjects{ { - AggregateName: testlib.Ptr("agg2"), - AggregateUUID: testlib.Ptr("agg2"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg2"), + AggregateUUID: new("agg2"), + ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg2"), - AggregateUUID: testlib.Ptr("agg2"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_2"), - DomainID: testlib.Ptr("domain_id_2"), - Label: testlib.Ptr("project_name_2 (domain_name_2)"), + AggregateName: new("agg2"), + AggregateUUID: new("agg2"), + ComputeHost: new("host1"), + ProjectID: new("project_id_2"), + DomainID: new("domain_id_2"), + Label: new("project_name_2 (domain_name_2)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), ComputeHost: nil, - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), ComputeHost: nil, - ProjectID: testlib.Ptr("project_id_2"), - DomainID: testlib.Ptr("domain_id_2"), - Label: testlib.Ptr("project_name_2 (domain_name_2)"), + ProjectID: new("project_id_2"), + DomainID: new("domain_id_2"), + Label: new("project_name_2 (domain_name_2)"), }, }, }, @@ -234,7 +233,7 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { { AggregateName: nil, AggregateUUID: nil, - ComputeHost: testlib.Ptr("host2"), + ComputeHost: new("host2"), ProjectID: nil, DomainID: nil, Label: nil, @@ -242,7 +241,7 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { { AggregateName: nil, AggregateUUID: nil, - ComputeHost: testlib.Ptr("host3"), + ComputeHost: new("host3"), ProjectID: nil, DomainID: nil, Label: nil, @@ -273,32 +272,32 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1"}`, }, &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host2"), + ComputeHost: new("host2"), Metadata: `{"filter_tenant_id":"project_id_1"}`, }, 
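// A minimal sketch of the pointer-helper migration applied throughout
// these hunks: every testlib.Ptr(v) becomes new(v). This assumes a
// toolchain in which the builtin new accepts an expression and returns
// a pointer to a copy of its value (the new("host2") calls above only
// type-check under that form), and that testlib.Ptr was the usual
// generic helper:
//
//	func Ptr[T any](v T) *T { return &v } // helper this diff drops
//
//	before := testlib.Ptr("host2") // *string via the helper
//	after := new("host2")          // *string via new(expr)
//
// Both point at a distinct copy of "host2", so the fixtures and
// expected values compare identically after the swap.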
&nova.Aggregate{ Name: "az1", UUID: "az1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"type":"az"}`, }, &nova.Aggregate{ Name: "az1", UUID: "az1", - ComputeHost: testlib.Ptr("host2"), + ComputeHost: new("host2"), Metadata: `{"type":"az"}`, }, // Host 3 is part of an availability zone aggregate, but has no filter_tenant_id &nova.Aggregate{ Name: "az1", UUID: "az1", - ComputeHost: testlib.Ptr("host3"), + ComputeHost: new("host3"), Metadata: `{"type":"az"}`, }, @@ -316,25 +315,25 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { }, expected: []HostPinnedProjects{ { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host2"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host2"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { AggregateName: nil, AggregateUUID: nil, - ComputeHost: testlib.Ptr("host3"), + ComputeHost: new("host3"), ProjectID: nil, DomainID: nil, Label: nil, @@ -355,13 +354,13 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1, project_id_1"}`, }, &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1, project_id_1"}`, }, @@ -380,12 +379,12 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { }, expected: []HostPinnedProjects{ { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, }, }, @@ -403,13 +402,13 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1"}`, }, &nova.Aggregate{ Name: "agg2", UUID: "agg2", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1"}`, }, @@ -427,20 +426,20 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { }, expected: []HostPinnedProjects{ { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + 
ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg2"), - AggregateUUID: testlib.Ptr("agg2"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg2"), + AggregateUUID: new("agg2"), + ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, }, }, @@ -458,7 +457,7 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { &nova.Aggregate{ Name: "agg1", UUID: "agg1", - ComputeHost: testlib.Ptr("host1"), + ComputeHost: new("host1"), Metadata: `{"filter_tenant_id":"project_id_1, project_id_domain_unknown, project_id_unknown"}`, }, @@ -481,28 +480,28 @@ func TestHostPinnedProjectsExtractor_Extract(t *testing.T) { }, expected: []HostPinnedProjects{ { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_1"), - DomainID: testlib.Ptr("domain_id_1"), - Label: testlib.Ptr("project_name_1 (domain_name_1)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_1"), + DomainID: new("domain_id_1"), + Label: new("project_name_1 (domain_name_1)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_domain_unknown"), - DomainID: testlib.Ptr("domain_id_unknown"), - Label: testlib.Ptr("project_name_2 (unknown)"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_domain_unknown"), + DomainID: new("domain_id_unknown"), + Label: new("project_name_2 (unknown)"), }, { - AggregateName: testlib.Ptr("agg1"), - AggregateUUID: testlib.Ptr("agg1"), - ComputeHost: testlib.Ptr("host1"), - ProjectID: testlib.Ptr("project_id_unknown"), + AggregateName: new("agg1"), + AggregateUUID: new("agg1"), + ComputeHost: new("host1"), + ProjectID: new("project_id_unknown"), DomainID: nil, - Label: testlib.Ptr("unknown (unknown)"), + Label: new("unknown (unknown)"), }, }, }, diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql index ab3c7b8a7..56b20a980 100644 --- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql +++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql @@ -3,6 +3,6 @@ SELECT os.os_ext_srv_attr_host AS host, MAX(value) AS max_steal_time_pct FROM kvm_libvirt_domain_metrics kvm -JOIN openstack_servers_v2 os ON os.os_ext_srv_attr_instance_name = kvm.domain +JOIN openstack_servers_v3 os ON os.os_ext_srv_attr_instance_name = kvm.domain WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL GROUP BY os.os_ext_srv_attr_host, os.id; \ No newline at end of file diff --git a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql index c2b4b8846..190f2da19 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql @@ -21,7 +21,7 @@ WITH durations AS ( )) AS BIGINT) ) AS duration FROM 
openstack_migrations AS migrations - LEFT JOIN openstack_servers_v2 AS servers ON servers.id = migrations.instance_uuid + LEFT JOIN openstack_servers_v3 AS servers ON servers.id = migrations.instance_uuid LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name ) SELECT diff --git a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql index 1fad31536..38b8762ba 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql @@ -13,7 +13,7 @@ running_servers AS ( EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration, COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name, false::BOOLEAN AS deleted - FROM openstack_servers_v2 servers + FROM openstack_servers_v3 servers LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name WHERE servers.created IS NOT NULL ) diff --git a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql index 8ab0a2c70..21f3104fd 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql @@ -3,5 +3,5 @@ SELECT DISTINCT m.hostsystem AS vrops_hostsystem, s.os_ext_srv_attr_host AS nova_compute_host FROM vrops_vm_metrics m -LEFT JOIN openstack_servers_v2 s ON m.instance_uuid = s.id +LEFT JOIN openstack_servers_v3 s ON m.instance_uuid = s.id WHERE s.os_ext_srv_attr_host IS NOT NULL; diff --git a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql index 0b0067790..850cbbca1 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql @@ -19,7 +19,7 @@ host_cpu_usage AS ( s.tenant_id, h.service_host, AVG(p.avg_cpu) AS avg_cpu_of_project - FROM openstack_servers_v2 s + FROM openstack_servers_v3 s JOIN vrops_vm_metrics m ON s.id = m.instance_uuid JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname diff --git a/internal/knowledge/kpis/plugins/compute/flavor_running_vms.go b/internal/knowledge/kpis/plugins/compute/flavor_running_vms.go deleted file mode 100644 index 693370ed8..000000000 --- a/internal/knowledge/kpis/plugins/compute/flavor_running_vms.go +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "log/slog" - "regexp" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type FlavorRunningVMs struct { - FlavorName string `db:"flavor_name"` - AvailabilityZone string `db:"availability_zone"` - RunningVMs float64 `db:"running_vms"` - ProjectID string `db:"project_id"` - ProjectName string `db:"project_name"` -} - -// kvmFlavorPattern matches flavors where the second underscore-delimited -// 
segment is "k", e.g. "type_k_cXXX_mXXXX". -var kvmFlavorPattern = regexp.MustCompile(`^[^_]+_k_`) - -type FlavorRunningVMsKPI struct { - // Common base for all KPIs that provides standard functionality. - plugins.BaseKPI[struct{}] // No options passed through yaml config - flavorRunningVMs *prometheus.Desc -} - -func (FlavorRunningVMsKPI) GetName() string { - return "flavor_running_vms_kpi" -} - -func (k *FlavorRunningVMsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { - if err := k.BaseKPI.Init(db, client, opts); err != nil { - return err - } - k.flavorRunningVMs = prometheus.NewDesc( - "cortex_flavor_running_vms", - "Current amount of running virtual machines per flavor.", - []string{ - "flavor_name", - "availability_zone", - "project_id", - "project_name", - "hypervisor_family", - }, - nil, - ) - return nil -} - -func (k *FlavorRunningVMsKPI) Describe(ch chan<- *prometheus.Desc) { - ch <- k.flavorRunningVMs -} - -func (k *FlavorRunningVMsKPI) Collect(ch chan<- prometheus.Metric) { - var results []FlavorRunningVMs - - query := ` - SELECT - os.tenant_id AS project_id, - p.name AS project_name, - os.flavor_name, - COALESCE(os.os_ext_az_availability_zone, 'unknown') AS availability_zone, - COUNT(*) AS running_vms - FROM ` + nova.Server{}.TableName() + ` os - JOIN ` + identity.Project{}.TableName() + ` p ON p.id = os.tenant_id - WHERE - status != 'DELETED' - GROUP BY - os.tenant_id, - p.name, - os.flavor_name, - os.os_ext_az_availability_zone - ORDER BY - os.tenant_id; - ` - - if _, err := k.DB.Select(&results, query); err != nil { - slog.Error("failed to select running vms per flavor", "err", err) - return - } - for _, r := range results { - hypervisorFamily := "vmware" - if kvmFlavorPattern.MatchString(r.FlavorName) { - hypervisorFamily = "kvm" - } - ch <- prometheus.MustNewConstMetric( - k.flavorRunningVMs, - prometheus.GaugeValue, - r.RunningVMs, - r.FlavorName, - r.AvailabilityZone, - r.ProjectID, - r.ProjectName, - hypervisorFamily, - ) - } -} diff --git a/internal/knowledge/kpis/plugins/compute/flavor_running_vms_test.go b/internal/knowledge/kpis/plugins/compute/flavor_running_vms_test.go deleted file mode 100644 index f0de67642..000000000 --- a/internal/knowledge/kpis/plugins/compute/flavor_running_vms_test.go +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "reflect" - "testing" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" -) - -func TestFlavorRunningVMsKPI_Init(t *testing.T) { - dbEnv := testlibDB.SetupDBEnv(t) - testDB := db.DB{DbMap: dbEnv.DbMap} - defer dbEnv.Close() - kpi := &FlavorRunningVMsKPI{} - if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } -} - -func TestFlavorRunningVMsKPI_Collect(t *testing.T) { - dbEnv := testlibDB.SetupDBEnv(t) - testDB := db.DB{DbMap: dbEnv.DbMap} - defer dbEnv.Close() - if err := testDB.CreateTable( - testDB.AddTable(nova.Server{}), - testDB.AddTable(identity.Project{}), - ); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - mockData := []any{ 
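// Worked trace of kvmFlavorPattern against the mock flavors below (a
// sketch; pattern and fixtures are exactly as defined in this file):
// `^[^_]+_k_` requires one or more non-underscore characters at the
// start, then the literal "_k_", i.e. the second underscore-delimited
// segment must be exactly "k".
//
//	"medium_k_flavor"     -> "medium" + "_k_"           => match, kvm
//	"small_vmware_flavor" -> second segment is "vmware" => no match, vmware
//	"x_kvm_c12_m3456"     -> "_k" is followed by "m", not "_" => no match
//	"k_c12_m3456"         -> "k" is the first segment; `[^_]+` has
//	                         nothing to consume before it => no match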
- // VMware flavor (no "_k_" segment) - &nova.Server{ - ID: "id-1", - FlavorName: "small_vmware_flavor", - OSEXTAvailabilityZone: "zone1", - TenantID: "project-1", - }, - // KVM flavor ("_k_" as second segment) - &nova.Server{ - ID: "id-2", - FlavorName: "medium_k_flavor", - OSEXTAvailabilityZone: "zone1", - TenantID: "project-1", - }, - &nova.Server{ - ID: "id-3", - FlavorName: "medium_k_flavor", - OSEXTAvailabilityZone: "zone1", - TenantID: "project-1", - }, - // Another VMware flavor in a different zone and project - &nova.Server{ - ID: "id-4", - FlavorName: "large_vmware_flavor", - OSEXTAvailabilityZone: "zone2", - TenantID: "project-2", - }, - - &identity.Project{ - ID: "project-1", - Name: "Project One", - }, - &identity.Project{ - ID: "project-2", - Name: "Project Two", - }, - } - - if err := testDB.Insert(mockData...); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &FlavorRunningVMsKPI{} - if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type FlavorRunningVMsMetric struct { - FlavorName string - AvailabilityZone string - RunningVMs float64 - ProjectID string - ProjectName string - HypervisorFamily string - } - - metrics := make(map[string]FlavorRunningVMsMetric) - - for metric := range ch { - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - - flavor := labels["flavor_name"] - availabilityZone := labels["availability_zone"] - projectID := labels["project_id"] - projectName := labels["project_name"] - hypervisorFamily := labels["hypervisor_family"] - - key := flavor + "|" + availabilityZone + "|" + projectID - - metrics[key] = FlavorRunningVMsMetric{ - FlavorName: flavor, - AvailabilityZone: availabilityZone, - ProjectID: projectID, - ProjectName: projectName, - RunningVMs: m.GetGauge().GetValue(), - HypervisorFamily: hypervisorFamily, - } - } - - expectedMetrics := map[string]FlavorRunningVMsMetric{ - "small_vmware_flavor|zone1|project-1": { - FlavorName: "small_vmware_flavor", - AvailabilityZone: "zone1", - ProjectID: "project-1", - ProjectName: "Project One", - RunningVMs: 1, - HypervisorFamily: "vmware", - }, - "medium_k_flavor|zone1|project-1": { - FlavorName: "medium_k_flavor", - AvailabilityZone: "zone1", - ProjectID: "project-1", - ProjectName: "Project One", - RunningVMs: 2, - HypervisorFamily: "kvm", - }, - "large_vmware_flavor|zone2|project-2": { - FlavorName: "large_vmware_flavor", - AvailabilityZone: "zone2", - ProjectID: "project-2", - ProjectName: "Project Two", - RunningVMs: 1, - HypervisorFamily: "vmware", - }, - } - - for key, expected := range expectedMetrics { - actual, ok := metrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - if !reflect.DeepEqual(expected, actual) { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} - -func TestKVMFlavorPattern(t *testing.T) { - tests := []struct { - flavor string - isKVM bool - }{ - {"x_k_c89_m7890_v2", true}, - {"x_k_c89_m7890", true}, - {"x_v_c12_m3456", false}, - {"x_kvm_c12_m3456", false}, // "kvm" != "k" - {"k_c12_m3456", false}, // "k" must be second segment - {"", false}, - } - for _, tc := range tests { - got := kvmFlavorPattern.MatchString(tc.flavor) - if got != tc.isKVM { - 
t.Errorf("kvmFlavorPattern.MatchString(%q) = %v, want %v", tc.flavor, got, tc.isKVM) - } - } -} diff --git a/internal/knowledge/kpis/plugins/compute/host_running_vms.go b/internal/knowledge/kpis/plugins/compute/host_running_vms.go deleted file mode 100644 index 547bf332b..000000000 --- a/internal/knowledge/kpis/plugins/compute/host_running_vms.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "context" - "log/slog" - "strconv" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" -) - -type HostRunningVMs struct { - ComputeHostName string `db:"compute_host"` - AvailabilityZone string `db:"availability_zone"` - CPUArchitecture string `db:"cpu_architecture"` - HypervisorFamily string `db:"hypervisor_family"` - WorkloadType string `db:"workload_type"` - Enabled bool `db:"enabled"` - Decommissioned bool `db:"decommissioned"` - ExternalCustomer bool `db:"external_customer"` - PinnedProjects string `db:"pinned_projects"` - RunningVMs float64 `db:"running_vms"` - compute.HostUtilization -} - -type HostRunningVMsKPI struct { - // Common base for all KPIs that provides standard functionality. - plugins.BaseKPI[struct{}] // No options passed through yaml config - - hostRunningVMsPerHost *prometheus.Desc -} - -func (HostRunningVMsKPI) GetName() string { - return "host_running_vms_kpi" -} - -func (k *HostRunningVMsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { - if err := k.BaseKPI.Init(db, client, opts); err != nil { - return err - } - k.hostRunningVMsPerHost = prometheus.NewDesc( - "cortex_running_vms_per_host", - "Current amount of running virtual machines on a host.", - []string{ - "compute_host", - "availability_zone", - "cpu_architecture", - "workload_type", - "hypervisor_family", - "enabled", - "decommissioned", - "external_customer", - "pinned_projects", - }, - nil, - ) - return nil -} - -func (k *HostRunningVMsKPI) Describe(ch chan<- *prometheus.Desc) { - ch <- k.hostRunningVMsPerHost -} - -func (k *HostRunningVMsKPI) Collect(ch chan<- prometheus.Metric) { - hostDetailsKnowledge := &v1alpha1.Knowledge{} - if err := k.Client.Get( - context.Background(), - client.ObjectKey{Name: "host-details"}, - hostDetailsKnowledge, - ); err != nil { - slog.Error("failed to get knowledge host-details", "err", err) - return - } - hostDetails, err := v1alpha1. - UnboxFeatureList[compute.HostDetails](hostDetailsKnowledge.Status.Raw) - if err != nil { - slog.Error("failed to unbox storage pool cpu usage", "err", err) - return - } - detailsByComputeHost := make(map[string]compute.HostDetails) - for _, detail := range hostDetails { - detailsByComputeHost[detail.ComputeHost] = detail - } - - hostUtilizationKnowledge := &v1alpha1.Knowledge{} - if err := k.Client.Get( - context.Background(), - client.ObjectKey{Name: "host-utilization"}, - hostUtilizationKnowledge, - ); err != nil { - slog.Error("failed to get knowledge host-utilization", "err", err) - return - } - hostUtilizations, err := v1alpha1. 
- UnboxFeatureList[compute.HostUtilization](hostUtilizationKnowledge.Status.Raw) - if err != nil { - slog.Error("failed to unbox host utilization", "err", err) - return - } - - for _, utilization := range hostUtilizations { - detail, exists := detailsByComputeHost[utilization.ComputeHost] - if !exists { - slog.Warn("host_running_vms: no host details for compute host", "compute_host", utilization.ComputeHost) - continue - } - if utilization.TotalDiskAllocatableGB == 0 || - utilization.TotalRAMAllocatableMB == 0 || - utilization.TotalVCPUsAllocatable == 0 { - slog.Info( - "Skipping host since placement is reporting zero allocatable resources", - "metric", "cortex_running_vms_per_host", - "host", utilization.ComputeHost, - "cpu", utilization.TotalVCPUsAllocatable, - "ram", utilization.TotalRAMAllocatableMB, - "disk", utilization.TotalDiskAllocatableGB, - ) - continue - } - if detail.HypervisorType == "ironic" { - continue // Ignore ironic hosts - } - enabled := strconv.FormatBool(detail.Enabled) - decommissioned := strconv.FormatBool(detail.Decommissioned) - externalCustomer := strconv.FormatBool(detail.ExternalCustomer) - pinnedProjects := "" - if detail.PinnedProjects != nil { - pinnedProjects = *detail.PinnedProjects - } - - ch <- prometheus.MustNewConstMetric( - k.hostRunningVMsPerHost, - prometheus.GaugeValue, - float64(detail.RunningVMs), - utilization.ComputeHost, - detail.AvailabilityZone, - detail.CPUArchitecture, - detail.WorkloadType, - detail.HypervisorFamily, - enabled, - decommissioned, - externalCustomer, - pinnedProjects, - ) - } -} diff --git a/internal/knowledge/kpis/plugins/compute/host_running_vms_test.go b/internal/knowledge/kpis/plugins/compute/host_running_vms_test.go deleted file mode 100644 index b02aaa91e..000000000 --- a/internal/knowledge/kpis/plugins/compute/host_running_vms_test.go +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "reflect" - "testing" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "github.com/cobaltcore-dev/cortex/pkg/conf" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestHostRunningVMsKPI_Init(t *testing.T) { - kpi := &HostRunningVMsKPI{} - if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } -} - -func TestHostRunningVMsKPI_Collect(t *testing.T) { - scheme, err := v1alpha1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hypervisors, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostDetails{ - ComputeHost: "host1", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "vcenter", - HypervisorFamily: "vmware", - RunningVMs: 5, - WorkloadType: "general-purpose", - Enabled: true, - Decommissioned: true, - ExternalCustomer: true, - PinnedProjects: testlib.Ptr("project-123,project-456"), - }, - // Should be ignored since its an ironic host - &compute.HostDetails{ - ComputeHost: "host2", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "ironic", - HypervisorFamily: "vmware", - RunningVMs: 5, - WorkloadType: "general-purpose", - Enabled: true, - }, - // Should be ignored since it has no usage data - 
&compute.HostDetails{ - ComputeHost: "host3", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "ironic", - HypervisorFamily: "vmware", - RunningVMs: 5, - WorkloadType: "general-purpose", - Enabled: true, - }, - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostUtilizations, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostUtilization{ - ComputeHost: "host1", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 200, - TotalDiskAllocatableGB: 300, - }, - // Ironic host - &compute.HostUtilization{ - ComputeHost: "host2", - TotalVCPUsAllocatable: 1, - TotalRAMAllocatableMB: 1, - TotalDiskAllocatableGB: 1, - }, - // No Capacity reported for host3 - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &HostRunningVMsKPI{} - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(&v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-details"}, - Status: v1alpha1.KnowledgeStatus{Raw: hypervisors}, - }, &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-utilization"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations}, - }). - Build() - if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type HostRunningVMsMetric struct { - ComputeHost string - AvailabilityZone string - Enabled string - Decommissioned string - ExternalCustomer string - CPUArchitecture string - WorkloadType string - HypervisorFamily string - PinnedProjects string - Value float64 - } - - actualMetrics := make(map[string]HostRunningVMsMetric, 0) - - for metric := range ch { - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - - key := labels["compute_host"] - - actualMetrics[key] = HostRunningVMsMetric{ - ComputeHost: labels["compute_host"], - AvailabilityZone: labels["availability_zone"], - Enabled: labels["enabled"], - Decommissioned: labels["decommissioned"], - ExternalCustomer: labels["external_customer"], - CPUArchitecture: labels["cpu_architecture"], - WorkloadType: labels["workload_type"], - HypervisorFamily: labels["hypervisor_family"], - PinnedProjects: labels["pinned_projects"], - Value: m.GetGauge().GetValue(), - } - } - - expectedMetrics := map[string]HostRunningVMsMetric{ - "host1": { - ComputeHost: "host1", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - HypervisorFamily: "vmware", - Value: 5, - PinnedProjects: "project-123,project-456", - }, - } - - if len(expectedMetrics) != len(actualMetrics) { - t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics)) - } - - for key, expected := range expectedMetrics { - actual, ok := actualMetrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - - if !reflect.DeepEqual(expected, actual) { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go deleted file mode 100644 index c233cfd4c..000000000 --- 
a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ /dev/null @@ -1,1532 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "regexp" - "testing" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/pkg/conf" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" - "k8s.io/apimachinery/pkg/api/resource" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestKVMResourceCapacityKPI_Init(t *testing.T) { - kpi := &KVMResourceCapacityKPI{} - if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } -} - -type kvmMetricLabels struct { - ComputeHost string - Resource string - Type string - AvailabilityZone string - BuildingBlock string - CPUArchitecture string - WorkloadType string - Enabled string - Decommissioned string - ExternalCustomer string - Maintenance string -} - -var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`) - -func getMetricName(desc string) string { - match := fqNameRe.FindStringSubmatch(desc) - if len(match) > 1 { - return match[1] - } - return "" -} - -type kvmExpectedMetric struct { - Name string // metric family name (e.g. "cortex_kvm_host_capacity_total") - Labels kvmMetricLabels - Value float64 -} - -func defaultHostLabels(host, az, bb string) kvmMetricLabels { - return kvmMetricLabels{ - ComputeHost: host, - AvailabilityZone: az, - BuildingBlock: bb, - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - } -} - -func totalMetric(host, res, az, bb string, value float64) kvmExpectedMetric { - l := defaultHostLabels(host, az, bb) - l.Resource = res - return kvmExpectedMetric{Name: "cortex_kvm_host_capacity_total", Labels: l, Value: value} -} - -func usageMetric(host, res, capacityType, az, bb string, value float64) kvmExpectedMetric { - l := defaultHostLabels(host, az, bb) - l.Resource = res - l.Type = capacityType - return kvmExpectedMetric{Name: "cortex_kvm_host_capacity_usage", Labels: l, Value: value} -} - -func TestKVMResourceCapacityKPI_Collect(t *testing.T) { - tests := []struct { - name string - hypervisors []hv1.Hypervisor - reservations []v1alpha1.Reservation - expectedMetrics []kvmExpectedMetric - }{ - { - name: "single hypervisor with nil effective capacity", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: nil, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{}, - }, - { - name: "single hypervisor with zero total capacity", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), - hv1.ResourceMemory: resource.MustParse("0"), - }, - Allocation: 
map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), - hv1.ResourceMemory: resource.MustParse("0"), - }, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{}, - }, - { - name: "nil effective capacity falls back to physical capacity", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: nil, - Capacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi - }, - }, - { - name: "zero effective capacity falls back to physical capacity", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), - hv1.ResourceMemory: resource.MustParse("0"), - }, - Capacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi - }, - }, - { - name: "zero effective capacity with nil physical capacity skips", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: 
hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), - hv1.ResourceMemory: resource.MustParse("0"), - }, - Capacity: nil, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{}, - }, - { - name: "zero effective capacity with zero physical capacity skips", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), - hv1.ResourceMemory: resource.MustParse("0"), - }, - Capacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), - hv1.ResourceMemory: resource.MustParse("0"), - }, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{}, - }, - { - name: "single hypervisor with default traits, no reservations", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi - }, - }, - { - name: "hypervisor with sapphire rapids and hana traits", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node002-bb089", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1b", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("256"), - hv1.ResourceMemory: resource.MustParse("1Ti"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Traits: []string{ - "CUSTOM_HW_SAPPHIRE_RAPIDS", - "CUSTOM_HANA_EXCLUSIVE_HOST", - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - { - Name: "cortex_kvm_host_capacity_total", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "cpu", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 256, - }, - { - Name: "cortex_kvm_host_capacity_total", - Labels: 
kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "ram", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 1099511627776, // 1Ti - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "cpu", - Type: "utilized", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 128, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "ram", - Type: "utilized", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 549755813888, // 512Gi - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "cpu", - Type: "reserved", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "ram", - Type: "reserved", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "cpu", - Type: "failover", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "ram", - Type: "failover", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "cpu", - Type: "payg", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 128, // 256-128-0-0 - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node002-bb089", - Resource: "ram", - Type: "payg", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 549755813888, // 1Ti-512Gi - }, - }, - }, - { - name: "hypervisor with decommissioned and external customer traits", - hypervisors: []hv1.Hypervisor{ 
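// The usage buckets asserted across these cases follow one balance per
// resource — a sketch inferred from the expected values and their
// inline arithmetic (e.g. "128-64-0-0"), not a statement of the
// collector's exact implementation:
//
//	total    = EffectiveCapacity, falling back to Capacity when it is
//	           nil or zero; hosts with neither are skipped entirely
//	utilized = Allocation (0 when Allocation is nil)
//	failover = sum of Ready failover reservations placed on the host
//	reserved = Ready committed-resource reservations minus the share
//	           already allocated to VMs (e.g. 32-8 = 24 CPU below)
//	payg     = total - utilized - reserved - failover
//
// RAM values are plain bytes: 512Gi = 512 * 2^30 = 549755813888, so a
// host with 512Gi total and 256Gi utilized reports payg
// 549755813888 - 274877906944 = 274877906944 (256Gi).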
- { - ObjectMeta: v1.ObjectMeta{ - Name: "node003-bb090", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1c", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("32"), - hv1.ResourceMemory: resource.MustParse("128Gi"), - }, - Traits: []string{ - "CUSTOM_DECOMMISSIONING", - "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE", - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - { - Name: "cortex_kvm_host_capacity_total", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "cpu", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 64, - }, - { - Name: "cortex_kvm_host_capacity_total", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "ram", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 274877906944, // 256Gi - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "cpu", - Type: "utilized", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 32, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "ram", - Type: "utilized", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 137438953472, // 128Gi - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "cpu", - Type: "reserved", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "ram", - Type: "reserved", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "cpu", - Type: "failover", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "ram", - Type: "failover", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - 
CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "cpu", - Type: "payg", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 32, // 64-32-0-0 - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node003-bb090", - Resource: "ram", - Type: "payg", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 137438953472, // 256Gi-128Gi - }, - }, - }, - { - name: "multiple hypervisors", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node010-bb100", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("100"), - hv1.ResourceMemory: resource.MustParse("200Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("50"), - hv1.ResourceMemory: resource.MustParse("100Gi"), - }, - Traits: []string{}, - }, - }, - { - ObjectMeta: v1.ObjectMeta{ - Name: "node020-bb200", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1b", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("200"), - hv1.ResourceMemory: resource.MustParse("400Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("150"), - hv1.ResourceMemory: resource.MustParse("300Gi"), - }, - Traits: []string{"CUSTOM_HW_SAPPHIRE_RAPIDS"}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node010-bb100", "cpu", "qa-1a", "bb100", 100), - totalMetric("node010-bb100", "ram", "qa-1a", "bb100", 214748364800), // 200Gi - usageMetric("node010-bb100", "cpu", "utilized", "qa-1a", "bb100", 50), - usageMetric("node010-bb100", "ram", "utilized", "qa-1a", "bb100", 107374182400), // 100Gi - usageMetric("node010-bb100", "cpu", "reserved", "qa-1a", "bb100", 0), - usageMetric("node010-bb100", "ram", "reserved", "qa-1a", "bb100", 0), - usageMetric("node010-bb100", "cpu", "failover", "qa-1a", "bb100", 0), - usageMetric("node010-bb100", "ram", "failover", "qa-1a", "bb100", 0), - usageMetric("node010-bb100", "cpu", "payg", "qa-1a", "bb100", 50), // 100-50-0-0 - usageMetric("node010-bb100", "ram", "payg", "qa-1a", "bb100", 107374182400), // 200Gi-100Gi - { - Name: "cortex_kvm_host_capacity_total", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "cpu", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 200, - }, - { - Name: "cortex_kvm_host_capacity_total", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "ram", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - 
WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 429496729600, // 400Gi - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "cpu", - Type: "utilized", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 150, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "ram", - Type: "utilized", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 322122547200, // 300Gi - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "cpu", - Type: "reserved", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "ram", - Type: "reserved", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "cpu", - Type: "failover", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "ram", - Type: "failover", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "cpu", - Type: "payg", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 50, // 200-150-0-0 - }, - { - Name: "cortex_kvm_host_capacity_usage", - Labels: kvmMetricLabels{ - ComputeHost: "node020-bb200", - Resource: "ram", - Type: "payg", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 107374182400, // 400Gi-300Gi - }, - }, - }, - { - name: "hypervisor with missing allocation data", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node004-bb091", - Labels: 
map[string]string{ - "topology.kubernetes.io/zone": "qa-1d", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("96"), - hv1.ResourceMemory: resource.MustParse("384Gi"), - }, - Allocation: nil, - Traits: []string{}, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node004-bb091", "cpu", "qa-1d", "bb091", 96), - totalMetric("node004-bb091", "ram", "qa-1d", "bb091", 412316860416), // 384Gi - usageMetric("node004-bb091", "cpu", "utilized", "qa-1d", "bb091", 0), - usageMetric("node004-bb091", "ram", "utilized", "qa-1d", "bb091", 0), - usageMetric("node004-bb091", "cpu", "reserved", "qa-1d", "bb091", 0), - usageMetric("node004-bb091", "ram", "reserved", "qa-1d", "bb091", 0), - usageMetric("node004-bb091", "cpu", "failover", "qa-1d", "bb091", 0), - usageMetric("node004-bb091", "ram", "failover", "qa-1d", "bb091", 0), - usageMetric("node004-bb091", "cpu", "payg", "qa-1d", "bb091", 96), // 96-0-0-0 - usageMetric("node004-bb091", "ram", "payg", "qa-1d", "bb091", 412316860416), // 384Gi-0 - }, - }, - { - name: "failover reservation on a hypervisor", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - reservations: []v1alpha1.Reservation{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "failover-1", - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeFailover, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("16"), - hv1.ResourceMemory: resource.MustParse("64Gi"), - }, - FailoverReservation: &v1alpha1.FailoverReservationSpec{}, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 16), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 68719476736), // 64Gi - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 48), // 128-64-0-16 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 206158430208), // 512Gi-256Gi-0-64Gi = 192Gi - }, - }, - { - name: "committed resource reservation with partial allocation", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - 
hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - reservations: []v1alpha1.Reservation{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "committed-1", - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("32"), - hv1.ResourceMemory: resource.MustParse("128Gi"), - }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - Allocations: map[string]v1alpha1.CommittedResourceAllocation{ - "vm-uuid-1": { - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("8"), - hv1.ResourceMemory: resource.MustParse("32Gi"), - }, - }, - }, - }, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - // reserved = 32-8=24 CPU, 128Gi-32Gi=96Gi RAM (not in use) - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 24), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 103079215104), // 96Gi - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 40), // 128-64-24-0 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 171798691840), // 512Gi-256Gi-96Gi-0 = 160Gi - }, - }, - { - name: "non-ready reservation should be ignored", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - reservations: []v1alpha1.Reservation{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "failover-not-ready", - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeFailover, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("16"), - hv1.ResourceMemory: resource.MustParse("64Gi"), - }, - FailoverReservation: &v1alpha1.FailoverReservationSpec{}, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionFalse}, - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - 
usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), - // Non-ready reservation ignored, so failover = 0 - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi - }, - }, - { - name: "multiple failover reservations on same host are summed", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{ - "topology.kubernetes.io/zone": "qa-1a", - }, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("128"), - hv1.ResourceMemory: resource.MustParse("512Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("64"), - hv1.ResourceMemory: resource.MustParse("256Gi"), - }, - Traits: []string{}, - }, - }, - }, - reservations: []v1alpha1.Reservation{ - { - ObjectMeta: v1.ObjectMeta{ - Name: "failover-1", - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeFailover, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("8"), - hv1.ResourceMemory: resource.MustParse("32Gi"), - }, - FailoverReservation: &v1alpha1.FailoverReservationSpec{}, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, - }, - }, - }, - { - ObjectMeta: v1.ObjectMeta{ - Name: "failover-2", - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeFailover, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("12"), - hv1.ResourceMemory: resource.MustParse("48Gi"), - }, - FailoverReservation: &v1alpha1.FailoverReservationSpec{}, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), - // failover = 8+12=20 CPU, 32Gi+48Gi=80Gi RAM - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 20), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 85899345920), // 80Gi - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 44), // 128-64-0-20 - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 188978561024), // 512Gi-256Gi-0-80Gi = 176Gi - }, - }, - { - name: "payg capacity clamped to zero when overcommitted", - hypervisors: []hv1.Hypervisor{ - { - ObjectMeta: 
v1.ObjectMeta{ - Name: "node001-bb088", - Labels: map[string]string{"topology.kubernetes.io/zone": "qa-1a"}, - }, - Status: hv1.HypervisorStatus{ - EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("100"), - hv1.ResourceMemory: resource.MustParse("200Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("80"), - hv1.ResourceMemory: resource.MustParse("150Gi"), - }, - Traits: []string{}, - }, - }, - }, - // failover=20 CPU/40Gi RAM, committed reserved=20 CPU/40Gi RAM (no allocations) - // CPU: 100 - 80 - 20 - 20 = -20 → clamped to 0 - // RAM: 200Gi - 150Gi - 40Gi - 40Gi = -30Gi → clamped to 0 - reservations: []v1alpha1.Reservation{ - { - ObjectMeta: v1.ObjectMeta{Name: "failover-1"}, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeFailover, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("20"), - hv1.ResourceMemory: resource.MustParse("40Gi"), - }, - FailoverReservation: &v1alpha1.FailoverReservationSpec{}, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, - }, - }, - }, - { - ObjectMeta: v1.ObjectMeta{Name: "committed-1"}, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("20"), - hv1.ResourceMemory: resource.MustParse("40Gi"), - }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{}, - }, - Status: v1alpha1.ReservationStatus{ - Host: "node001-bb088", - Conditions: []v1.Condition{ - {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, - }, - }, - }, - }, - expectedMetrics: []kvmExpectedMetric{ - totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 100), - totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 214748364800), // 200Gi - usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 80), - usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 161061273600), // 150Gi - usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 20), - usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 42949672960), // 40Gi - usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 20), - usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 42949672960), // 40Gi - usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 0), - usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 0), - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - scheme := runtime.NewScheme() - if err := hv1.AddToScheme(scheme); err != nil { - t.Fatalf("failed to add hypervisor scheme: %v", err) - } - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("failed to add v1alpha1 scheme: %v", err) - } - - objects := make([]runtime.Object, 0, len(tt.hypervisors)+len(tt.reservations)) - for i := range tt.hypervisors { - objects = append(objects, &tt.hypervisors[i]) - } - for i := range tt.reservations { - objects = append(objects, &tt.reservations[i]) - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(objects...). 
- Build() - - kpi := &KVMResourceCapacityKPI{} - if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("failed to init KPI: %v", err) - } - - ch := make(chan prometheus.Metric, 1000) - kpi.Collect(ch) - close(ch) - - var actualMetrics []kvmExpectedMetric - for metric := range ch { - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - metricName := getMetricName(metric.Desc().String()) - - labels := kvmMetricLabels{} - for _, label := range m.Label { - switch label.GetName() { - case "compute_host": - labels.ComputeHost = label.GetValue() - case "resource": - labels.Resource = label.GetValue() - case "type": - labels.Type = label.GetValue() - case "availability_zone": - labels.AvailabilityZone = label.GetValue() - case "building_block": - labels.BuildingBlock = label.GetValue() - case "cpu_architecture": - labels.CPUArchitecture = label.GetValue() - case "workload_type": - labels.WorkloadType = label.GetValue() - case "enabled": - labels.Enabled = label.GetValue() - case "decommissioned": - labels.Decommissioned = label.GetValue() - case "external_customer": - labels.ExternalCustomer = label.GetValue() - case "maintenance": - labels.Maintenance = label.GetValue() - } - } - - actualMetrics = append(actualMetrics, kvmExpectedMetric{ - Name: metricName, - Labels: labels, - Value: m.GetGauge().GetValue(), - }) - } - - // Verify exact equality: same number of metrics and each expected metric is present. - if len(actualMetrics) != len(tt.expectedMetrics) { - t.Errorf("metric count mismatch: expected %d, got %d\nactual: %+v", - len(tt.expectedMetrics), len(actualMetrics), actualMetrics) - } - for _, expected := range tt.expectedMetrics { - found := false - for _, actual := range actualMetrics { - nameMatch := expected.Name == "" || actual.Name == expected.Name - if nameMatch && actual.Labels == expected.Labels { - found = true - if actual.Value != expected.Value { - t.Errorf("metric %s with labels %+v: expected value %f, got %f", - expected.Name, expected.Labels, expected.Value, actual.Value) - } - break - } - } - if !found { - t.Errorf("metric %s with labels %+v not found in actual metrics", - expected.Name, expected.Labels) - } - } - for _, actual := range actualMetrics { - found := false - for _, expected := range tt.expectedMetrics { - nameMatch := expected.Name == "" || actual.Name == expected.Name - if nameMatch && actual.Labels == expected.Labels { - found = true - break - } - } - if !found { - t.Errorf("unexpected metric %s with labels %+v (value %f) in actual metrics", - actual.Name, actual.Labels, actual.Value) - } - } - }) - } -} - -func TestAggregateReservationsByHost(t *testing.T) { - tests := []struct { - name string - reservations []v1alpha1.Reservation - expectedFailover map[string]hostReservationResources - expectedCommittedNotInUse map[string]hostReservationResources - }{ - { - name: "empty reservations", - reservations: nil, - expectedFailover: map[string]hostReservationResources{}, - expectedCommittedNotInUse: map[string]hostReservationResources{}, - }, - { - name: "reservation with no ready condition is skipped", - reservations: []v1alpha1.Reservation{ - { - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeFailover, - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("10"), - }, - }, - Status: v1alpha1.ReservationStatus{ - Host: "host-1", - // No conditions - }, - }, - }, - 
-      expectedFailover: map[string]hostReservationResources{},
-      expectedCommittedNotInUse: map[string]hostReservationResources{},
-    },
-    {
-      name: "reservation with empty host is skipped",
-      reservations: []v1alpha1.Reservation{
-        {
-          Spec: v1alpha1.ReservationSpec{
-            Type: v1alpha1.ReservationTypeFailover,
-            SchedulingDomain: v1alpha1.SchedulingDomainNova,
-            Resources: map[hv1.ResourceName]resource.Quantity{
-              hv1.ResourceCPU: resource.MustParse("10"),
-            },
-          },
-          Status: v1alpha1.ReservationStatus{
-            Host: "",
-            Conditions: []v1.Condition{
-              {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue},
-            },
-          },
-        },
-      },
-      expectedFailover: map[string]hostReservationResources{},
-      expectedCommittedNotInUse: map[string]hostReservationResources{},
-    },
-    {
-      name: "committed resource with nil spec does not panic",
-      reservations: []v1alpha1.Reservation{
-        {
-          Spec: v1alpha1.ReservationSpec{
-            Type: v1alpha1.ReservationTypeCommittedResource,
-            SchedulingDomain: v1alpha1.SchedulingDomainNova,
-            Resources: map[hv1.ResourceName]resource.Quantity{
-              hv1.ResourceCPU: resource.MustParse("16"),
-              hv1.ResourceMemory: resource.MustParse("64Gi"),
-            },
-            CommittedResourceReservation: nil,
-          },
-          Status: v1alpha1.ReservationStatus{
-            Host: "host-1",
-            Conditions: []v1.Condition{
-              {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue},
-            },
-          },
-        },
-      },
-      expectedFailover: map[string]hostReservationResources{},
-      expectedCommittedNotInUse: map[string]hostReservationResources{
-        "host-1": {
-          cpu: resource.MustParse("16"),
-          memory: resource.MustParse("64Gi"),
-        },
-      },
-    },
-  }
-
-  for _, tt := range tests {
-    t.Run(tt.name, func(t *testing.T) {
-      failover, committed := aggregateReservationsByHost(tt.reservations)
-
-      if len(failover) != len(tt.expectedFailover) {
-        t.Errorf("failover map length: expected %d, got %d", len(tt.expectedFailover), len(failover))
-      }
-      for host, expected := range tt.expectedFailover {
-        actual, ok := failover[host]
-        if !ok {
-          t.Errorf("failover: host %q not found", host)
-          continue
-        }
-        if actual.cpu.Cmp(expected.cpu) != 0 {
-          t.Errorf("failover[%s].cpu: expected %s, got %s", host, expected.cpu.String(), actual.cpu.String())
-        }
-        if actual.memory.Cmp(expected.memory) != 0 {
-          t.Errorf("failover[%s].memory: expected %s, got %s", host, expected.memory.String(), actual.memory.String())
-        }
-      }
-
-      if len(committed) != len(tt.expectedCommittedNotInUse) {
-        t.Errorf("committed map length: expected %d, got %d", len(tt.expectedCommittedNotInUse), len(committed))
-      }
-      for host, expected := range tt.expectedCommittedNotInUse {
-        actual, ok := committed[host]
-        if !ok {
-          t.Errorf("committed: host %q not found", host)
-          continue
-        }
-        if actual.cpu.Cmp(expected.cpu) != 0 {
-          t.Errorf("committed[%s].cpu: expected %s, got %s", host, expected.cpu.String(), actual.cpu.String())
-        }
-        if actual.memory.Cmp(expected.memory) != 0 {
-          t.Errorf("committed[%s].memory: expected %s, got %s", host, expected.memory.String(), actual.memory.String())
-        }
-      }
-    })
-  }
-}
diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
index a5b248b55..fd8d7f5d2 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
@@ -11,7 +11,6 @@ import (
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
 	testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
 	"github.com/cobaltcore-dev/cortex/pkg/conf"
-	testlib "github.com/cobaltcore-dev/cortex/pkg/testing"
 	"github.com/prometheus/client_golang/prometheus"
 	prometheusgo "github.com/prometheus/client_model/go"
 )
@@ -130,8 +129,8 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
 				Status: "ERROR",
 				FlavorName: "qemu-small",
 				OSEXTAvailabilityZone: "az1",
-				FaultCode: testlib.Ptr(uint(500)),
-				FaultMessage: testlib.Ptr("Internal error"),
+				FaultCode: new(uint(500)),
+				FaultMessage: new("Internal error"),
 			},
 			// Another faulty server in different AZ
 			&nova.Server{
@@ -140,8 +139,8 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
 				Status: "ERROR",
 				FlavorName: "vmware-medium",
 				OSEXTAvailabilityZone: "az2",
-				FaultCode: testlib.Ptr(uint(400)),
-				FaultMessage: testlib.Ptr("Bad request"),
+				FaultCode: new(uint(400)),
+				FaultMessage: new("Bad request"),
 			},
 			// Server with only fault message (no code)
 			&nova.Server{
@@ -150,7 +149,7 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
 				Status: "BUILD",
 				FlavorName: "generic-large",
 				OSEXTAvailabilityZone: "az1",
-				FaultMessage: testlib.Ptr("Some warning"),
+				FaultMessage: new("Some warning"),
 			},
 			// Server with flavor that doesn't exist (should be skipped)
 			&nova.Server{
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_host_capacity.go
similarity index 52%
rename from internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
rename to internal/knowledge/kpis/plugins/infrastructure/kvm_host_capacity.go
index 4a4040fd5..399648403 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
+++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_host_capacity.go
@@ -1,13 +1,11 @@
 // Copyright SAP SE
 // SPDX-License-Identifier: Apache-2.0
 
-package compute
+package infrastructure
 
 import (
 	"context"
 	"log/slog"
 
-	"strconv"
-	"strings"
-
 	"k8s.io/apimachinery/pkg/api/resource"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -21,141 +19,43 @@ import (
 	hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
 )
 
-type kvmHost struct {
-	hv1.Hypervisor
-}
-
-// getResourceCapacity attempts to retrieve the effective capacity for the specified resource from the hypervisor status, falling back to the physical capacity if effective capacity is not available. It returns the capacity quantity and a boolean indicating whether any capacity information was found.
-func (k kvmHost) getResourceCapacity(resourceName hv1.ResourceName) (capacity resource.Quantity, ok bool) {
-	if k.Status.EffectiveCapacity != nil {
-		qty, exists := k.Status.EffectiveCapacity[resourceName]
-		if exists && !qty.IsZero() {
-			return qty, true
-		}
-	}
-	if k.Status.Capacity == nil {
-		return resource.Quantity{}, false
-	}
-	qty, exists := k.Status.Capacity[resourceName]
-	if !exists || qty.IsZero() {
-		return resource.Quantity{}, false
-	}
-	return qty, true
-}
-
-func (k kvmHost) getResourceAllocation(resourceName hv1.ResourceName) (allocation resource.Quantity) {
-	if k.Status.Allocation == nil {
-		return resource.MustParse("0")
-	}
-
-	qty, exists := k.Status.Allocation[resourceName]
-	if !exists {
-		return resource.MustParse("0")
-	}
-	return qty
-}
-
-func (k kvmHost) getLabels() kvmHostLabels {
-	decommissioned := false
-	externalCustomer := false
-	workloadType := "general-purpose"
-	cpuArchitecture := "cascade-lake"
-
-	for _, trait := range k.Status.Traits {
-		switch trait {
-		case "CUSTOM_HW_SAPPHIRE_RAPIDS":
-			cpuArchitecture = "sapphire-rapids"
-		case "CUSTOM_HANA_EXCLUSIVE_HOST":
-			workloadType = "hana"
-		case "CUSTOM_DECOMMISSIONING":
-			decommissioned = true
-		case "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE":
-			externalCustomer = true
-		}
-	}
-
-	return kvmHostLabels{
-		computeHost: k.Name,
-		availabilityZone: k.Labels["topology.kubernetes.io/zone"],
-		buildingBlock: getBuildingBlock(k.Name),
-		cpuArchitecture: cpuArchitecture,
-		workloadType: workloadType,
-		enabled: strconv.FormatBool(true),
-		decommissioned: strconv.FormatBool(decommissioned),
-		externalCustomer: strconv.FormatBool(externalCustomer),
-		maintenance: strconv.FormatBool(false),
-	}
-}
-
-// Assuming hypervisor names are in the format nodeXXX-bbYY
-func getBuildingBlock(hostName string) string {
-	parts := strings.Split(hostName, "-")
-	if len(parts) > 1 {
-		return parts[1]
-	}
-	return "unknown"
-}
-
 // hostReservationResources holds aggregated CPU and memory reservation quantities for a single hypervisor.
 type hostReservationResources struct {
 	cpu    resource.Quantity
 	memory resource.Quantity
 }
 
-type KVMResourceCapacityKPI struct {
+type KVMHostCapacityKPI struct {
 	// Common base for all KPIs that provides standard functionality.
 	plugins.BaseKPI[struct{}] // No options passed through yaml config
 
 	totalCapacityPerHost *prometheus.Desc
 	capacityPerHost      *prometheus.Desc
 }
 
-func (KVMResourceCapacityKPI) GetName() string {
+func (KVMHostCapacityKPI) GetName() string {
 	return "kvm_host_capacity_kpi"
 }
 
-func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
+func (k *KVMHostCapacityKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
 	if err := k.BaseKPI.Init(db, client, opts); err != nil {
 		return err
 	}
 	k.totalCapacityPerHost = prometheus.NewDesc(
 		"cortex_kvm_host_capacity_total",
-		"Total resource capacity on the KVM hosts (individually by host).",
-		[]string{
-			"compute_host",
-			"resource",
-			"availability_zone",
-			"building_block",
-			"cpu_architecture",
-			"workload_type",
-			"enabled",
-			"decommissioned",
-			"external_customer",
-			"maintenance",
-		},
+		"Total resource capacity on the KVM hosts (individually by host). CPU in vCPUs, memory in bytes.",
+		append(kvmHostLabels, "resource"),
 		nil,
 	)
 	k.capacityPerHost = prometheus.NewDesc(
 		"cortex_kvm_host_capacity_usage",
-		"Resource capacity usage on the KVM hosts (individually by host).",
-		[]string{
-			"compute_host",
-			"resource",
-			"type",
-			"availability_zone",
-			"building_block",
-			"cpu_architecture",
-			"workload_type",
-			"enabled",
-			"decommissioned",
-			"external_customer",
-			"maintenance",
-		},
+		"Resource capacity usage on the KVM hosts (individually by host). CPU in vCPUs, memory in bytes.",
+		append(kvmHostLabels, "resource", "type"),
 		nil,
 	)
 	return nil
 }
 
-func (k *KVMResourceCapacityKPI) Describe(ch chan<- *prometheus.Desc) {
+func (k *KVMHostCapacityKPI) Describe(ch chan<- *prometheus.Desc) {
 	ch <- k.totalCapacityPerHost
 	ch <- k.capacityPerHost
 }
@@ -230,7 +130,7 @@ func aggregateReservationsByHost(reservations []v1alpha1.Reservation) (
 	return failoverByHost, committedNotInUseByHost
 }
 
-func (k *KVMResourceCapacityKPI) getHypervisors() ([]kvmHost, error) {
+func (k *KVMHostCapacityKPI) getHypervisors() ([]kvmHost, error) {
 	hvs := &hv1.HypervisorList{}
 	if err := k.Client.List(context.Background(), hvs); err != nil {
 		return nil, err
@@ -243,7 +143,7 @@ func (k *KVMResourceCapacityKPI) getHypervisors() ([]kvmHost, error) {
 	return hosts, nil
 }
 
-func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
+func (k *KVMHostCapacityKPI) Collect(ch chan<- prometheus.Metric) {
 	hypervisors, err := k.getHypervisors()
 	if err != nil {
 		slog.Error("failed to get hypervisors", "error", err)
@@ -260,6 +160,7 @@
 	for _, hypervisor := range hypervisors {
 		cpuTotal, hasCPUTotal := hypervisor.getResourceCapacity(hv1.ResourceCPU)
+
 		ramTotal, hasRAMTotal := hypervisor.getResourceCapacity(hv1.ResourceMemory)
 
 		if !hasCPUTotal || !hasRAMTotal {
@@ -276,22 +177,23 @@
 		cpuReserved := committedRes.cpu
 		ramReserved := committedRes.memory
+
 		cpuFailover := failoverRes.cpu
 		ramFailover := failoverRes.memory
 
-		labels := hypervisor.getLabels()
+		labels := hypervisor.getHostLabels()
 
-		k.emitTotal(ch, "cpu", cpuTotal.AsApproximateFloat64(), labels)
-		k.emitTotal(ch, "ram", ramTotal.AsApproximateFloat64(), labels)
+		ch <- prometheus.MustNewConstMetric(k.totalCapacityPerHost, prometheus.GaugeValue, cpuTotal.AsApproximateFloat64(), append(labels, "cpu")...)
+		ch <- prometheus.MustNewConstMetric(k.totalCapacityPerHost, prometheus.GaugeValue, ramTotal.AsApproximateFloat64(), append(labels, "ram")...)
 
-		k.emitUsage(ch, "cpu", cpuUsed.AsApproximateFloat64(), "utilized", labels)
-		k.emitUsage(ch, "ram", ramUsed.AsApproximateFloat64(), "utilized", labels)
+		ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, cpuUsed.AsApproximateFloat64(), append(labels, "cpu", "utilized")...)
+		ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, ramUsed.AsApproximateFloat64(), append(labels, "ram", "utilized")...)
 
-		k.emitUsage(ch, "cpu", cpuReserved.AsApproximateFloat64(), "reserved", labels)
-		k.emitUsage(ch, "ram", ramReserved.AsApproximateFloat64(), "reserved", labels)
+		ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, cpuReserved.AsApproximateFloat64(), append(labels, "cpu", "reserved")...)
+		ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, ramReserved.AsApproximateFloat64(), append(labels, "ram", "reserved")...)
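+
+		// Usage types, for reference: "utilized" is what running VMs already
+		// allocate, "reserved" is committed-reservation capacity not yet
+		// claimed by allocations, "failover" (below) is parked for failover
+		// reservations, and the remainder is emitted last as "available"
+		// (pay-as-you-go) headroom.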
- k.emitUsage(ch, "cpu", cpuFailover.AsApproximateFloat64(), "failover", labels) - k.emitUsage(ch, "ram", ramFailover.AsApproximateFloat64(), "failover", labels) + ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, cpuFailover.AsApproximateFloat64(), append(labels, "cpu", "failover")...) + ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, ramFailover.AsApproximateFloat64(), append(labels, "ram", "failover")...) // Calculate PAYG capacity paygCPU := cpuTotal.DeepCopy() @@ -310,57 +212,7 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { paygRAM = resource.MustParse("0") } - k.emitUsage(ch, "cpu", paygCPU.AsApproximateFloat64(), "payg", labels) - k.emitUsage(ch, "ram", paygRAM.AsApproximateFloat64(), "payg", labels) + ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, paygCPU.AsApproximateFloat64(), append(labels, "cpu", "available")...) + ch <- prometheus.MustNewConstMetric(k.capacityPerHost, prometheus.GaugeValue, paygRAM.AsApproximateFloat64(), append(labels, "ram", "available")...) } } - -// kvmHostLabels holds precomputed label values derived from a hypervisor. -type kvmHostLabels struct { - computeHost string - availabilityZone string - buildingBlock string - cpuArchitecture string - workloadType string - enabled string - decommissioned string - externalCustomer string - maintenance string -} - -func (k *KVMResourceCapacityKPI) emitTotal(ch chan<- prometheus.Metric, resourceName string, value float64, l kvmHostLabels) { - ch <- prometheus.MustNewConstMetric( - k.totalCapacityPerHost, - prometheus.GaugeValue, - value, - l.computeHost, - resourceName, - l.availabilityZone, - l.buildingBlock, - l.cpuArchitecture, - l.workloadType, - l.enabled, - l.decommissioned, - l.externalCustomer, - l.maintenance, - ) -} - -func (k *KVMResourceCapacityKPI) emitUsage(ch chan<- prometheus.Metric, resourceName string, value float64, capacityType string, l kvmHostLabels) { - ch <- prometheus.MustNewConstMetric( - k.capacityPerHost, - prometheus.GaugeValue, - value, - l.computeHost, - resourceName, - capacityType, - l.availabilityZone, - l.buildingBlock, - l.cpuArchitecture, - l.workloadType, - l.enabled, - l.decommissioned, - l.externalCustomer, - l.maintenance, - ) -} diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_host_capacity_test.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_host_capacity_test.go new file mode 100644 index 000000000..317b478cc --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_host_capacity_test.go @@ -0,0 +1,1003 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/pkg/conf" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" + "k8s.io/apimachinery/pkg/api/resource" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestKVMResourceCapacityKPI_Init(t *testing.T) { + kpi := &KVMHostCapacityKPI{} + if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} + +func kvmTotalMetric(host, res, az string, value float64) collectedVMwareMetric { + l := mockKVMHostLabels(host, az) + l["resource"] = res + return 
+func kvmTotalMetric(host, res, az string, value float64) collectedVMwareMetric {
+  l := mockKVMHostLabels(host, az)
+  l["resource"] = res
+  return collectedVMwareMetric{Name: "cortex_kvm_host_capacity_total", Labels: l, Value: value}
+}
+
+func kvmUsageMetric(host, res, capacityType, az string, value float64) collectedVMwareMetric {
+  l := mockKVMHostLabels(host, az)
+  l["resource"] = res
+  l["type"] = capacityType
+  return collectedVMwareMetric{Name: "cortex_kvm_host_capacity_usage", Labels: l, Value: value}
+}
+
+func TestKVMResourceCapacityKPI_Collect(t *testing.T) {
+  tests := []struct {
+    name            string
+    hypervisors     []hv1.Hypervisor
+    reservations    []v1alpha1.Reservation
+    expectedMetrics []collectedVMwareMetric
+  }{
+    {
+      name: "single hypervisor with nil effective capacity",
+      hypervisors: []hv1.Hypervisor{
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "node001-bb088",
+            Labels: map[string]string{
+              "topology.kubernetes.io/zone": "qa-1a",
+            },
+          },
+          Status: hv1.HypervisorStatus{
+            EffectiveCapacity: nil,
+            Allocation: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("64"),
+              hv1.ResourceMemory: resource.MustParse("256Gi"),
+            },
+            Traits: []string{},
+          },
+        },
+      },
+      expectedMetrics: []collectedVMwareMetric{},
+    },
+    {
+      name: "single hypervisor with zero total capacity",
+      hypervisors: []hv1.Hypervisor{
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "node001-bb088",
+            Labels: map[string]string{
+              "topology.kubernetes.io/zone": "qa-1a",
+            },
+          },
+          Status: hv1.HypervisorStatus{
+            EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("0"),
+              hv1.ResourceMemory: resource.MustParse("0"),
+            },
+            Allocation: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("0"),
+              hv1.ResourceMemory: resource.MustParse("0"),
+            },
+            Traits: []string{},
+          },
+        },
+      },
+      expectedMetrics: []collectedVMwareMetric{},
+    },
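+    // Capacity lookup falls back from status.effectiveCapacity to
+    // status.capacity; hosts where both are missing or zero emit no
+    // metrics at all, as the following cases verify.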
resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 128), + kvmTotalMetric("node001-bb088", "ram", "qa-1a", 549755813888), // 512Gi + kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 64), + kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 274877906944), // 256Gi + kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "cpu", "failover", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 64), // 128-64-0-0 + kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 274877906944), // 512Gi-256Gi + }, + }, + { + name: "zero effective capacity with nil physical capacity skips", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Capacity: nil, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{}, + }, + { + name: "zero effective capacity with zero physical capacity skips", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{}, + }, + { + name: "single hypervisor with default traits, no reservations", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 128), + kvmTotalMetric("node001-bb088", "ram", "qa-1a", 549755813888), // 512Gi + kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 64), + kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 274877906944), // 256Gi + kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "cpu", "failover", 
"qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 64), // 128-64-0-0 + kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 274877906944), // 512Gi-256Gi + }, + }, + { + name: "hypervisor with sapphire rapids and hana traits", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node002-bb089", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1b", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("256"), + hv1.ResourceMemory: resource.MustParse("1Ti"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Traits: []string{ + "CUSTOM_HW_SAPPHIRE_RAPIDS", + "CUSTOM_HANA_EXCLUSIVE_HOST", + }, + }, + }, + }, + expectedMetrics: func() []collectedVMwareMetric { + l := func(res, typ string) map[string]string { + m := mockKVMHostLabels("node002-bb089", "qa-1b") + m["cpu_architecture"] = "sapphire-rapids" + m["workload_type"] = "hana" + m["resource"] = res + if typ != "" { + m["type"] = typ + } + return m + } + return []collectedVMwareMetric{ + {Name: "cortex_kvm_host_capacity_total", Labels: l("cpu", ""), Value: 256}, + {Name: "cortex_kvm_host_capacity_total", Labels: l("ram", ""), Value: 1099511627776}, // 1Ti + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "utilized"), Value: 128}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "utilized"), Value: 549755813888}, // 512Gi + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "reserved"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "reserved"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "failover"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "failover"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "available"), Value: 128}, // 256-128-0-0 + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "available"), Value: 549755813888}, // 1Ti-512Gi + } + }(), + }, + { + name: "hypervisor with decommissioned and external customer traits", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node003-bb090", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1c", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("32"), + hv1.ResourceMemory: resource.MustParse("128Gi"), + }, + Traits: []string{ + "CUSTOM_DECOMMISSIONING", + "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE", + }, + }, + }, + }, + expectedMetrics: func() []collectedVMwareMetric { + l := func(res, typ string) map[string]string { + m := mockKVMHostLabels("node003-bb090", "qa-1c") + m["decommissioned"] = "true" + m["external_customer"] = "true" + m["resource"] = res + if typ != "" { + m["type"] = typ + } + return m + } + return []collectedVMwareMetric{ + {Name: "cortex_kvm_host_capacity_total", Labels: l("cpu", ""), Value: 64}, + {Name: "cortex_kvm_host_capacity_total", Labels: l("ram", ""), Value: 274877906944}, // 256Gi + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "utilized"), Value: 32}, + {Name: "cortex_kvm_host_capacity_usage", Labels: 
l("ram", "utilized"), Value: 137438953472}, // 128Gi + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "reserved"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "reserved"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "failover"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "failover"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: l("cpu", "available"), Value: 32}, // 64-32-0-0 + {Name: "cortex_kvm_host_capacity_usage", Labels: l("ram", "available"), Value: 137438953472}, // 256Gi-128Gi + } + }(), + }, + { + name: "multiple hypervisors", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node010-bb100", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), + hv1.ResourceMemory: resource.MustParse("200Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("50"), + hv1.ResourceMemory: resource.MustParse("100Gi"), + }, + Traits: []string{}, + }, + }, + { + ObjectMeta: v1.ObjectMeta{ + Name: "node020-bb200", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1b", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("200"), + hv1.ResourceMemory: resource.MustParse("400Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("150"), + hv1.ResourceMemory: resource.MustParse("300Gi"), + }, + Traits: []string{"CUSTOM_HW_SAPPHIRE_RAPIDS"}, + }, + }, + }, + expectedMetrics: func() []collectedVMwareMetric { + sapphire := func(res, typ string) map[string]string { + m := mockKVMHostLabels("node020-bb200", "qa-1b") + m["cpu_architecture"] = "sapphire-rapids" + m["resource"] = res + if typ != "" { + m["type"] = typ + } + return m + } + return []collectedVMwareMetric{ + kvmTotalMetric("node010-bb100", "cpu", "qa-1a", 100), + kvmTotalMetric("node010-bb100", "ram", "qa-1a", 214748364800), // 200Gi + kvmUsageMetric("node010-bb100", "cpu", "utilized", "qa-1a", 50), + kvmUsageMetric("node010-bb100", "ram", "utilized", "qa-1a", 107374182400), // 100Gi + kvmUsageMetric("node010-bb100", "cpu", "reserved", "qa-1a", 0), + kvmUsageMetric("node010-bb100", "ram", "reserved", "qa-1a", 0), + kvmUsageMetric("node010-bb100", "cpu", "failover", "qa-1a", 0), + kvmUsageMetric("node010-bb100", "ram", "failover", "qa-1a", 0), + kvmUsageMetric("node010-bb100", "cpu", "available", "qa-1a", 50), // 100-50-0-0 + kvmUsageMetric("node010-bb100", "ram", "available", "qa-1a", 107374182400), // 200Gi-100Gi + {Name: "cortex_kvm_host_capacity_total", Labels: sapphire("cpu", ""), Value: 200}, + {Name: "cortex_kvm_host_capacity_total", Labels: sapphire("ram", ""), Value: 429496729600}, // 400Gi + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("cpu", "utilized"), Value: 150}, + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("ram", "utilized"), Value: 322122547200}, // 300Gi + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("cpu", "reserved"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("ram", "reserved"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("cpu", "failover"), Value: 0}, + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("ram", "failover"), Value: 0}, + 
{Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("cpu", "available"), Value: 50}, // 200-150-0-0 + {Name: "cortex_kvm_host_capacity_usage", Labels: sapphire("ram", "available"), Value: 107374182400}, // 400Gi-300Gi + } + }(), + }, + { + name: "hypervisor with missing allocation data", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node004-bb091", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1d", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("96"), + hv1.ResourceMemory: resource.MustParse("384Gi"), + }, + Allocation: nil, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + kvmTotalMetric("node004-bb091", "cpu", "qa-1d", 96), + kvmTotalMetric("node004-bb091", "ram", "qa-1d", 412316860416), // 384Gi + kvmUsageMetric("node004-bb091", "cpu", "utilized", "qa-1d", 0), + kvmUsageMetric("node004-bb091", "ram", "utilized", "qa-1d", 0), + kvmUsageMetric("node004-bb091", "cpu", "reserved", "qa-1d", 0), + kvmUsageMetric("node004-bb091", "ram", "reserved", "qa-1d", 0), + kvmUsageMetric("node004-bb091", "cpu", "failover", "qa-1d", 0), + kvmUsageMetric("node004-bb091", "ram", "failover", "qa-1d", 0), + kvmUsageMetric("node004-bb091", "cpu", "available", "qa-1d", 96), // 96-0-0-0 + kvmUsageMetric("node004-bb091", "ram", "available", "qa-1d", 412316860416), // 384Gi-0 + }, + }, + { + name: "failover reservation on a hypervisor", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "failover-1", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("16"), + hv1.ResourceMemory: resource.MustParse("64Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 128), + kvmTotalMetric("node001-bb088", "ram", "qa-1a", 549755813888), // 512Gi + kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 64), + kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 274877906944), // 256Gi + kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "cpu", "failover", "qa-1a", 16), + kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 68719476736), // 64Gi + kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 48), // 128-64-0-16 + kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 206158430208), // 512Gi-256Gi-0-64Gi = 192Gi + }, + }, + { + name: "committed resource 
reservation with partial allocation", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "committed-1", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("32"), + hv1.ResourceMemory: resource.MustParse("128Gi"), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-uuid-1": { + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("8"), + hv1.ResourceMemory: resource.MustParse("32Gi"), + }, + }, + }, + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 128), + kvmTotalMetric("node001-bb088", "ram", "qa-1a", 549755813888), // 512Gi + kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 64), + kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 274877906944), // 256Gi + // reserved = 32-8=24 CPU, 128Gi-32Gi=96Gi RAM (not in use) + kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 24), + kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 103079215104), // 96Gi + kvmUsageMetric("node001-bb088", "cpu", "failover", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 40), // 128-64-24-0 + kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 171798691840), // 512Gi-256Gi-96Gi-0 = 160Gi + }, + }, + { + name: "non-ready reservation should be ignored", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "failover-not-ready", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("16"), + hv1.ResourceMemory: resource.MustParse("64Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: 
+    {
+      name: "non-ready reservation should be ignored",
+      hypervisors: []hv1.Hypervisor{
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "node001-bb088",
+            Labels: map[string]string{
+              "topology.kubernetes.io/zone": "qa-1a",
+            },
+          },
+          Status: hv1.HypervisorStatus{
+            EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("128"),
+              hv1.ResourceMemory: resource.MustParse("512Gi"),
+            },
+            Allocation: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("64"),
+              hv1.ResourceMemory: resource.MustParse("256Gi"),
+            },
+            Traits: []string{},
+          },
+        },
+      },
+      reservations: []v1alpha1.Reservation{
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "failover-not-ready",
+          },
+          Spec: v1alpha1.ReservationSpec{
+            Type: v1alpha1.ReservationTypeFailover,
+            SchedulingDomain: v1alpha1.SchedulingDomainNova,
+            Resources: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("16"),
+              hv1.ResourceMemory: resource.MustParse("64Gi"),
+            },
+            FailoverReservation: &v1alpha1.FailoverReservationSpec{},
+          },
+          Status: v1alpha1.ReservationStatus{
+            Host: "node001-bb088",
+            Conditions: []v1.Condition{
+              {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionFalse},
+            },
+          },
+        },
+      },
+      expectedMetrics: []collectedVMwareMetric{
+        kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 128),
+        kvmTotalMetric("node001-bb088", "ram", "qa-1a", 549755813888), // 512Gi
+        kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 64),
+        kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 274877906944), // 256Gi
+        kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 0),
+        kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 0),
+        // Non-ready reservation ignored, so failover = 0
+        kvmUsageMetric("node001-bb088", "cpu", "failover", "qa-1a", 0),
+        kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 0),
+        kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 64), // 128-64-0-0
+        kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 274877906944), // 512Gi-256Gi
+      },
+    },
+    {
+      name: "multiple failover reservations on same host are summed",
+      hypervisors: []hv1.Hypervisor{
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "node001-bb088",
+            Labels: map[string]string{
+              "topology.kubernetes.io/zone": "qa-1a",
+            },
+          },
+          Status: hv1.HypervisorStatus{
+            EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("128"),
+              hv1.ResourceMemory: resource.MustParse("512Gi"),
+            },
+            Allocation: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("64"),
+              hv1.ResourceMemory: resource.MustParse("256Gi"),
+            },
+            Traits: []string{},
+          },
+        },
+      },
+      reservations: []v1alpha1.Reservation{
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "failover-1",
+          },
+          Spec: v1alpha1.ReservationSpec{
+            Type: v1alpha1.ReservationTypeFailover,
+            SchedulingDomain: v1alpha1.SchedulingDomainNova,
+            Resources: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("8"),
+              hv1.ResourceMemory: resource.MustParse("32Gi"),
+            },
+            FailoverReservation: &v1alpha1.FailoverReservationSpec{},
+          },
+          Status: v1alpha1.ReservationStatus{
+            Host: "node001-bb088",
+            Conditions: []v1.Condition{
+              {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue},
+            },
+          },
+        },
+        {
+          ObjectMeta: v1.ObjectMeta{
+            Name: "failover-2",
+          },
+          Spec: v1alpha1.ReservationSpec{
+            Type: v1alpha1.ReservationTypeFailover,
+            SchedulingDomain: v1alpha1.SchedulingDomainNova,
+            Resources: map[hv1.ResourceName]resource.Quantity{
+              hv1.ResourceCPU: resource.MustParse("12"),
+              hv1.ResourceMemory: resource.MustParse("48Gi"),
+            },
+            FailoverReservation: &v1alpha1.FailoverReservationSpec{},
+          },
+          Status: v1alpha1.ReservationStatus{
+            Host: "node001-bb088",
+            Conditions: []v1.Condition{
+              {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue},
+            },
+          },
+        },
+      },
+      expectedMetrics: []collectedVMwareMetric{
+        kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 128),
+        kvmTotalMetric("node001-bb088", "ram", "qa-1a", 549755813888), // 512Gi
+        kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 64),
+        kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 274877906944), // 256Gi
+        kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 0),
+        kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 0),
+        // failover = 8+12=20 CPU, 32Gi+48Gi=80Gi RAM
+        kvmUsageMetric("node001-bb088", "cpu", "failover", "qa-1a", 20),
+        kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 85899345920), // 80Gi
+        kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 44), // 128-64-0-20
+        kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 188978561024), // 512Gi-256Gi-0-80Gi = 176Gi
+      },
+    },
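+    // available = total - utilized - reserved - failover; a negative result
+    // is clamped to zero instead of being exported as a negative gauge.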
"qa-1a", 188978561024), // 512Gi-256Gi-0-80Gi = 176Gi + }, + }, + { + name: "payg capacity clamped to zero when overcommitted", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{"topology.kubernetes.io/zone": "qa-1a"}, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), + hv1.ResourceMemory: resource.MustParse("200Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("80"), + hv1.ResourceMemory: resource.MustParse("150Gi"), + }, + Traits: []string{}, + }, + }, + }, + // failover=20 CPU/40Gi RAM, committed reserved=20 CPU/40Gi RAM (no allocations) + // CPU: 100 - 80 - 20 - 20 = -20 → clamped to 0 + // RAM: 200Gi - 150Gi - 40Gi - 40Gi = -30Gi → clamped to 0 + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{Name: "failover-1"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("20"), + hv1.ResourceMemory: resource.MustParse("40Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + { + ObjectMeta: v1.ObjectMeta{Name: "committed-1"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("20"), + hv1.ResourceMemory: resource.MustParse("40Gi"), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + kvmTotalMetric("node001-bb088", "cpu", "qa-1a", 100), + kvmTotalMetric("node001-bb088", "ram", "qa-1a", 214748364800), // 200Gi + kvmUsageMetric("node001-bb088", "cpu", "utilized", "qa-1a", 80), + kvmUsageMetric("node001-bb088", "ram", "utilized", "qa-1a", 161061273600), // 150Gi + kvmUsageMetric("node001-bb088", "cpu", "reserved", "qa-1a", 20), + kvmUsageMetric("node001-bb088", "ram", "reserved", "qa-1a", 42949672960), // 40Gi + kvmUsageMetric("node001-bb088", "cpu", "failover", "qa-1a", 20), + kvmUsageMetric("node001-bb088", "ram", "failover", "qa-1a", 42949672960), // 40Gi + kvmUsageMetric("node001-bb088", "cpu", "available", "qa-1a", 0), + kvmUsageMetric("node001-bb088", "ram", "available", "qa-1a", 0), + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := runtime.NewScheme() + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hypervisor scheme: %v", err) + } + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add v1alpha1 scheme: %v", err) + } + + objects := make([]runtime.Object, 0, len(tt.hypervisors)+len(tt.reservations)) + for i := range tt.hypervisors { + objects = append(objects, &tt.hypervisors[i]) + } + for i := range tt.reservations { + objects = append(objects, &tt.reservations[i]) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(objects...). 
+ Build() + + kpi := &KVMHostCapacityKPI{} + if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("failed to init KPI: %v", err) + } + + ch := make(chan prometheus.Metric, 1000) + kpi.Collect(ch) + close(ch) + + actual := make(map[string]collectedVMwareMetric) + for m := range ch { + var pm prometheusgo.Metric + if err := m.Write(&pm); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + labels := make(map[string]string) + for _, lbl := range pm.Label { + labels[lbl.GetName()] = lbl.GetValue() + } + name := getMetricName(m.Desc().String()) + key := name + "|" + labels["compute_host"] + "|" + labels["resource"] + "|" + labels["type"] + if _, exists := actual[key]; exists { + t.Fatalf("duplicate metric key %q", key) + } + actual[key] = collectedVMwareMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()} + } + + if len(actual) != len(tt.expectedMetrics) { + t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual) + } + for _, exp := range tt.expectedMetrics { + key := exp.Name + "|" + exp.Labels["compute_host"] + "|" + exp.Labels["resource"] + "|" + exp.Labels["type"] + got, ok := actual[key] + if !ok { + t.Errorf("missing metric %q", key) + continue + } + if got.Value != exp.Value { + t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value) + } + if !reflect.DeepEqual(exp.Labels, got.Labels) { + t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels) + } + } + }) + } +} + +func TestAggregateReservationsByHost(t *testing.T) { + tests := []struct { + name string + reservations []v1alpha1.Reservation + expectedFailover map[string]hostReservationResources + expectedCommittedNotInUse map[string]hostReservationResources + }{ + { + name: "empty reservations", + reservations: nil, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{}, + }, + { + name: "reservation with no ready condition is skipped", + reservations: []v1alpha1.Reservation{ + { + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("10"), + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "host-1", + // No conditions + }, + }, + }, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{}, + }, + { + name: "reservation with empty host is skipped", + reservations: []v1alpha1.Reservation{ + { + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("10"), + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{}, + }, + { + name: "committed resource with nil spec does not panic", + reservations: []v1alpha1.Reservation{ + { + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("16"), + hv1.ResourceMemory: 
resource.MustParse("64Gi"), + }, + CommittedResourceReservation: nil, + }, + Status: v1alpha1.ReservationStatus{ + Host: "host-1", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{ + "host-1": { + cpu: resource.MustParse("16"), + memory: resource.MustParse("64Gi"), + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + failover, committed := aggregateReservationsByHost(tt.reservations) + + if len(failover) != len(tt.expectedFailover) { + t.Errorf("failover map length: expected %d, got %d", len(tt.expectedFailover), len(failover)) + } + for host, expected := range tt.expectedFailover { + actual, ok := failover[host] + if !ok { + t.Errorf("failover: host %q not found", host) + continue + } + if actual.cpu.Cmp(expected.cpu) != 0 { + t.Errorf("failover[%s].cpu: expected %s, got %s", host, expected.cpu.String(), actual.cpu.String()) + } + if actual.memory.Cmp(expected.memory) != 0 { + t.Errorf("failover[%s].memory: expected %s, got %s", host, expected.memory.String(), actual.memory.String()) + } + } + + if len(committed) != len(tt.expectedCommittedNotInUse) { + t.Errorf("committed map length: expected %d, got %d", len(tt.expectedCommittedNotInUse), len(committed)) + } + for host, expected := range tt.expectedCommittedNotInUse { + actual, ok := committed[host] + if !ok { + t.Errorf("committed: host %q not found", host) + continue + } + if actual.cpu.Cmp(expected.cpu) != 0 { + t.Errorf("committed[%s].cpu: expected %s, got %s", host, expected.cpu.String(), actual.cpu.String()) + } + if actual.memory.Cmp(expected.memory) != 0 { + t.Errorf("committed[%s].memory: expected %s, got %s", host, expected.memory.String(), actual.memory.String()) + } + } + }) + } +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go new file mode 100644 index 000000000..20f0ca32e --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go @@ -0,0 +1,204 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "context" + "log/slog" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type kvmProjectInstanceCount struct { + ProjectID string `db:"project_id"` + ProjectName string `db:"project_name"` + DomainID string `db:"domain_id"` + DomainName string `db:"domain_name"` + ComputeHost string `db:"compute_host"` + FlavorName string `db:"flavor_name"` + AvailabilityZone string `db:"availability_zone"` + InstanceCount float64 `db:"instance_count"` +} + +type kvmProjectCapacityUsage struct { + ProjectID string `db:"project_id"` + ProjectName string `db:"project_name"` + DomainID string `db:"domain_id"` + DomainName string `db:"domain_name"` + ComputeHost string `db:"compute_host"` + AvailabilityZone string `db:"availability_zone"` + 
TotalVCPUs       float64 `db:"total_vcpus"`
+	TotalRAMMB       float64 `db:"total_ram_mb"`
+	TotalDiskGB      float64 `db:"total_disk_gb"`
+}
+
+type KVMProjectUtilizationKPI struct {
+	// BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client.
+	plugins.BaseKPI[struct{}]
+
+	// instanceCountPerProjectAndHostAndFlavor is a Prometheus descriptor for the metric that counts the number of instances per project, host, and flavor.
+	instanceCountPerProjectAndHostAndFlavor *prometheus.Desc
+
+	// capacityUsagePerProjectAndHost is a Prometheus descriptor for the metric that measures the capacity usage per project and host.
+	capacityUsagePerProjectAndHost *prometheus.Desc
+}
+
+func (k *KVMProjectUtilizationKPI) GetName() string {
+	return "kvm_project_utilization_kpi"
+}
+
+func (k *KVMProjectUtilizationKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
+	if err := k.BaseKPI.Init(dbConn, c, opts); err != nil {
+		return err
+	}
+
+	k.instanceCountPerProjectAndHostAndFlavor = prometheus.NewDesc(
+		"cortex_kvm_project_instances",
+		"Number of instances per project, hypervisor, and flavor on KVM, excluding deleted and errored instances.",
+		append(kvmHostLabels, "project_id", "project_name", "domain_id", "domain_name", "flavor_name"), nil,
+	)
+	k.capacityUsagePerProjectAndHost = prometheus.NewDesc(
+		"cortex_kvm_project_capacity_usage",
+		"Resource capacity used by a project per KVM hypervisor. CPU in vCPUs, memory and disk in bytes.",
+		append(kvmHostLabels, "project_id", "project_name", "domain_id", "domain_name", "resource"), nil,
+	)
+	return nil
+}
+
+func (k *KVMProjectUtilizationKPI) Describe(ch chan<- *prometheus.Desc) {
+	ch <- k.instanceCountPerProjectAndHostAndFlavor
+	ch <- k.capacityUsagePerProjectAndHost
+}
+
+func (k *KVMProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) {
+	hosts, err := k.getKVMHosts()
+	if err != nil {
+		slog.Error("kvm_project_utilization: failed to get KVM hosts", "error", err)
+		return
+	}
+
+	// Export project x flavor x compute_host instance count metric
+	projectInstanceCounts, err := k.queryProjectInstanceCount()
+	if err != nil {
+		slog.Error("kvm_project_utilization: Failed to query project instance count for project utilization KPI", "error", err)
+		return
+	}
+	for _, projectInstanceCount := range projectInstanceCounts {
+		host, ok := hosts[projectInstanceCount.ComputeHost]
+		if !ok {
+			slog.Warn("kvm_project_utilization: Compute host not found for project instance count", "compute_host", projectInstanceCount.ComputeHost)
+			continue
+		}
+		hostLabels := host.getHostLabels()
+		hostLabels = append(hostLabels, projectInstanceCount.ProjectID, projectInstanceCount.ProjectName, projectInstanceCount.DomainID, projectInstanceCount.DomainName, projectInstanceCount.FlavorName)
+		ch <- prometheus.MustNewConstMetric(k.instanceCountPerProjectAndHostAndFlavor, prometheus.GaugeValue, projectInstanceCount.InstanceCount, hostLabels...)
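+		// Note: the variadic label values above must mirror the Desc construction in Init:
+		// kvmHostLabels first, then project_id, project_name, domain_id, domain_name, flavor_name.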
+	}
+
+	// Export project x compute_host x resource capacity usage metric
+	projectCapacityUsages, err := k.queryProjectCapacityUsage()
+	if err != nil {
+		slog.Error("kvm_project_utilization: Failed to query project capacity usage for project utilization KPI", "error", err)
+		return
+	}
+	for _, projectCapacityUsage := range projectCapacityUsages {
+		host, ok := hosts[projectCapacityUsage.ComputeHost]
+		if !ok {
+			slog.Warn("kvm_project_utilization: Compute host not found for project capacity usage", "compute_host", projectCapacityUsage.ComputeHost)
+			continue
+		}
+		hostLabels := host.getHostLabels()
+		hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName, projectCapacityUsage.DomainID, projectCapacityUsage.DomainName)
+
+		ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...)
+		ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "memory")...)
+		ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalDiskGB*1024*1024*1024, append(hostLabels, "disk")...)
+	}
+}
+
+// getKVMHosts retrieves the list of KVM hypervisors via the Kubernetes client, returning a map keyed by compute host name.
+func (k *KVMProjectUtilizationKPI) getKVMHosts() (map[string]kvmHost, error) {
+	hvs := &hv1.HypervisorList{}
+	if err := k.Client.List(context.Background(), hvs); err != nil {
+		return nil, err
+	}
+
+	hosts := make(map[string]kvmHost, len(hvs.Items))
+	for _, hv := range hvs.Items {
+		host := kvmHost{Hypervisor: hv}
+		hosts[host.Name] = host
+	}
+	return hosts, nil
+}
+
+// queryProjectCapacityUsage retrieves the total vCPU, RAM, and disk capacity used per project, hypervisor, and availability zone on KVM from the database.
+func (k *KVMProjectUtilizationKPI) queryProjectCapacityUsage() ([]kvmProjectCapacityUsage, error) {
+	// This query fetches all instances that are not deleted and not in an error state. It joins the OpenStack projects table to resolve the project name.
+	// It also joins the flavors table to get the flavor information, which is needed for the capacity usage metrics.
+	// The results are grouped by project, compute host, and availability zone to get the total capacity usage per project and hypervisor.
+	// We filter the results to only include instances that are running on KVM hypervisors by checking the compute host name pattern.
+	// This assumes that all KVM hypervisors have a compute host name that follows the pattern "nodeXXX-bbYYY",
+	// which is a naming convention in SAP Cloud Infrastructure and may need to be adjusted based on the actual environment.
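+	// Illustrative note: kvmComputeHostPattern is "node%-bb%", so the assembled filter below
+	// becomes `os_ext_srv_attr_host LIKE 'node%-bb%'`. A host such as "node001-bb01" matches,
+	// while a VMware service host like "nova-compute-1" does not.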
+	query := `
+		SELECT
+			s.tenant_id AS project_id,
+			COALESCE(p.name, '') AS project_name,
+			COALESCE(p.domain_id, '') AS domain_id,
+			COALESCE(d.name, '') AS domain_name,
+			s.os_ext_srv_attr_host AS compute_host,
+			s.os_ext_az_availability_zone AS availability_zone,
+			COALESCE(SUM(f.vcpus), 0) AS total_vcpus,
+			COALESCE(SUM(f.ram), 0) AS total_ram_mb,
+			COALESCE(SUM(f.disk), 0) AS total_disk_gb
+		FROM ` + nova.Server{}.TableName() + ` s
+		LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON s.flavor_name = f.name
+		LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id
+		LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id
+		WHERE s.status NOT IN ('DELETED', 'ERROR')
+		AND s.os_ext_srv_attr_host LIKE '` + kvmComputeHostPattern + `'
+		GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host, s.os_ext_az_availability_zone
+	`
+	var usages []kvmProjectCapacityUsage
+	if _, err := k.DB.Select(&usages, query); err != nil {
+		return nil, err
+	}
+	return usages, nil
+}
+
+// queryProjectInstanceCount retrieves the number of instances per project, hypervisor, and flavor on KVM from the database, excluding deleted and errored instances.
+func (k *KVMProjectUtilizationKPI) queryProjectInstanceCount() ([]kvmProjectInstanceCount, error) {
+	// This query fetches all instances that are not deleted and not in an error state. It joins the OpenStack projects table to resolve the project name.
+	// The results are grouped by project, hypervisor, flavor, and availability zone to get the instance count.
+	// We filter the results to only include instances that are running on KVM hypervisors by checking the compute host name pattern.
+	// This assumes that all KVM hypervisors have a compute host name that follows the pattern "nodeXXX-bbYYY",
+	// which is a naming convention in SAP Cloud Infrastructure and may need to be adjusted based on the actual environment.
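+	// Worked example (mirroring the test fixtures): two non-deleted servers sharing the same
+	// (tenant_id, compute_host, flavor_name, availability_zone) collapse into a single row
+	// with instance_count = 2; a server on a different host or flavor forms its own row.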
+ query := ` + SELECT + s.tenant_id AS project_id, + COALESCE(p.name, '') AS project_name, + COALESCE(p.domain_id, '') AS domain_id, + COALESCE(d.name, '') AS domain_name, + s.os_ext_srv_attr_host AS compute_host, + s.os_ext_az_availability_zone AS availability_zone, + s.flavor_name, + COUNT(*) AS instance_count + FROM ` + nova.Server{}.TableName() + ` s + LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id + LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id + WHERE s.status NOT IN ('DELETED', 'ERROR') + AND s.os_ext_srv_attr_host LIKE '` + kvmComputeHostPattern + `' + GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host, s.flavor_name, s.os_ext_az_availability_zone + ` + var usages []kvmProjectInstanceCount + if _, err := k.DB.Select(&usages, query); err != nil { + return nil, err + } + return usages, nil +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go new file mode 100644 index 000000000..9b919c676 --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go @@ -0,0 +1,764 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/pkg/conf" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +type collectedKVMMetric struct { + Name string + Labels map[string]string + Value float64 +} + +func buildKVMMetricKey(name string, labels map[string]string) string { + switch name { + case "cortex_kvm_project_instances": + return name + "|" + labels["compute_host"] + "|" + labels["project_id"] + + "|" + labels["flavor_name"] + "|" + labels["availability_zone"] + case "cortex_kvm_project_capacity_usage": + return name + "|" + labels["compute_host"] + "|" + labels["project_id"] + + "|" + labels["availability_zone"] + "|" + labels["resource"] + default: + return name + } +} + +func kvmInstanceMetric(computeHost, az, projectID, projectName, domainID, domainName, flavorName string, value float64) collectedKVMMetric { + labels := mockKVMHostLabels(computeHost, az) + labels["project_id"] = projectID + labels["project_name"] = projectName + labels["domain_id"] = domainID + labels["domain_name"] = domainName + labels["flavor_name"] = flavorName + return collectedKVMMetric{Name: "cortex_kvm_project_instances", Labels: labels, Value: value} +} + +func kvmCapacityMetric(computeHost, az, projectID, projectName, domainID, domainName, resource string, value float64) collectedKVMMetric { + labels := mockKVMHostLabels(computeHost, az) + labels["project_id"] = projectID + labels["project_name"] = projectName + labels["domain_id"] = domainID + labels["domain_name"] = domainName + labels["resource"] = resource + return collectedKVMMetric{Name: "cortex_kvm_project_capacity_usage", Labels: labels, Value: value} +} + 
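+// Usage sketch (hedged): buildKVMMetricKey above dedupes expected metrics; for an instance
+// metric the key is name|compute_host|project_id|flavor_name|availability_zone, e.g.
+//
+//	m := kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1)
+//	key := buildKVMMetricKey(m.Name, m.Labels) // "cortex_kvm_project_instances|node001-bb01|project-1|flavor-1|az1"
+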
+func buildKVMHypervisorClient(t *testing.T, hvs []hv1.Hypervisor) *fake.ClientBuilder { + t.Helper() + s := runtime.NewScheme() + if err := hv1.AddToScheme(s); err != nil { + t.Fatalf("failed to add hv1 scheme: %v", err) + } + var objects []runtime.Object + for i := range hvs { + objects = append(objects, &hvs[i]) + } + return fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(objects...) +} + +func TestKVMProjectUtilizationKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + kpi := &KVMProjectUtilizationKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} + +func TestKVMProjectUtilizationKPI_getKVMHosts(t *testing.T) { + hvs := []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "node002-bb01"}}, + } + + clientBuilder := buildKVMHypervisorClient(t, hvs) + kpi := &KVMProjectUtilizationKPI{} + kpi.Client = clientBuilder.Build() + + hostMapping, err := kpi.getKVMHosts() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if len(hostMapping) != len(hvs) { + t.Fatalf("expected %d hosts, got %d", len(hvs), len(hostMapping)) + } + for _, hv := range hvs { + host, ok := hostMapping[hv.Name] + if !ok { + t.Fatalf("expected host %s not found in mapping", hv.Name) + } + if host.Name != hv.Name { + t.Errorf("host name mismatch: expected %s, got %s", hv.Name, host.Name) + } + } +} + +func TestKVMProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { + tests := []struct { + name string + servers []nova.Server + projects []identity.Project + domains []identity.Domain + expectedCounts map[string]kvmProjectInstanceCount + }{ + { + name: "single instance in one project", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + expectedCounts: map[string]kvmProjectInstanceCount{ + "project-1|node001-bb01|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + }, + }, + { + name: "multiple instances across projects and hosts", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-3", TenantID: "project-2", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + {ID: "server-4", TenantID: "project-2", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + }, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + expectedCounts: map[string]kvmProjectInstanceCount{ + "project-1|node001-bb01|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: 
"domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + "project-1|node001-bb01|flavor-2|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", FlavorName: "flavor-2", AvailabilityZone: "az1", InstanceCount: 1}, + "project-2|node002-bb02|flavor-1|az2": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node002-bb02", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 1}, + "project-2|node002-bb02|flavor-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node002-bb02", FlavorName: "flavor-2", AvailabilityZone: "az2", InstanceCount: 1}, + }, + }, + { + name: "instances on non-KVM hosts are excluded", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + expectedCounts: map[string]kvmProjectInstanceCount{ + "project-1|node001-bb01|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + }, + }, + { + name: "instances with non-ACTIVE status are excluded", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"}, + {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-2", Status: "ERROR", OSEXTAvailabilityZone: "az1"}, + {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-3", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + expectedCounts: map[string]kvmProjectInstanceCount{ + "project-1|node001-bb01|flavor-3|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", FlavorName: "flavor-3", AvailabilityZone: "az1", InstanceCount: 1}, + }, + }, + { + name: "multiple instances with same key are counted correctly", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + {ID: "server-4", TenantID: "project-1", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", 
Name: "Domain One"}}, + expectedCounts: map[string]kvmProjectInstanceCount{ + "project-1|node001-bb01|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 2}, + "project-1|node002-bb02|flavor-1|az2": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node002-bb02", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 2}, + }, + }, + { + name: "project references non-existent domain results in empty domain fields", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + expectedCounts: map[string]kvmProjectInstanceCount{ + // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry + "project-1|node001-bb01|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-unknown", DomainName: "", ComputeHost: "node001-bb01", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + }, + }, + { + name: "missing project entry results in empty project_name and domain", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{}, + domains: []identity.Domain{}, + expectedCounts: map[string]kvmProjectInstanceCount{ + "project-1|node001-bb01|flavor-1|az1": {ProjectID: "project-1", ProjectName: "", DomainID: "", DomainName: "", ComputeHost: "node001-bb01", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + }, + }, + { + name: "no instances returns empty result", + servers: []nova.Server{}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + expectedCounts: map[string]kvmProjectInstanceCount{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), + ); err != nil { + t.Fatalf("failed to create tables: %v", err) + } + + var mockData []any + for i := range tt.servers { + mockData = append(mockData, &tt.servers[i]) + } + for i := range tt.projects { + mockData = append(mockData, &tt.projects[i]) + } + for i := range tt.domains { + mockData = append(mockData, &tt.domains[i]) + } + if len(mockData) > 0 { + if err := testDB.Insert(mockData...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + } + + client := buildKVMHypervisorClient(t, []hv1.Hypervisor{}) + kpi := &KVMProjectUtilizationKPI{} + if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + counts, err := kpi.queryProjectInstanceCount() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if len(counts) != len(tt.expectedCounts) { + t.Fatalf("expected %d counts, got %d", len(tt.expectedCounts), len(counts)) + 
} + for _, got := range counts { + key := got.ProjectID + "|" + got.ComputeHost + "|" + got.FlavorName + "|" + got.AvailabilityZone + exp, ok := tt.expectedCounts[key] + if !ok { + t.Errorf("unexpected count for key %q: %+v", key, got) + continue + } + if got != exp { + t.Errorf("count mismatch for key %q: expected %+v, got %+v", key, exp, got) + } + } + }) + } +} + +func TestKVMProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { + tests := []struct { + name string + servers []nova.Server + projects []identity.Project + domains []identity.Domain + flavors []nova.Flavor + expectedUsages map[string]kvmProjectCapacityUsage + }{ + { + name: "single instance with flavor details", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{ + "project-1|node001-bb01|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + }, + }, + { + name: "multiple instances with different flavors and projects", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-3", TenantID: "project-2", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + }, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, + {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2}, + }, + expectedUsages: map[string]kvmProjectCapacityUsage{ + "project-1|node001-bb01|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", AvailabilityZone: "az1", TotalVCPUs: 6, TotalRAMMB: 12288, TotalDiskGB: 3}, + "project-2|node002-bb02|az2": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node002-bb02", AvailabilityZone: "az2", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + }, + }, + { + name: "missing flavor entry results in zero capacity", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-missing", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{ + "project-1|node001-bb01|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: 
"domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", AvailabilityZone: "az1", TotalVCPUs: 0, TotalRAMMB: 0, TotalDiskGB: 0}, + }, + }, + { + name: "instances on non-KVM hosts are excluded", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{}, + }, + { + name: "instances with non-ACTIVE status are excluded", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{}, + }, + { + name: "no instances returns empty capacity usage", + servers: []nova.Server{}, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{}, + }, + { + name: "multiple instances with same flavor aggregate capacity correctly", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{ + "project-1|node001-bb01|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", AvailabilityZone: "az1", TotalVCPUs: 4, TotalRAMMB: 8192, TotalDiskGB: 2}, + }, + }, + { + name: "project references non-existent domain results in empty domain fields", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{ + // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry + "project-1|node001-bb01|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-unknown", DomainName: "", ComputeHost: "node001-bb01", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + }, + }, + { + name: "missing project 
entry results in empty project_name and domain", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]kvmProjectCapacityUsage{ + "project-1|node001-bb01|az1": {ProjectID: "project-1", ProjectName: "", DomainID: "", DomainName: "", ComputeHost: "node001-bb01", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("failed to create tables: %v", err) + } + + var mockData []any + for i := range tt.servers { + mockData = append(mockData, &tt.servers[i]) + } + for i := range tt.projects { + mockData = append(mockData, &tt.projects[i]) + } + for i := range tt.domains { + mockData = append(mockData, &tt.domains[i]) + } + for i := range tt.flavors { + mockData = append(mockData, &tt.flavors[i]) + } + if len(mockData) > 0 { + if err := testDB.Insert(mockData...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + } + + client := buildKVMHypervisorClient(t, []hv1.Hypervisor{}) + kpi := &KVMProjectUtilizationKPI{} + if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + usages, err := kpi.queryProjectCapacityUsage() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if len(usages) != len(tt.expectedUsages) { + t.Fatalf("expected %d usages, got %d", len(tt.expectedUsages), len(usages)) + } + for _, got := range usages { + key := got.ProjectID + "|" + got.ComputeHost + "|" + got.AvailabilityZone + exp, ok := tt.expectedUsages[key] + if !ok { + t.Errorf("unexpected usage for key %q: %+v", key, got) + continue + } + if got != exp { + t.Errorf("usage mismatch for key %q: expected %+v, got %+v", key, exp, got) + } + } + }) + } +} + +func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { + tests := []struct { + name string + servers []nova.Server + projects []identity.Project + domains []identity.Domain + flavors []nova.Flavor + hypervisors []hv1.Hypervisor + expectedMetrics []collectedKVMMetric + }{ + { + name: "single instance in one project", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), + 
kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), + }, + }, + { + name: "multiple instances across hosts, projects, and flavors", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "s3", TenantID: "project-2", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + }, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, + {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2}, + }, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + {ObjectMeta: metav1.ObjectMeta{Name: "node002-bb02", Labels: map[string]string{"topology.kubernetes.io/zone": "az2"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-2", 1), + kvmInstanceMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "flavor-1", 1), + // node001-bb01/project-1: 1*flavor-1 + 1*flavor-2 + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 6), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 12288*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 3*1024*1024*1024), + // node002-bb02/project-2: 1*flavor-1 + kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "vcpu", 2), + kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "memory", 4096*1024*1024), + kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "disk", 1*1024*1024*1024), + }, + }, + { + name: "non-KVM hosts are excluded from metrics", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", 
"domain-1", "Domain One", "flavor-1", 1), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), + }, + }, + { + name: "DELETED and ERROR instances are excluded", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-2", Status: "ERROR", OSEXTAvailabilityZone: "az1"}, + {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-3", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, + {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2}, + {ID: "f3", Name: "flavor-3", VCPUs: 8, RAM: 16384, Disk: 4}, + }, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-3", 1), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 8), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 16384*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 4*1024*1024*1024), + }, + }, + { + name: "multiple instances with same flavor are aggregated correctly", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + {ID: "s4", TenantID: "project-1", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + {ObjectMeta: metav1.ObjectMeta{Name: "node002-bb02", Labels: map[string]string{"topology.kubernetes.io/zone": "az2"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), + kvmInstanceMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain 
One", "vcpu", 4), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), + kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), + kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), + }, + }, + { + name: "compute host not in hypervisor list produces no metrics", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hypervisors: []hv1.Hypervisor{}, + expectedMetrics: []collectedKVMMetric{}, + }, + { + name: "project references non-existent domain results in empty domain labels", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "flavor-1", 1), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "vcpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "disk", 1*1024*1024*1024), + }, + }, + { + name: "missing project entry results in empty project_name and domain labels", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "", "", "", "flavor-1", 1), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "vcpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "disk", 1*1024*1024*1024), + }, + }, + { + name: "missing flavor entry results in zero capacity", + servers: []nova.Server{ 
+ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "flavor-missing", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-missing", 1), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 0), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 0), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 0), + }, + }, + { + name: "no instances produces no metrics", + servers: []nova.Server{}, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, + }, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("failed to create tables: %v", err) + } + + var mockData []any + for i := range tt.servers { + mockData = append(mockData, &tt.servers[i]) + } + for i := range tt.projects { + mockData = append(mockData, &tt.projects[i]) + } + for i := range tt.domains { + mockData = append(mockData, &tt.domains[i]) + } + for i := range tt.flavors { + mockData = append(mockData, &tt.flavors[i]) + } + if len(mockData) > 0 { + if err := testDB.Insert(mockData...); err != nil { + t.Fatalf("expected no error inserting data, got %v", err) + } + } + + client := buildKVMHypervisorClient(t, tt.hypervisors) + kpi := &KVMProjectUtilizationKPI{} + if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + actual := make(map[string]collectedKVMMetric) + for m := range ch { + var pm prometheusgo.Metric + if err := m.Write(&pm); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + labels := make(map[string]string) + for _, lbl := range pm.Label { + labels[lbl.GetName()] = lbl.GetValue() + } + name := getMetricName(m.Desc().String()) + key := buildKVMMetricKey(name, labels) + if _, exists := actual[key]; exists { + t.Fatalf("duplicate metric key %q", key) + } + actual[key] = collectedKVMMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()} + } + + if len(actual) != len(tt.expectedMetrics) { + t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual) + } + for _, exp := range tt.expectedMetrics { + key := 
buildKVMMetricKey(exp.Name, exp.Labels) + got, ok := actual[key] + if !ok { + t.Errorf("missing metric %q", key) + continue + } + if got.Value != exp.Value { + t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value) + } + if !reflect.DeepEqual(exp.Labels, got.Labels) { + t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels) + } + } + }) + } +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go index 62eb44e9c..079dbf006 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared.go @@ -7,8 +7,11 @@ import ( "fmt" "regexp" "strconv" + "strings" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" ) const ( @@ -18,6 +21,7 @@ const ( hypervisorFamilyVMware = "vmware" vmwareComputeHostPattern = "nova-compute-%" vmwareIronicComputeHostPattern = "nova-compute-ironic-%" + kvmComputeHostPattern = "node%-bb%" ) // vmwareHost wraps HostDetails with Prometheus metric helpers. @@ -63,6 +67,105 @@ var vmwareHostLabels = []string{ "pinned_project_ids", } +var kvmHostLabels = []string{ + "compute_host", + "availability_zone", + "building_block", + "cpu_architecture", + "workload_type", + "enabled", + "decommissioned", + "external_customer", + "maintenance", + "os_version", +} + +type kvmHost struct { + hv1.Hypervisor +} + +func (h kvmHost) getHostLabels() []string { + decommissioned := false + externalCustomer := false + workloadType := "general-purpose" + cpuArchitecture := "cascade-lake" + + availabilityZone := h.Labels["topology.kubernetes.io/zone"] + if availabilityZone == "" { + availabilityZone = "unknown" + } + + buildingBlock := "unknown" + // Assuming hypervisor names are in the format nodeXXX-bbYY + parts := strings.Split(h.Name, "-") + if len(parts) > 1 { + buildingBlock = parts[1] + } + + osVersion := h.Spec.OperatingSystemVersion + if osVersion == "" { + osVersion = "unknown" + } + + for _, trait := range h.Status.Traits { + switch trait { + case "CUSTOM_HW_SAPPHIRE_RAPIDS": + cpuArchitecture = "sapphire-rapids" + case "CUSTOM_HANA_EXCLUSIVE_HOST": + workloadType = "hana" + case "CUSTOM_DECOMMISSIONING": + decommissioned = true + case "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE": + externalCustomer = true + } + } + + maintenance := h.Spec.Maintenance != hv1.MaintenanceUnset + + return []string{ + h.Name, + availabilityZone, + buildingBlock, + cpuArchitecture, + workloadType, + strconv.FormatBool(true), + strconv.FormatBool(decommissioned), + strconv.FormatBool(externalCustomer), + strconv.FormatBool(maintenance), + osVersion, + } +} + +// getResourceCapacity attempts to retrieve the effective capacity for the specified resource from the hypervisor status, falling back to the physical capacity if effective capacity is not available. It returns the capacity quantity and a boolean indicating whether any capacity information was found. 
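+// Zero-valued quantities are treated as absent, so a zero effective capacity falls through to the physical capacity.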
+func (k kvmHost) getResourceCapacity(resourceName hv1.ResourceName) (capacity resource.Quantity, ok bool) { + if k.Status.EffectiveCapacity != nil { + qty, exists := k.Status.EffectiveCapacity[resourceName] + if exists && !qty.IsZero() { + return qty, true + } + } + if k.Status.Capacity == nil { + return resource.Quantity{}, false + } + qty, exists := k.Status.Capacity[resourceName] + if !exists || qty.IsZero() { + return resource.Quantity{}, false + } + return qty, true +} + +func (k kvmHost) getResourceAllocation(resourceName hv1.ResourceName) (allocation resource.Quantity) { + if k.Status.Allocation == nil { + return resource.MustParse("0") + } + + qty, exists := k.Status.Allocation[resourceName] + if !exists { + return resource.MustParse("0") + } + return qty +} + var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`) func getMetricName(desc string) string { diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go index 351fedc50..302023ef2 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go @@ -4,11 +4,34 @@ package infrastructure import ( + "strings" "testing" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func mockKVMHostLabels(host, az string) map[string]string { + bb := "unknown" + parts := strings.Split(host, "-") + if len(parts) > 1 { + bb = parts[1] + } + return map[string]string{ + "compute_host": host, + "availability_zone": az, + "building_block": bb, + "cpu_architecture": "cascade-lake", + "workload_type": "general-purpose", + "enabled": "true", + "decommissioned": "false", + "external_customer": "false", + "maintenance": "false", + "os_version": "unknown", + } +} + func mockVMwareHostLabels(computeHost, az string) map[string]string { return map[string]string{ "availability_zone": az, @@ -24,7 +47,7 @@ func mockVMwareHostLabels(computeHost, az string) map[string]string { } } -func TestVMwareHostGetHostLabels(t *testing.T) { +func TestVMwareHost_GetHostLabels(t *testing.T) { str := func(s string) *string { return &s } tests := []struct { @@ -92,6 +115,118 @@ func TestVMwareHostGetHostLabels(t *testing.T) { } } +func TestKVMHost_GetHostLabels(t *testing.T) { + tests := []struct { + name string + host kvmHost + want []string + }{ + { + name: "defaults with no traits and no labels", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "cascade-lake", "general-purpose", "true", "false", "false", "false"}, + }, + { + name: "availability zone from label", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node001-bb01", + Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}, + }, + }}, + want: []string{"node001-bb01", "az1", "bb01", "cascade-lake", "general-purpose", "true", "false", "false", "false"}, + }, + { + name: "name without dash results in unknown building block", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "nodewithoutdash"}, + }}, + want: []string{"nodewithoutdash", "unknown", "unknown", "cascade-lake", "general-purpose", "true", "false", "false", "false"}, + }, + { + name: "sapphire rapids trait", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + Status: 
hv1.HypervisorStatus{Traits: []string{"CUSTOM_HW_SAPPHIRE_RAPIDS"}}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "sapphire-rapids", "general-purpose", "true", "false", "false", "false"}, + }, + { + name: "hana exclusive host trait", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + Status: hv1.HypervisorStatus{Traits: []string{"CUSTOM_HANA_EXCLUSIVE_HOST"}}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "cascade-lake", "hana", "true", "false", "false", "false"}, + }, + { + name: "decommissioning trait", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + Status: hv1.HypervisorStatus{Traits: []string{"CUSTOM_DECOMMISSIONING"}}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "cascade-lake", "general-purpose", "true", "true", "false", "false"}, + }, + { + name: "external customer exclusive trait", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + Status: hv1.HypervisorStatus{Traits: []string{"CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE"}}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "cascade-lake", "general-purpose", "true", "false", "true", "false"}, + }, + { + name: "maintenance set", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + Spec: hv1.HypervisorSpec{Maintenance: hv1.MaintenanceManual}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "cascade-lake", "general-purpose", "true", "false", "false", "true"}, + }, + { + name: "all traits and maintenance set", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node001-bb42", + Labels: map[string]string{"topology.kubernetes.io/zone": "az3"}, + }, + Spec: hv1.HypervisorSpec{Maintenance: hv1.MaintenanceAuto}, + Status: hv1.HypervisorStatus{Traits: []string{ + "CUSTOM_HW_SAPPHIRE_RAPIDS", + "CUSTOM_HANA_EXCLUSIVE_HOST", + "CUSTOM_DECOMMISSIONING", + "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE", + }}, + }}, + want: []string{"node001-bb42", "az3", "bb42", "sapphire-rapids", "hana", "true", "true", "true", "true"}, + }, + { + name: "os version set", + host: kvmHost{hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}, + Spec: hv1.HypervisorSpec{OperatingSystemVersion: "1.1.1"}, + }}, + want: []string{"node001-bb01", "unknown", "bb01", "cascade-lake", "general-purpose", "true", "false", "false", "false", "1.1.1"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.host.getHostLabels() + if len(got) != len(kvmHostLabels) { + t.Fatalf("getHostLabels() returned %d values, want %d (matching kvmHostLabels)", len(got), len(kvmHostLabels)) + } + for i, want := range tt.want { + if got[i] != want { + t.Errorf("label[%d] (%s) = %q, want %q", i, kvmHostLabels[i], got[i], want) + } + } + }) + } +} + func TestIsKVMFlavor(t *testing.T) { tests := []struct { flavor string diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go similarity index 71% rename from internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go rename to internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go index 0d3d5d3ed..14744b205 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go @@ -7,6 +7,7 @@ import ( "log/slog" "strings" + 
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes" "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" @@ -24,7 +25,7 @@ import ( // For general purpose workloads its not possible to differentiate the cpu architecture. To avoid weird behavior in a dashboard we don't export this label for the metric. // For HANA flavors the cpu architecture is part of the flavor name (_v2 suffix for sapphire rapids, without suffix for cascade lake). // For both types of workload however we can not determine on which host the commitment is fulfilled. -type VMwareResourceCommitmentsKPI struct { +type VMwareProjectCommitmentsKPI struct { // BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client. plugins.BaseKPI[struct{}] @@ -32,11 +33,11 @@ type VMwareResourceCommitmentsKPI struct { unusedHanaCommittedResourcesPerProject *prometheus.Desc } -func (k *VMwareResourceCommitmentsKPI) GetName() string { - return "vmware_resource_commitments_kpi" +func (k *VMwareProjectCommitmentsKPI) GetName() string { + return "vmware_project_commitments_kpi" } -func (k *VMwareResourceCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error { +func (k *VMwareProjectCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error { if err := k.BaseKPI.Init(dbConn, c, opts); err != nil { return err } @@ -44,38 +45,44 @@ func (k *VMwareResourceCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts k.unusedGeneralPurposeCommitmentsPerProject = prometheus.NewDesc( "cortex_vmware_commitments_general_purpose", "Committed general purpose resources that are currently unused. CPU (resource=cpu) in vCPUs, memory (resource=ram) in bytes.", - []string{"availability_zone", "resource", "project_id"}, nil, + []string{"availability_zone", "resource", "project_id", "project_name", "domain_id", "domain_name"}, nil, ) k.unusedHanaCommittedResourcesPerProject = prometheus.NewDesc( "cortex_vmware_commitments_hana_resources", "Total committed HANA instances capacity that is currently unused, translated to resources. 
CPU in vCPUs, memory and disk in bytes.", - []string{"availability_zone", "cpu_architecture", "resource", "project_id"}, nil, + []string{"availability_zone", "cpu_architecture", "resource", "project_id", "project_name", "domain_id", "domain_name"}, nil, ) return nil } -func (k *VMwareResourceCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) { +func (k *VMwareProjectCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.unusedGeneralPurposeCommitmentsPerProject ch <- k.unusedHanaCommittedResourcesPerProject } -func (k *VMwareResourceCommitmentsKPI) Collect(ch chan<- prometheus.Metric) { +func (k *VMwareProjectCommitmentsKPI) Collect(ch chan<- prometheus.Metric) { if k.DB == nil { return } flavorsByName, err := k.getFlavorsByName() if err != nil { - slog.Error("vmware_resource_commitments: failed to load flavors", "err", err) + slog.Error("vmware_project_commitments: failed to load flavors", "err", err) return } - k.collectGeneralPurpose(ch, flavorsByName) - k.collectHana(ch, flavorsByName) + projects, err := k.getProjectsWithDomains() + if err != nil { + slog.Error("vmware_project_commitments: failed to load projects with domains", "err", err) + return + } + + k.collectGeneralPurpose(ch, flavorsByName, projects) + k.collectHana(ch, flavorsByName, projects) } // getFlavorsByName loads all flavors and returns them keyed by name. -func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) { +func (k *VMwareProjectCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) { var flavors []nova.Flavor if _, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()); err != nil { return nil, err @@ -88,7 +95,7 @@ func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavo } // getGeneralPurposeCommitments loads confirmed/guaranteed cores and ram commitments. -func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.Commitment, error) { +func (k *VMwareProjectCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.Commitment, error) { var commitments []limes.Commitment if _, err := k.DB.Select(&commitments, ` SELECT * FROM `+limes.Commitment{}.TableName()+` @@ -103,7 +110,7 @@ func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.C // getGeneralPurposeServers loads running non-HANA servers for general purpose usage accounting. // KVM-specific flavors are filtered out in Go since SQL LIKE cannot express the segment-exact pattern. -func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server, error) { +func (k *VMwareProjectCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server, error) { var servers []nova.Server if _, err := k.DB.Select(&servers, ` SELECT * FROM `+nova.Server{}.TableName()+` @@ -122,7 +129,7 @@ func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server } // getHanaInstanceCommitments loads confirmed/guaranteed HANA instance commitments. -func (k *VMwareResourceCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Commitment, error) { +func (k *VMwareProjectCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Commitment, error) { var commitments []limes.Commitment if _, err := k.DB.Select(&commitments, ` SELECT * FROM `+limes.Commitment{}.TableName()+` @@ -136,7 +143,7 @@ func (k *VMwareResourceCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Com } // getRunningHanaServers loads all running HANA VMware servers (KVM HANA flavors excluded in Go). 
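+// "Running" excludes servers in DELETED or ERROR status.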
-func (k *VMwareResourceCommitmentsKPI) getRunningHanaServers() ([]nova.Server, error) { +func (k *VMwareProjectCommitmentsKPI) getRunningHanaServers() ([]nova.Server, error) { var servers []nova.Server if _, err := k.DB.Select(&servers, ` SELECT * FROM `+nova.Server{}.TableName()+` @@ -156,15 +163,15 @@ func (k *VMwareResourceCommitmentsKPI) getRunningHanaServers() ([]nova.Server, e // collectGeneralPurpose computes and emits unused general purpose committed resources per project. // Unused = committed - in-use (clamped to zero; zero values are not emitted). -func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) { +func (k *VMwareProjectCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor, projects map[string]projectWithDomain) { commitments, err := k.getGeneralPurposeCommitments() if err != nil { - slog.Error("vmware_resource_commitments: failed to load gp commitments", "err", err) + slog.Error("vmware_project_commitments: failed to load gp commitments", "err", err) return } servers, err := k.getGeneralPurposeServers() if err != nil { - slog.Error("vmware_resource_commitments: failed to load gp servers", "err", err) + slog.Error("vmware_project_commitments: failed to load gp servers", "err", err) return } @@ -178,7 +185,7 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu case "ram": bytes, err := bytesFromUnit(float64(c.Amount), c.Unit) if err != nil { - slog.Warn("vmware_resource_commitments: unknown ram unit", "unit", c.Unit, "err", err) + slog.Warn("vmware_project_commitments: unknown ram unit", "unit", c.Unit, "err", err) continue } committed[gpKey{c.ProjectID, c.AvailabilityZone, "ram"}] += bytes @@ -189,7 +196,7 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu for _, s := range servers { flavor, ok := flavorsByName[s.FlavorName] if !ok { - slog.Warn("vmware_resource_commitments: gp flavor not found", "flavor", s.FlavorName) + slog.Warn("vmware_project_commitments: gp flavor not found", "flavor", s.FlavorName) continue } used[gpKey{s.TenantID, s.OSEXTAvailabilityZone, "cpu"}] += float64(flavor.VCPUs) @@ -201,11 +208,12 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu if unused <= 0 { continue } + project := projects[key.projectID] ch <- prometheus.MustNewConstMetric( k.unusedGeneralPurposeCommitmentsPerProject, prometheus.GaugeValue, unused, - key.az, key.resource, key.projectID, + key.az, key.resource, key.projectID, project.ProjectName, project.DomainID, project.DomainName, ) } } @@ -213,7 +221,7 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu // collectHana computes and emits unused committed HANA instance resources per project. // Each HANA instance commitment is compared against running servers; the remainder is // translated to cpu/ram/disk capacity using the flavor spec. 
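+// Flavors with a _v2 suffix are attributed to sapphire-rapids; all others to cascade-lake.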
-func (k *VMwareResourceCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) { +func (k *VMwareProjectCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor, projects map[string]projectWithDomain) { commitments, err := k.getHanaInstanceCommitments() if err != nil { slog.Error("vmware_resource_commitments: failed to load hana commitments", "err", err) @@ -261,11 +269,36 @@ func (k *VMwareResourceCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, } for key, value := range totals { + project := projects[key.projectID] ch <- prometheus.MustNewConstMetric( k.unusedHanaCommittedResourcesPerProject, prometheus.GaugeValue, value, - key.az, key.cpuArch, key.resource, key.projectID, + key.az, key.cpuArch, key.resource, key.projectID, project.ProjectName, project.DomainID, project.DomainName, ) } } + +type projectWithDomain struct { + ProjectID string `db:"project_id"` + ProjectName string `db:"project_name"` + DomainID string `db:"domain_id"` + DomainName string `db:"domain_name"` +} + +func (k *VMwareProjectCommitmentsKPI) getProjectsWithDomains() (map[string]projectWithDomain, error) { + var projects []projectWithDomain + if _, err := k.DB.Select(&projects, ` + SELECT p.id AS project_id, p.name AS project_name, COALESCE(d.id, '') AS domain_id, COALESCE(d.name, '') AS domain_name + FROM `+identity.Project{}.TableName()+` p + LEFT JOIN `+identity.Domain{}.TableName()+` d ON p.domain_id = d.id + `); err != nil { + return nil, err + } + + projectMap := make(map[string]projectWithDomain, len(projects)) + for _, p := range projects { + projectMap[p.ProjectID] = p + } + return projectMap, nil +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go similarity index 68% rename from internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go rename to internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go index 6616dc558..8978f76a9 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go @@ -6,6 +6,7 @@ package infrastructure import ( "testing" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes" "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" @@ -15,7 +16,7 @@ import ( prometheusgo "github.com/prometheus/client_model/go" ) -func setupResourceCommitmentsDB(t *testing.T) (testDB *db.DB, cleanup func()) { +func setupProjectCommitmentsDB(t *testing.T) (testDB *db.DB, cleanup func()) { t.Helper() dbEnv := testlibDB.SetupDBEnv(t) testDB = &db.DB{DbMap: dbEnv.DbMap} @@ -23,18 +24,20 @@ func setupResourceCommitmentsDB(t *testing.T) (testDB *db.DB, cleanup func()) { testDB.AddTable(limes.Commitment{}), testDB.AddTable(nova.Server{}), testDB.AddTable(nova.Flavor{}), + testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), ); err != nil { t.Fatalf("failed to create tables: %v", err) } return testDB, dbEnv.Close } -// collectResourceCommitmentsMetrics runs the KPI and returns all emitted metrics keyed by -// "metricName|az|cpu_architecture|resource|project_id". 
GP metrics have an empty cpu_architecture -// segment since the descriptor does not include that label. -func collectResourceCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]float64 { +// collectProjectCommitmentsMetrics runs the KPI and returns all emitted metrics keyed by +// "metricName|az|cpu_architecture|resource|project_id|project_name|domain_id|domain_name". +// GP metrics have an empty cpu_architecture segment since the descriptor does not include that label. +func collectProjectCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]float64 { t.Helper() - kpi := &VMwareResourceCommitmentsKPI{} + kpi := &VMwareProjectCommitmentsKPI{} if err := kpi.Init(testDB, nil, conf.NewRawOpts("{}")); err != nil { t.Fatalf("failed to init KPI: %v", err) } @@ -53,7 +56,7 @@ func collectResourceCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]f lbls[lp.GetName()] = lp.GetValue() } name := getMetricName(m.Desc().String()) - key := name + "|" + lbls["availability_zone"] + "|" + lbls["cpu_architecture"] + "|" + lbls["resource"] + "|" + lbls["project_id"] + key := name + "|" + lbls["availability_zone"] + "|" + lbls["cpu_architecture"] + "|" + lbls["resource"] + "|" + lbls["project_id"] + "|" + lbls["project_name"] + "|" + lbls["domain_id"] + "|" + lbls["domain_name"] result[key] = pm.GetGauge().GetValue() } return result @@ -61,30 +64,40 @@ func collectResourceCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]f // gpKey builds the expected map key for a general-purpose metric. // cpu_architecture is always empty because the GP metric descriptor omits that label. -func gpKey(az, resource, projectID string) string { - return "cortex_vmware_commitments_general_purpose|" + az + "||" + resource + "|" + projectID +func gpKey(az, resource string, p projectWithDomain) string { + return "cortex_vmware_commitments_general_purpose|" + az + "||" + resource + "|" + p.ProjectID + "|" + p.ProjectName + "|" + p.DomainID + "|" + p.DomainName } // hKey builds the expected map key for a HANA metric. -func hKey(az, cpuArch, resource, projectID string) string { - return "cortex_vmware_commitments_hana_resources|" + az + "|" + cpuArch + "|" + resource + "|" + projectID +func hKey(az, cpuArch, resource string, p projectWithDomain) string { + return "cortex_vmware_commitments_hana_resources|" + az + "|" + cpuArch + "|" + resource + "|" + p.ProjectID + "|" + p.ProjectName + "|" + p.DomainID + "|" + p.DomainName } -func TestVMwareResourceCommitmentsKPI_Init(t *testing.T) { +func TestVMwareProjectCommitmentsKPI_Init(t *testing.T) { dbEnv := testlibDB.SetupDBEnv(t) testDB := db.DB{DbMap: dbEnv.DbMap} defer dbEnv.Close() - kpi := &VMwareResourceCommitmentsKPI{} + kpi := &VMwareProjectCommitmentsKPI{} if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { t.Fatalf("expected no error, got %v", err) } } -func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { + +func TestVMwareProjectCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { + // Reusable project/domain entries for test cases that need them. 
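+	// pd1 and pd2 mirror the project and domain label values the KPI is expected to emit for p1 and p2.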
+ p1 := identity.Project{ID: "p1", Name: "project-one", DomainID: "d1", Enabled: true} + p2 := identity.Project{ID: "p2", Name: "project-two", DomainID: "d1", Enabled: true} + d1 := identity.Domain{ID: "d1", Name: "domain-one", Enabled: true} + pd1 := projectWithDomain{ProjectID: "p1", ProjectName: "project-one", DomainID: "d1", DomainName: "domain-one"} + pd2 := projectWithDomain{ProjectID: "p2", ProjectName: "project-two", DomainID: "d1", DomainName: "domain-one"} + tests := []struct { name string commitments []limes.Commitment servers []nova.Server flavors []nova.Flavor + projects []identity.Project + domains []identity.Domain want map[string]float64 }{ { @@ -96,8 +109,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { commitments: []limes.Commitment{ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 10, + gpKey("az1", "cpu", pd1): 10, }, }, { @@ -105,8 +120,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { commitments: []limes.Commitment{ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 1024, Unit: "MiB", Status: "confirmed", ProjectID: "p1"}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "ram", "p1"): 1024 * 1024 * 1024, + gpKey("az1", "ram", pd1): 1024 * 1024 * 1024, }, }, { @@ -114,8 +131,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { commitments: []limes.Commitment{ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 2, Unit: "GiB", Status: "confirmed", ProjectID: "p1"}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "ram", "p1"): 2 * 1024 * 1024 * 1024, + gpKey("az1", "ram", pd1): 2 * 1024 * 1024 * 1024, }, }, { @@ -130,8 +149,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "small", VCPUs: 3, RAM: 0, Disk: 0}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 4, // 10 - 2×3 = 4 + gpKey("az1", "cpu", pd1): 4, // 10 - 2×3 = 4 }, }, { @@ -145,7 +166,9 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "small", VCPUs: 4, RAM: 0, Disk: 0}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "over-used cpu produces no metric", @@ -158,7 +181,9 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "large", VCPUs: 8, RAM: 0, Disk: 0}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "hana servers not counted against gp commitments", @@ -171,8 +196,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_small", VCPUs: 8, RAM: 0, Disk: 0}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 10, + gpKey("az1", "cpu", pd1): 10, }, }, { 
@@ -186,8 +213,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "m1_k_small", VCPUs: 4, RAM: 0, Disk: 0}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 10, + gpKey("az1", "cpu", pd1): 10, }, }, { @@ -203,8 +232,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "small", VCPUs: 2, RAM: 0, Disk: 0}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 8, // only 1 ACTIVE × 2 subtracted + gpKey("az1", "cpu", pd1): 8, // only 1 ACTIVE × 2 subtracted }, }, { @@ -212,8 +243,10 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { commitments: []limes.Commitment{ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 5, Status: "guaranteed", ProjectID: "p1"}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 5, + gpKey("az1", "cpu", pd1): 5, }, }, { @@ -221,14 +254,18 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { commitments: []limes.Commitment{ {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 100, Status: "pending", ProjectID: "p1"}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "non-compute service type excluded", commitments: []limes.Commitment{ {ID: 1, UUID: "c1", ServiceType: "network", ResourceName: "cores", AvailabilityZone: "az1", Amount: 100, Status: "confirmed", ProjectID: "p1"}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "multiple commitments per project and AZ summed", @@ -238,10 +275,12 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { {ID: 3, UUID: "c3", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az2", Amount: 20, Status: "confirmed", ProjectID: "p1"}, {ID: 4, UUID: "c4", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 8, Status: "confirmed", ProjectID: "p2"}, }, + projects: []identity.Project{p1, p2}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 15, - gpKey("az2", "cpu", "p1"): 20, - gpKey("az1", "cpu", "p2"): 8, + gpKey("az1", "cpu", pd1): 15, + gpKey("az2", "cpu", pd1): 20, + gpKey("az1", "cpu", pd2): 8, }, }, { @@ -256,16 +295,18 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "medium", VCPUs: 2, RAM: 256, Disk: 0}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - gpKey("az1", "cpu", "p1"): 6, // 8 - 1×2 - gpKey("az1", "ram", "p1"): (512 - 256) * 1024 * 1024, // 512MiB - 256MB (flavor.RAM is in MB) + gpKey("az1", "cpu", pd1): 6, // 8 - 1×2 + gpKey("az1", "ram", pd1): (512 - 256) * 1024 * 1024, // 512MiB - 256MB (flavor.RAM is in MB) }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - testDB, cleanup := setupResourceCommitmentsDB(t) + testDB, cleanup := setupProjectCommitmentsDB(t) defer cleanup() var rows []any @@ -278,13 +319,19 @@ func 
TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { for i := range tt.flavors { rows = append(rows, &tt.flavors[i]) } + for i := range tt.projects { + rows = append(rows, &tt.projects[i]) + } + for i := range tt.domains { + rows = append(rows, &tt.domains[i]) + } if len(rows) > 0 { if err := testDB.Insert(rows...); err != nil { t.Fatalf("failed to insert test data: %v", err) } } - got := collectResourceCommitmentsMetrics(t, testDB) + got := collectProjectCommitmentsMetrics(t, testDB) if len(got) != len(tt.want) { t.Errorf("expected %d metrics, got %d: %v", len(tt.want), len(got), got) @@ -303,12 +350,21 @@ func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { } } -func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { +func TestVMwareProjectCommitmentsKPI_Collect_HANA(t *testing.T) { + // Reusable project/domain entries for test cases that need them. + p1 := identity.Project{ID: "p1", Name: "project-one", DomainID: "d1", Enabled: true} + p2 := identity.Project{ID: "p2", Name: "project-two", DomainID: "d1", Enabled: true} + d1 := identity.Domain{ID: "d1", Name: "domain-one", Enabled: true} + pd1 := projectWithDomain{ProjectID: "p1", ProjectName: "project-one", DomainID: "d1", DomainName: "domain-one"} + pd2 := projectWithDomain{ProjectID: "p2", ProjectName: "project-two", DomainID: "d1", DomainName: "domain-one"} + tests := []struct { name string commitments []limes.Commitment servers []nova.Server flavors []nova.Flavor + projects []identity.Project + domains []identity.Domain want map[string]float64 }{ { @@ -323,10 +379,12 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128, - hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd1): 2 * 128, + hKey("az1", "cascade-lake", "ram", pd1): 2 * 1638400 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd1): 2 * 100 * 1024 * 1024 * 1024, }, }, { @@ -340,10 +398,12 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128, - hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd1): 2 * 128, + hKey("az1", "cascade-lake", "ram", pd1): 2 * 1638400 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd1): 2 * 100 * 1024 * 1024 * 1024, }, }, { @@ -358,7 +418,9 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "over-used hana produces no metric", @@ -372,7 +434,9 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, }, - want: map[string]float64{}, + projects: 
[]identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "sapphire-rapids arch from _v2 suffix", @@ -382,10 +446,12 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_c256_m3200_v2", VCPUs: 256, RAM: 3276800, Disk: 200}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "sapphire-rapids", "cpu", "p1"): 256, - hKey("az1", "sapphire-rapids", "ram", "p1"): 3276800 * 1024 * 1024, - hKey("az1", "sapphire-rapids", "disk", "p1"): 200 * 1024 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "cpu", pd1): 256, + hKey("az1", "sapphire-rapids", "ram", pd1): 3276800 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "disk", pd1): 200 * 1024 * 1024 * 1024, }, }, { @@ -398,13 +464,15 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100}, {ID: "f2", Name: "hana_c128_m1600_v2", VCPUs: 128, RAM: 1638400, Disk: 100}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128, - hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024, - hKey("az1", "sapphire-rapids", "cpu", "p1"): 1 * 128, - hKey("az1", "sapphire-rapids", "ram", "p1"): 1 * 1638400 * 1024 * 1024, - hKey("az1", "sapphire-rapids", "disk", "p1"): 1 * 100 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd1): 2 * 128, + hKey("az1", "cascade-lake", "ram", pd1): 2 * 1638400 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd1): 2 * 100 * 1024 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "cpu", pd1): 1 * 128, + hKey("az1", "sapphire-rapids", "ram", pd1): 1 * 1638400 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "disk", pd1): 1 * 100 * 1024 * 1024 * 1024, }, }, { @@ -416,7 +484,9 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_k_large", VCPUs: 64, RAM: 819200, Disk: 50}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "DELETED and ERROR hana servers excluded from running count", @@ -431,10 +501,12 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 64, // 3 committed - 1 ACTIVE = 2 unused - hKey("az1", "cascade-lake", "ram", "p1"): 2 * 819200 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p1"): 2 * 50 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd1): 2 * 64, // 3 committed - 1 ACTIVE = 2 unused + hKey("az1", "cascade-lake", "ram", pd1): 2 * 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd1): 2 * 50 * 1024 * 1024 * 1024, }, }, { @@ -445,10 +517,12 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, }, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "cascade-lake", "cpu", "p1"): 64, - hKey("az1", "cascade-lake", "ram", "p1"): 819200 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p1"): 50 
* 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd1): 64, + hKey("az1", "cascade-lake", "ram", pd1): 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd1): 50 * 1024 * 1024 * 1024, }, }, { @@ -456,7 +530,9 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { commitments: []limes.Commitment{ {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_nonexistent", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, }, - want: map[string]float64{}, + projects: []identity.Project{p1}, + domains: []identity.Domain{d1}, + want: map[string]float64{}, }, { name: "multiple projects and AZs aggregated per bucket", @@ -468,23 +544,25 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { flavors: []nova.Flavor{ {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, }, + projects: []identity.Project{p1, p2}, + domains: []identity.Domain{d1}, want: map[string]float64{ - hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 64, - hKey("az1", "cascade-lake", "ram", "p1"): 2 * 819200 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p1"): 2 * 50 * 1024 * 1024 * 1024, - hKey("az2", "cascade-lake", "cpu", "p1"): 3 * 64, - hKey("az2", "cascade-lake", "ram", "p1"): 3 * 819200 * 1024 * 1024, - hKey("az2", "cascade-lake", "disk", "p1"): 3 * 50 * 1024 * 1024 * 1024, - hKey("az1", "cascade-lake", "cpu", "p2"): 1 * 64, - hKey("az1", "cascade-lake", "ram", "p2"): 1 * 819200 * 1024 * 1024, - hKey("az1", "cascade-lake", "disk", "p2"): 1 * 50 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd1): 2 * 64, + hKey("az1", "cascade-lake", "ram", pd1): 2 * 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd1): 2 * 50 * 1024 * 1024 * 1024, + hKey("az2", "cascade-lake", "cpu", pd1): 3 * 64, + hKey("az2", "cascade-lake", "ram", pd1): 3 * 819200 * 1024 * 1024, + hKey("az2", "cascade-lake", "disk", pd1): 3 * 50 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", pd2): 1 * 64, + hKey("az1", "cascade-lake", "ram", pd2): 1 * 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", pd2): 1 * 50 * 1024 * 1024 * 1024, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - testDB, cleanup := setupResourceCommitmentsDB(t) + testDB, cleanup := setupProjectCommitmentsDB(t) defer cleanup() var rows []any @@ -497,13 +575,19 @@ func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { for i := range tt.flavors { rows = append(rows, &tt.flavors[i]) } + for i := range tt.projects { + rows = append(rows, &tt.projects[i]) + } + for i := range tt.domains { + rows = append(rows, &tt.domains[i]) + } if len(rows) > 0 { if err := testDB.Insert(rows...); err != nil { t.Fatalf("failed to insert test data: %v", err) } } - got := collectResourceCommitmentsMetrics(t, testDB) + got := collectProjectCommitmentsMetrics(t, testDB) if len(got) != len(tt.want) { t.Errorf("expected %d metrics, got %d: %v", len(tt.want), len(got), got) diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go index 16fcac857..368ebd194 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go @@ -21,6 +21,8 @@ import ( type vmwareProjectInstanceCount struct { ProjectID string `db:"project_id"` ProjectName string `db:"project_name"` + DomainID string `db:"domain_id"` + DomainName string `db:"domain_name"` 
ComputeHost string `db:"compute_host"` FlavorName string `db:"flavor_name"` AvailabilityZone string `db:"availability_zone"` @@ -30,6 +32,8 @@ type vmwareProjectInstanceCount struct { type vmwareProjectCapacityUsage struct { ProjectID string `db:"project_id"` ProjectName string `db:"project_name"` + DomainID string `db:"domain_id"` + DomainName string `db:"domain_name"` ComputeHost string `db:"compute_host"` AvailabilityZone string `db:"availability_zone"` TotalVCPUs float64 `db:"total_vcpus"` @@ -60,12 +64,12 @@ func (k *VMwareProjectUtilizationKPI) Init(dbConn *db.DB, c client.Client, opts k.instanceCountPerProjectAndHostAndFlavor = prometheus.NewDesc( "cortex_vmware_project_instances", "Number of running instances per project, hypervisor, and flavor on VMware.", - append(vmwareHostLabels, "project_id", "project_name", "flavor_name"), nil, + append(vmwareHostLabels, "project_id", "project_name", "domain_id", "domain_name", "flavor_name"), nil, ) k.capacityUsagePerProjectAndHost = prometheus.NewDesc( "cortex_vmware_project_capacity_usage", "Resource capacity used by a project per VMware hypervisor and flavor. CPU in vCPUs, memory and disk in bytes.", - append(vmwareHostLabels, "project_id", "project_name", "resource"), nil, + append(vmwareHostLabels, "project_id", "project_name", "domain_id", "domain_name", "resource"), nil, ) return nil } @@ -96,7 +100,7 @@ func (k *VMwareProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) { continue } hostLabels := host.getHostLabels() - hostLabels = append(hostLabels, projectInstanceCount.ProjectID, projectInstanceCount.ProjectName, projectInstanceCount.FlavorName) + hostLabels = append(hostLabels, projectInstanceCount.ProjectID, projectInstanceCount.ProjectName, projectInstanceCount.DomainID, projectInstanceCount.DomainName, projectInstanceCount.FlavorName) ch <- prometheus.MustNewConstMetric(k.instanceCountPerProjectAndHostAndFlavor, prometheus.GaugeValue, projectInstanceCount.InstanceCount, hostLabels...) } @@ -113,22 +117,11 @@ func (k *VMwareProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) { continue } hostLabels := host.getHostLabels() - hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName) - - memoryUsageBytes, err := bytesFromUnit(projectCapacityUsage.TotalRAMMB, "MB") - if err != nil { - slog.Error("vmware_project_utilization: failed to convert memory to bytes", "err", err) - continue - } - diskUsageBytes, err := bytesFromUnit(projectCapacityUsage.TotalDiskGB, "GB") - if err != nil { - slog.Error("vmware_project_utilization: failed to convert disk to bytes", "err", err) - continue - } + hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName, projectCapacityUsage.DomainID, projectCapacityUsage.DomainName) ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...) - ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, memoryUsageBytes, append(hostLabels, "memory")...) - ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, diskUsageBytes, append(hostLabels, "disk")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "memory")...) 
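+		// TotalDiskGB is in GiB; convert to bytes inline (1 GiB = 1024^3 bytes), mirroring the MiB conversion for memory above.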
+ ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalDiskGB*1024*1024*1024, append(hostLabels, "disk")...) } } @@ -167,6 +160,8 @@ func (k *VMwareProjectUtilizationKPI) queryProjectCapacityUsage() ([]vmwareProje SELECT s.tenant_id AS project_id, COALESCE(p.name, '') AS project_name, + COALESCE(p.domain_id, '') AS domain_id, + COALESCE(d.name, '') AS domain_name, s.os_ext_srv_attr_host AS compute_host, s.os_ext_az_availability_zone AS availability_zone, COALESCE(SUM(f.vcpus), 0) AS total_vcpus, @@ -175,10 +170,11 @@ func (k *VMwareProjectUtilizationKPI) queryProjectCapacityUsage() ([]vmwareProje FROM ` + nova.Server{}.TableName() + ` s LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON s.flavor_name = f.name LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id + LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id WHERE s.status NOT IN ('DELETED', 'ERROR') AND s.os_ext_srv_attr_host LIKE '` + vmwareComputeHostPattern + `' AND s.os_ext_srv_attr_host NOT LIKE '` + vmwareIronicComputeHostPattern + `' - GROUP BY s.tenant_id, p.name, s.os_ext_srv_attr_host, s.os_ext_az_availability_zone + GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host, s.os_ext_az_availability_zone ` var usages []vmwareProjectCapacityUsage if _, err := k.DB.Select(&usages, query); err != nil { @@ -198,16 +194,19 @@ func (k *VMwareProjectUtilizationKPI) queryProjectInstanceCount() ([]vmwareProje SELECT s.tenant_id AS project_id, COALESCE(p.name, '') AS project_name, + COALESCE(p.domain_id, '') AS domain_id, + COALESCE(d.name, '') AS domain_name, s.os_ext_srv_attr_host AS compute_host, s.os_ext_az_availability_zone AS availability_zone, s.flavor_name, COUNT(*) AS instance_count FROM ` + nova.Server{}.TableName() + ` s LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id + LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id WHERE s.status NOT IN ('DELETED', 'ERROR') AND s.os_ext_srv_attr_host LIKE '` + vmwareComputeHostPattern + `' AND s.os_ext_srv_attr_host NOT LIKE '` + vmwareIronicComputeHostPattern + `' - GROUP BY s.tenant_id, p.name, s.os_ext_srv_attr_host, s.flavor_name, s.os_ext_az_availability_zone + GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host, s.flavor_name, s.os_ext_az_availability_zone ` var usages []vmwareProjectInstanceCount if _, err := k.DB.Select(&usages, query); err != nil { diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go index 4c43c893b..853b88d2c 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go @@ -33,18 +33,22 @@ func buildMetricKey(name string, labels map[string]string) string { } } -func instanceMetric(computeHost, az, projectID, projectName, flavorName string, value float64) collectedVMwareMetric { +func instanceMetric(computeHost, az, projectID, projectName, domainID, domainName, flavorName string, value float64) collectedVMwareMetric { labels := mockVMwareHostLabels(computeHost, az) labels["project_id"] = projectID labels["project_name"] = projectName + labels["domain_id"] = domainID + labels["domain_name"] = domainName labels["flavor_name"] = flavorName return collectedVMwareMetric{Name: "cortex_vmware_project_instances", Labels: labels, Value: 
value} } -func capacityMetric(computeHost, az, projectID, projectName, resource string, value float64) collectedVMwareMetric { +func capacityMetric(computeHost, az, projectID, projectName, domainID, domainName, resource string, value float64) collectedVMwareMetric { labels := mockVMwareHostLabels(computeHost, az) labels["project_id"] = projectID labels["project_name"] = projectName + labels["domain_id"] = domainID + labels["domain_name"] = domainName labels["resource"] = resource return collectedVMwareMetric{Name: "cortex_vmware_project_capacity_usage", Labels: labels, Value: value} } @@ -132,6 +136,7 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { name string servers []nova.Server projects []identity.Project + domains []identity.Domain expectedCounts map[string]vmwareProjectInstanceCount }{ { @@ -139,9 +144,10 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, expectedCounts: map[string]vmwareProjectInstanceCount{ - "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, }, }, { @@ -153,14 +159,15 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { {ID: "server-4", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, }, projects: []identity.Project{ - {ID: "project-1", Name: "Project One"}, - {ID: "project-2", Name: "Project Two"}, + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, expectedCounts: map[string]vmwareProjectInstanceCount{ - "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, - "project-1|nova-compute-1|flavor-2|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-2", AvailabilityZone: "az1", InstanceCount: 1}, - "project-2|nova-compute-2|flavor-1|az2": {ProjectID: "project-2", ProjectName: "Project Two", ComputeHost: "nova-compute-2", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 1}, - "project-2|nova-compute-2|flavor-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", ComputeHost: "nova-compute-2", FlavorName: "flavor-2", AvailabilityZone: "az2", InstanceCount: 1}, + "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + "project-1|nova-compute-1|flavor-2|az1": {ProjectID: 
"project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", FlavorName: "flavor-2", AvailabilityZone: "az1", InstanceCount: 1}, + "project-2|nova-compute-2|flavor-1|az2": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-2", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 1}, + "project-2|nova-compute-2|flavor-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-2", FlavorName: "flavor-2", AvailabilityZone: "az2", InstanceCount: 1}, }, }, { @@ -170,9 +177,10 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "node-3", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-ironic-1", FlavorName: "flavor-2", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, expectedCounts: map[string]vmwareProjectInstanceCount{ - "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, }, }, { @@ -182,9 +190,10 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ERROR", OSEXTAvailabilityZone: "az1"}, {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-3", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, expectedCounts: map[string]vmwareProjectInstanceCount{ - "project-1|nova-compute-1|flavor-3|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-3", AvailabilityZone: "az1", InstanceCount: 1}, + "project-1|nova-compute-1|flavor-3|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", FlavorName: "flavor-3", AvailabilityZone: "az1", InstanceCount: 1}, }, }, { @@ -195,26 +204,41 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { {ID: "server-3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, {ID: "server-4", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: 
"domain-1", Name: "Domain One"}}, expectedCounts: map[string]vmwareProjectInstanceCount{ - "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 2}, - "project-1|nova-compute-2|flavor-1|az2": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-2", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 2}, + "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 2}, + "project-1|nova-compute-2|flavor-1|az2": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-2", FlavorName: "flavor-1", AvailabilityZone: "az2", InstanceCount: 2}, }, }, { - name: "missing project entry results in empty project_name", + name: "project references non-existent domain results in empty domain fields", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + expectedCounts: map[string]vmwareProjectInstanceCount{ + // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry + "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-unknown", DomainName: "", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + }, + }, + { + name: "missing project entry results in empty project_name and domain", servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, projects: []identity.Project{}, + domains: []identity.Domain{}, expectedCounts: map[string]vmwareProjectInstanceCount{ - "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, + "project-1|nova-compute-1|flavor-1|az1": {ProjectID: "project-1", ProjectName: "", DomainID: "", DomainName: "", ComputeHost: "nova-compute-1", FlavorName: "flavor-1", AvailabilityZone: "az1", InstanceCount: 1}, }, }, { name: "no instances returns empty result", servers: []nova.Server{}, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, expectedCounts: map[string]vmwareProjectInstanceCount{}, }, } @@ -227,6 +251,7 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { if err := testDB.CreateTable( testDB.AddTable(nova.Server{}), testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), ); err != nil { t.Fatalf("failed to create tables: %v", err) } @@ -238,6 +263,9 @@ func TestVMwareProjectUtilizationKPI_queryProjectInstanceCount(t *testing.T) { for i := range tt.projects { mockData = append(mockData, &tt.projects[i]) } + for i := range tt.domains { + mockData = append(mockData, 
&tt.domains[i]) + } if len(mockData) > 0 { if err := testDB.Insert(mockData...); err != nil { t.Fatalf("expected no error, got %v", err) @@ -277,6 +305,7 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { name string servers []nova.Server projects []identity.Project + domains []identity.Domain flavors []nova.Flavor expectedUsages map[string]vmwareProjectCapacityUsage }{ @@ -285,10 +314,11 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{ - "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, }, }, { @@ -299,16 +329,17 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { {ID: "server-3", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, }, projects: []identity.Project{ - {ID: "project-1", Name: "Project One"}, - {ID: "project-2", Name: "Project Two"}, + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2}, }, expectedUsages: map[string]vmwareProjectCapacityUsage{ - "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 6, TotalRAMMB: 12288, TotalDiskGB: 3}, - "project-2|nova-compute-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", ComputeHost: "nova-compute-2", AvailabilityZone: "az2", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 6, TotalRAMMB: 12288, TotalDiskGB: 3}, + "project-2|nova-compute-2|az2": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-2", AvailabilityZone: "az2", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, }, }, { @@ -316,10 +347,11 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-missing", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: 
"Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{ - "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 0, TotalRAMMB: 0, TotalDiskGB: 0}, + "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 0, TotalRAMMB: 0, TotalDiskGB: 0}, }, }, { @@ -327,7 +359,8 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "node-3", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{}, }, @@ -336,7 +369,8 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "DELETED", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{}, }, @@ -344,8 +378,9 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { name: "no instances returns empty capacity usage", servers: []nova.Server{}, projects: []identity.Project{ - {ID: "project-1", Name: "Project One"}, + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{}, }, @@ -355,10 +390,11 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, {ID: "server-2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{ - "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 4, TotalRAMMB: 8192, TotalDiskGB: 2}, + "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: 
"Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 4, TotalRAMMB: 8192, TotalDiskGB: 2}, }, }, { @@ -366,19 +402,34 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-ironic-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{}, }, { - name: "missing project entry results in empty project_name", + name: "project references non-existent domain results in empty domain fields", + servers: []nova.Server{ + {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + expectedUsages: map[string]vmwareProjectCapacityUsage{ + // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry + "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-unknown", DomainName: "", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + }, + }, + { + name: "missing project entry results in empty project_name and domain", servers: []nova.Server{ {ID: "server-1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, projects: []identity.Project{}, + domains: []identity.Domain{}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, expectedUsages: map[string]vmwareProjectCapacityUsage{ - "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, + "project-1|nova-compute-1|az1": {ProjectID: "project-1", ProjectName: "", DomainID: "", DomainName: "", ComputeHost: "nova-compute-1", AvailabilityZone: "az1", TotalVCPUs: 2, TotalRAMMB: 4096, TotalDiskGB: 1}, }, }, } @@ -392,6 +443,7 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { if err := testDB.CreateTable( testDB.AddTable(nova.Server{}), testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), testDB.AddTable(nova.Flavor{}), ); err != nil { t.Fatalf("failed to create tables: %v", err) @@ -404,6 +456,9 @@ func TestVMwareProjectUtilizationKPI_queryProjectCapacityUsage(t *testing.T) { for i := range tt.projects { mockData = append(mockData, &tt.projects[i]) } + for i := range tt.domains { + mockData = append(mockData, &tt.domains[i]) + } for i := range tt.flavors { mockData = append(mockData, &tt.flavors[i]) } @@ -446,6 +501,7 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { name string servers []nova.Server projects []identity.Project + domains []identity.Domain flavors []nova.Flavor hostDetails 
[]compute.HostDetails expectedMetrics []collectedVMwareMetric @@ -455,16 +511,17 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { servers: []nova.Server{ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, hostDetails: []compute.HostDetails{ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 4096*1024*1024), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 1*1024*1024*1024), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, { @@ -475,9 +532,10 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { {ID: "s3", TenantID: "project-2", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, }, projects: []identity.Project{ - {ID: "project-1", Name: "Project One"}, - {ID: "project-2", Name: "Project Two"}, + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2}, @@ -487,17 +545,17 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 1), - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-2", 1), - instanceMetric("nova-compute-2", "az2", "project-2", "Project Two", "flavor-1", 1), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-2", 1), + instanceMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "flavor-1", 1), // nova-compute-1/project-1: 1*flavor-1 + 1*flavor-2 - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 6), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 12288*1024*1024), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 3*1024*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project 
One", "domain-1", "Domain One", "vcpu", 6), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 12288*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 3*1024*1024*1024), // nova-compute-2/project-2: 1*flavor-1 - capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "vcpu", 2), - capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "memory", 4096*1024*1024), - capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "disk", 1*1024*1024*1024), + capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "vcpu", 2), + capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "memory", 4096*1024*1024), + capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, { @@ -507,16 +565,17 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node-3", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-ironic-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, hostDetails: []compute.HostDetails{ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 4096*1024*1024), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 1*1024*1024*1024), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, { @@ -526,7 +585,8 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-2", Status: "ERROR", OSEXTAvailabilityZone: "az1"}, {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-3", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, {ID: "f2", Name: "flavor-2", VCPUs: 4, RAM: 8192, Disk: 2}, @@ -536,10 +596,10 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { {ComputeHost: "nova-compute-1", HypervisorFamily: 
hypervisorFamilyVMware, AvailabilityZone: "az1"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-3", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 8), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 16384*1024*1024), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 4*1024*1024*1024), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-3", 1), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 8), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 16384*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 4*1024*1024*1024), }, }, { @@ -550,38 +610,59 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, {ID: "s4", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-2", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az2"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, hostDetails: []compute.HostDetails{ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-1", 2), - instanceMetric("nova-compute-2", "az2", "project-1", "Project One", "flavor-1", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 4), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "memory", 2*4096*1024*1024), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 2*1024*1024*1024), - capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "vcpu", 4), - capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "memory", 2*4096*1024*1024), - capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "disk", 2*1024*1024*1024), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), + instanceMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), + capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), + capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "disk", 
2*1024*1024*1024), + }, + }, + { + name: "project references non-existent domain results in empty domain labels", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + expectedMetrics: []collectedVMwareMetric{ + // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "flavor-1", 1), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "vcpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "disk", 1*1024*1024*1024), }, }, { - name: "missing project entry results in empty project_name label", + name: "missing project entry results in empty project_name and domain labels", servers: []nova.Server{ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-1", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, projects: []identity.Project{}, + domains: []identity.Domain{}, flavors: []nova.Flavor{{ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}}, hostDetails: []compute.HostDetails{ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "", "memory", 4096*1024*1024), - capacityMetric("nova-compute-1", "az1", "project-1", "", "disk", 1*1024*1024*1024), + instanceMetric("nova-compute-1", "az1", "project-1", "", "", "", "flavor-1", 1), + capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "vcpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "disk", 1*1024*1024*1024), }, }, { @@ -589,24 +670,26 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { servers: []nova.Server{ {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "flavor-missing", Status: "ACTIVE", OSEXTAvailabilityZone: "az1"}, }, - projects: []identity.Project{{ID: "project-1", Name: "Project One"}}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{}, hostDetails: []compute.HostDetails{ {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, }, expectedMetrics: []collectedVMwareMetric{ - instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "flavor-missing", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "vcpu", 0), - capacityMetric("nova-compute-1", "az1", "project-1", "Project 
One", "memory", 0), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "disk", 0), + instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-missing", 1), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 0), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 0), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 0), }, }, { name: "no instances produces no metrics", servers: []nova.Server{}, projects: []identity.Project{ - {ID: "project-1", Name: "Project One"}, + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, flavors: []nova.Flavor{ {ID: "f1", Name: "flavor-1", VCPUs: 2, RAM: 4096, Disk: 1}, }, @@ -626,6 +709,7 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { if err := testDB.CreateTable( testDB.AddTable(nova.Server{}), testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), testDB.AddTable(nova.Flavor{}), ); err != nil { t.Fatalf("failed to create tables: %v", err) @@ -638,6 +722,9 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { for i := range tt.projects { mockData = append(mockData, &tt.projects[i]) } + for i := range tt.domains { + mockData = append(mockData, &tt.domains[i]) + } for i := range tt.flavors { mockData = append(mockData, &tt.flavors[i]) } diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index 63a35866b..cfcf56bd3 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -13,19 +13,18 @@ import ( // Configuration of supported kpis. 
var supportedKPIs = map[string]plugins.KPI{ - "kvm_host_capacity_kpi": &compute.KVMResourceCapacityKPI{}, "vmware_host_contention_kpi": &compute.VMwareHostContentionKPI{}, "vmware_project_noisiness_kpi": &compute.VMwareProjectNoisinessKPI{}, - "host_running_vms_kpi": &compute.HostRunningVMsKPI{}, - "flavor_running_vms_kpi": &compute.FlavorRunningVMsKPI{}, "vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{}, "vm_life_span_kpi": &compute.VMLifeSpanKPI{}, "vm_commitments_kpi": &compute.VMCommitmentsKPI{}, "vm_faults_kpi": &compute.VMFaultsKPI{}, - "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, - "vmware_resource_commitments_kpi": &infrastructure.VMwareResourceCommitmentsKPI{}, - "vmware_host_capacity_kpi": &infrastructure.VMwareHostCapacityKPI{}, + "kvm_host_capacity_kpi": &infrastructure.KVMHostCapacityKPI{}, + "kvm_project_utilization_kpi": &infrastructure.KVMProjectUtilizationKPI{}, + "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, + "vmware_project_commitments_kpi": &infrastructure.VMwareProjectCommitmentsKPI{}, + "vmware_host_capacity_kpi": &infrastructure.VMwareHostCapacityKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{}, diff --git a/internal/scheduling/external/nova.go b/internal/scheduling/external/nova.go index b59a37d5b..741c5659c 100644 --- a/internal/scheduling/external/nova.go +++ b/internal/scheduling/external/nova.go @@ -17,6 +17,9 @@ type NovaReaderInterface interface { GetAllFlavors(ctx context.Context) ([]nova.Flavor, error) GetServerByID(ctx context.Context, serverID string) (*nova.Server, error) GetFlavorByName(ctx context.Context, flavorName string) (*nova.Flavor, error) + // GetDeletedServerByID returns a deleted server by its ID from the deleted_servers table. + // Returns nil, nil if the server is not found in the deleted_servers table. + GetDeletedServerByID(ctx context.Context, serverID string) (*nova.DeletedServer, error) } // NovaReader provides read access to Nova data stored in the database. @@ -107,3 +110,17 @@ func (r *NovaReader) GetFlavorByName(ctx context.Context, flavorName string) (*n } return &flavors[0], nil } + +// GetDeletedServerByID returns a deleted Nova server by its ID from the deleted_servers table. +// Returns nil, nil if the server is not found in the deleted_servers table. 
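+// The lookup mirrors GetServerByID, but reads from the deleted_servers table.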
+func (r *NovaReader) GetDeletedServerByID(ctx context.Context, serverID string) (*nova.DeletedServer, error) { + var servers []nova.DeletedServer + query := "SELECT * FROM " + nova.DeletedServer{}.TableName() + " WHERE id = $1" + if err := r.Select(ctx, &servers, query, serverID); err != nil { + return nil, fmt.Errorf("failed to query deleted server by ID: %w", err) + } + if len(servers) == 0 { + return nil, nil + } + return &servers[0], nil +} diff --git a/internal/scheduling/lib/detector_test.go b/internal/scheduling/lib/detector_test.go index 3bc2f746d..ec31de814 100644 --- a/internal/scheduling/lib/detector_test.go +++ b/internal/scheduling/lib/detector_test.go @@ -9,7 +9,6 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -40,8 +39,8 @@ func TestDetector_Init(t *testing.T) { step := BaseDetector[mockDetectorOptions]{} cl := fake.NewClientBuilder().Build() params := []v1alpha1.Parameter{ - {Key: "option1", StringValue: testlib.Ptr("value1")}, - {Key: "option2", IntValue: testlib.Ptr(int64(2))}, + {Key: "option1", StringValue: new("value1")}, + {Key: "option2", IntValue: new(int64(2))}, } err := step.Init(t.Context(), cl, v1alpha1.DetectorSpec{ Params: params, @@ -63,8 +62,8 @@ func TestDetector_Init_InvalidJSON(t *testing.T) { step := BaseDetector[mockDetectorOptions]{} cl := fake.NewClientBuilder().Build() params := []v1alpha1.Parameter{ - {Key: "option1", StringValue: testlib.Ptr("value1")}, - {Key: "option2", StringValue: testlib.Ptr("value2")}, // Invalid int value + {Key: "option1", StringValue: new("value1")}, + {Key: "option2", StringValue: new("value2")}, // Invalid int value } err := step.Init(t.Context(), cl, v1alpha1.DetectorSpec{ Params: params, @@ -271,8 +270,8 @@ func TestBaseDetector_Validate(t *testing.T) { { name: "valid params", params: []v1alpha1.Parameter{ - {Key: "option1", StringValue: testlib.Ptr("value1")}, - {Key: "option2", IntValue: testlib.Ptr(int64(2))}, + {Key: "option1", StringValue: new("value1")}, + {Key: "option2", IntValue: new(int64(2))}, }, expectError: false, }, @@ -284,8 +283,8 @@ func TestBaseDetector_Validate(t *testing.T) { { name: "invalid JSON", params: []v1alpha1.Parameter{ - {Key: "option1", StringValue: testlib.Ptr("value1")}, - {Key: "option2", StringValue: testlib.Ptr("value2")}, + {Key: "option1", StringValue: new("value1")}, + {Key: "option2", StringValue: new("value2")}, }, expectError: true, }, diff --git a/internal/scheduling/lib/filter_weigher_pipeline_step_test.go b/internal/scheduling/lib/filter_weigher_pipeline_step_test.go index 5064ae26f..013946672 100644 --- a/internal/scheduling/lib/filter_weigher_pipeline_step_test.go +++ b/internal/scheduling/lib/filter_weigher_pipeline_step_test.go @@ -8,7 +8,6 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -30,7 +29,7 @@ func TestBaseFilterWeigherPipelineStep_Init(t *testing.T) { { name: "successful initialization with valid params", params: []v1alpha1.Parameter{ - {Key: "bla", StringValue: testlib.Ptr("")}, + {Key: "bla", StringValue: new("")}, }, expectError: false, }, @@ -42,7 +41,7 @@ func TestBaseFilterWeigherPipelineStep_Init(t *testing.T) { { name: "error on invalid JSON params", params: []v1alpha1.Parameter{ - {Key: "unexpected", StringValue: 
testlib.Ptr("{invalid json}")}, + {Key: "unexpected", StringValue: new("{invalid json}")}, }, expectError: true, }, @@ -201,8 +200,8 @@ func TestBaseFilterWeigherPipelineStep_Validate(t *testing.T) { { name: "invalid JSON", params: []v1alpha1.Parameter{ - {Key: "option1", StringValue: testlib.Ptr("value1")}, - {Key: "option2", StringValue: testlib.Ptr("value2")}, + {Key: "option1", StringValue: new("value1")}, + {Key: "option2", StringValue: new("value2")}, }, expectError: true, }, diff --git a/internal/scheduling/lib/history_client.go b/internal/scheduling/lib/history_client.go index 156b8521c..035ec92b4 100644 --- a/internal/scheduling/lib/history_client.go +++ b/internal/scheduling/lib/history_client.go @@ -113,6 +113,10 @@ func generateExplanation(result *v1alpha1.DecisionResult, pipelineErr error) str fmt.Fprintf(&sb, "\nSelected host: %s.", *result.TargetHost) } + if weighingExpl := ExplainWeighing(result); weighingExpl != "" { + fmt.Fprintf(&sb, "\n\n%s", weighingExpl) + } + return strings.TrimSpace(sb.String()) } diff --git a/internal/scheduling/lib/history_client_test.go b/internal/scheduling/lib/history_client_test.go index 56572f3eb..9cf866913 100644 --- a/internal/scheduling/lib/history_client_test.go +++ b/internal/scheduling/lib/history_client_test.go @@ -11,7 +11,6 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -63,7 +62,7 @@ func TestGenerateExplanation(t *testing.T) { { name: "result with target host only no steps", result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("host-1"), + TargetHost: new("host-1"), }, expected: "Selected host: host-1.", }, @@ -90,7 +89,7 @@ func TestGenerateExplanation(t *testing.T) { }, }, }, - TargetHost: testlib.Ptr("host-a"), + TargetHost: new("host-a"), }, expected: "Started with 3 host(s).\n\n" + "filter_capacity filtered out host-c\n" + @@ -114,7 +113,7 @@ func TestGenerateExplanation(t *testing.T) { }, }, }, - TargetHost: testlib.Ptr("host-x"), + TargetHost: new("host-x"), }, expected: "Started with 2 host(s).\n\n\n2 hosts remaining (host-x, host-y)\n\nSelected host: host-x.", }, @@ -130,7 +129,7 @@ func TestGenerateExplanation(t *testing.T) { { name: "no weights with target", result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("host-1"), + TargetHost: new("host-1"), StepResults: []v1alpha1.StepResult{ {StepName: "some-step", Activations: map[string]float64{}}, }, @@ -153,7 +152,7 @@ func TestGenerateExplanation(t *testing.T) { }, }, }, - TargetHost: testlib.Ptr("host-a"), + TargetHost: new("host-a"), }, expected: "Started with 2 host(s).\n\n\n2 hosts remaining (host-a, host-b)\n\nSelected host: host-a.", }, @@ -177,7 +176,7 @@ func TestGenerateExplanation(t *testing.T) { Activations: surviving, }, }, - TargetHost: testlib.Ptr("host-000"), + TargetHost: new("host-000"), } }(), expected: "Started with 50 host(s).\n\n" + @@ -240,12 +239,12 @@ func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { }, Status: v1alpha1.DecisionStatus{ Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("compute-1"), + TargetHost: new("compute-1"), }, }, }, expectHistoryLen: 0, - expectTargetHost: testlib.Ptr("compute-1"), + expectTargetHost: new("compute-1"), expectSuccessful: true, expectCondStatus: metav1.ConditionTrue, expectReason: v1alpha1.HistoryReasonSchedulingSucceeded, @@ -286,12 +285,12 @@ func TestHistoryClient_CreateOrUpdateHistory(t 
*testing.T) { }, Status: v1alpha1.DecisionStatus{ Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("compute-2"), + TargetHost: new("compute-2"), }, }, }, expectHistoryLen: 1, // pre-existing entry preserved, no current to archive - expectTargetHost: testlib.Ptr("compute-2"), + expectTargetHost: new("compute-2"), expectSuccessful: true, expectCondStatus: metav1.ConditionTrue, expectReason: v1alpha1.HistoryReasonSchedulingSucceeded, @@ -312,7 +311,7 @@ func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { PipelineRef: corev1.ObjectReference{Name: "old-pipeline"}, Intent: v1alpha1.SchedulingIntentUnknown, Successful: true, - TargetHost: testlib.Ptr("old-host"), + TargetHost: new("old-host"), }, }, } @@ -331,12 +330,12 @@ func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { }, Status: v1alpha1.DecisionStatus{ Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("new-host"), + TargetHost: new("new-host"), }, }, }, expectHistoryLen: 1, // old current archived - expectTargetHost: testlib.Ptr("new-host"), + expectTargetHost: new("new-host"), expectSuccessful: true, expectCondStatus: metav1.ConditionTrue, expectReason: v1alpha1.HistoryReasonSchedulingSucceeded, @@ -419,7 +418,7 @@ func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { Timestamp: metav1.Now(), PipelineRef: corev1.ObjectReference{Name: "prev-pipeline"}, Successful: true, - TargetHost: testlib.Ptr("old-backend"), + TargetHost: new("old-backend"), }, History: entries, }, @@ -439,12 +438,12 @@ func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { }, Status: v1alpha1.DecisionStatus{ Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("backend-1"), + TargetHost: new("backend-1"), }, }, }, expectHistoryLen: 10, // 10 existing + 1 archived current, capped to 10 - expectTargetHost: testlib.Ptr("backend-1"), + expectTargetHost: new("backend-1"), expectSuccessful: true, expectCondStatus: metav1.ConditionTrue, expectReason: v1alpha1.HistoryReasonSchedulingSucceeded, @@ -466,13 +465,13 @@ func TestHistoryClient_CreateOrUpdateHistory(t *testing.T) { }, Status: v1alpha1.DecisionStatus{ Result: &v1alpha1.DecisionResult{ - TargetHost: testlib.Ptr("h1"), + TargetHost: new("h1"), OrderedHosts: []string{"h1", "h2", "h3", "h4", "h5"}, }, }, }, expectHistoryLen: 0, - expectTargetHost: testlib.Ptr("h1"), + expectTargetHost: new("h1"), expectSuccessful: true, expectCondStatus: metav1.ConditionTrue, expectReason: v1alpha1.HistoryReasonSchedulingSucceeded, diff --git a/internal/scheduling/lib/pipeline_controller_test.go b/internal/scheduling/lib/pipeline_controller_test.go index e876ec5e6..f609c6606 100644 --- a/internal/scheduling/lib/pipeline_controller_test.go +++ b/internal/scheduling/lib/pipeline_controller_test.go @@ -18,7 +18,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/event" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" ) func TestBasePipelineController_InitAllPipelines(t *testing.T) { @@ -216,7 +215,7 @@ func TestBasePipelineController_handlePipelineChange(t *testing.T) { schedulingDomain: v1alpha1.SchedulingDomainNova, expectReady: true, expectInMap: true, - expectAllStepsIndexed: testlib.Ptr(true), + expectAllStepsIndexed: new(true), }, { name: "pipeline init fails", @@ -276,7 +275,7 @@ func TestBasePipelineController_handlePipelineChange(t *testing.T) { schedulingDomain: v1alpha1.SchedulingDomainNova, expectReady: true, expectInMap: true, - expectAllStepsIndexed: testlib.Ptr(false), + expectAllStepsIndexed: new(false), 
unknownFilters: []string{"unknown-filter"}, }, { @@ -300,7 +299,7 @@ func TestBasePipelineController_handlePipelineChange(t *testing.T) { schedulingDomain: v1alpha1.SchedulingDomainNova, expectReady: true, expectInMap: true, - expectAllStepsIndexed: testlib.Ptr(false), + expectAllStepsIndexed: new(false), unknownWeighers: []string{"unknown-weigher"}, }, { @@ -323,7 +322,7 @@ func TestBasePipelineController_handlePipelineChange(t *testing.T) { schedulingDomain: v1alpha1.SchedulingDomainNova, expectReady: true, expectInMap: true, - expectAllStepsIndexed: testlib.Ptr(false), + expectAllStepsIndexed: new(false), unknownDetectors: []string{"unknown-detector"}, }, { @@ -354,7 +353,7 @@ func TestBasePipelineController_handlePipelineChange(t *testing.T) { schedulingDomain: v1alpha1.SchedulingDomainNova, expectReady: true, expectInMap: true, - expectAllStepsIndexed: testlib.Ptr(false), + expectAllStepsIndexed: new(false), unknownFilters: []string{"unknown-filter-1", "unknown-filter-2"}, unknownWeighers: []string{"unknown-weigher-1"}, }, diff --git a/internal/scheduling/lib/weighing_explainer.go b/internal/scheduling/lib/weighing_explainer.go new file mode 100644 index 000000000..2e66bce41 --- /dev/null +++ b/internal/scheduling/lib/weighing_explainer.go @@ -0,0 +1,430 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package lib + +import ( + "fmt" + "math" + "sort" + "strings" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +const ( + // maxExplainedHosts limits the pairwise explanation to the top N hosts. + maxExplainedHosts = 3 + + // negligibleContributionThreshold is the absolute contribution value below + // which a weigher is considered to have negligible impact on a host pair. + negligibleContributionThreshold = 0.01 + + // singularityEpsilon is the minimum pivot magnitude during Gaussian + // elimination. Below this the system is considered ill-conditioned. + singularityEpsilon = 1e-10 +) + +// weigherContribution holds the signed contribution of a single weigher to the +// score gap between two hosts. A positive value means the weigher favors the +// higher-ranked host; negative means it opposes the observed ranking. +type weigherContribution struct { + stepName string + contribution float64 +} + +// ExplainWeighing produces a human-readable explanation of how weigher steps +// influenced the ranking of the top hosts in a scheduling decision. +// +// The algorithm works in three stages: +// +// 1. Multiplier recovery: The pipeline applies weights via an additive formula: +// AggregatedOut[h] = NormalizedIn[h] + sum_i(mult_i * tanh(act_i[h])). +// Since DecisionResult stores raw activations but not multipliers, we recover +// them by solving the over-determined linear system (M hosts, N weighers) +// via least-squares (normal equations). This handles negative multipliers +// correctly and produces exact results when M >= N. +// +// 2. Counterfactual analysis: For the #1 host, we ask "if weigher X were +// removed, would a different host be selected?" This identifies decisive +// weighers whose removal would change the scheduling outcome. +// +// 3. Pairwise decomposition: For each consecutive pair in the top-N ranking, +// we report which weigher contributed most to the gap, and flag any weigher +// that opposed the outcome (negative contribution to a positive gap). +// +// Returns an empty string when the result is nil, has fewer than 2 ordered +// hosts, or contains no weigher steps. 
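+//
+// Illustration (numbers mirror the unit tests below, not a real pipeline): a
+// single weigher with recovered multiplier 2.0 and activations act[a]=1.0 and
+// act[b]=-0.5 contributes 2.0*tanh(1.0) ≈ +1.52 to host a and
+// 2.0*tanh(-0.5) ≈ -0.92 to host b, i.e. about 2.45 of the gap between them.
+//
+// Callers (see generateExplanation in history_client.go) append the returned
+// text to the decision explanation when it is non-empty.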
+func ExplainWeighing(result *v1alpha1.DecisionResult) string { + if result == nil || len(result.OrderedHosts) < 2 { + return "" + } + + weigherSteps := identifyWeigherSteps(result) + if len(weigherSteps) == 0 { + return "" + } + + // Determine how many top hosts to explain. + topN := min(maxExplainedHosts, len(result.OrderedHosts)) + topHosts := result.OrderedHosts[:topN] + + // Recover the multipliers from the linear system: + // sum_i(mult_i * tanh(act_i[h])) = AggregatedOut[h] - NormalizedIn[h] + // using all ordered hosts as data points for a least-squares fit. + // Falls back to initial-bias-only explanation if recovery fails (e.g., all + // activations are zero making the matrix singular, or under-determined). + multipliers, ok := recoverMultipliers(weigherSteps, result.OrderedHosts, result.NormalizedInWeights, result.AggregatedOutWeights) + if !ok { + return explainWithoutMultipliers(result, topHosts, weigherSteps) + } + + // Precompute effective contributions: contribution[weigherIdx][host] gives + // the signed weight contribution of that weigher to that host's final score. + // Computed for ALL ordered hosts (not just top-N) so counterfactual analysis + // correctly considers hosts outside the top-N that might rise to #1. + contributions := make([]map[string]float64, len(weigherSteps)) + for i, step := range weigherSteps { + contributions[i] = make(map[string]float64, len(result.OrderedHosts)) + for _, h := range result.OrderedHosts { + contributions[i][h] = multipliers[i] * math.Tanh(step.Activations[h]) + } + } + + var sb strings.Builder + + // --- Header: state the ranking being explained --- + sb.WriteString("Weighing impact on top-") + fmt.Fprintf(&sb, "%d (", topN) + for i, h := range topHosts { + if i > 0 { + sb.WriteString(", ") + } + sb.WriteString(h) + } + sb.WriteString("):\n") + + // --- Counterfactual analysis for the #1 host --- + // Ask: "Is there a single weigher whose removal would dethrone the #1 host?" + // Evaluated over ALL ordered hosts so we don't miss a host outside the top-N + // that would rise to #1 when a weigher is removed. + counterfactualReported := false + for i, step := range weigherSteps { + newRanking := computeCounterfactualRanking(result.OrderedHosts, result.AggregatedOutWeights, contributions[i]) + if newRanking[0] != topHosts[0] { + fmt.Fprintf(&sb, " Without %s, %s would be #1 instead of %s.\n", + step.StepName, newRanking[0], topHosts[0]) + counterfactualReported = true + break + } + } + + // --- Pairwise decomposition for each consecutive pair in top-N --- + // Track which weighers have negligible impact across ALL pairs. + negligibleCandidates := make(map[string]bool, len(weigherSteps)) + for _, step := range weigherSteps { + negligibleCandidates[step.StepName] = true + } + + for rank := range topN - 1 { + higher := topHosts[rank] + lower := topHosts[rank+1] + totalGap := result.AggregatedOutWeights[higher] - result.AggregatedOutWeights[lower] + + // Compute the initial weight bias (contribution from NormalizedInWeights). + initialBias := 0.0 + if result.NormalizedInWeights != nil { + initialBias = result.NormalizedInWeights[higher] - result.NormalizedInWeights[lower] + } + + // Gather per-weigher contributions to this pair's gap. 
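+			// For this (higher, lower) pair, weigher i contributes
+			// mult_i * (tanh(act_i[higher]) - tanh(act_i[lower])) to the gap.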
+ pairContribs := make([]weigherContribution, len(weigherSteps)) + for i, step := range weigherSteps { + c := contributions[i][higher] - contributions[i][lower] + pairContribs[i] = weigherContribution{stepName: step.StepName, contribution: c} + if math.Abs(c) >= negligibleContributionThreshold { + delete(negligibleCandidates, step.StepName) + } + } + + // Find leading positive contributor (supports the ranking). + var leading *weigherContribution + for j := range pairContribs { + if pairContribs[j].contribution > negligibleContributionThreshold { + if leading == nil || pairContribs[j].contribution > leading.contribution { + leading = &pairContribs[j] + } + } + } + + // Report the leading cause for this pair. + switch { + case leading != nil: + fmt.Fprintf(&sb, " %s is #%d because of %s (contributed %+.2f to gap of %.2f).\n", + higher, rank+1, leading.stepName, leading.contribution, totalGap) + case math.Abs(initialBias) > negligibleContributionThreshold: + fmt.Fprintf(&sb, " %s is #%d due to initial weight bias (%+.2f).\n", + higher, rank+1, initialBias) + case !counterfactualReported: + fmt.Fprintf(&sb, " %s is #%d over %s by a narrow margin (gap: %.4f).\n", + higher, rank+1, lower, totalGap) + } + + // Report the strongest opposing weigher (if significant). + var opposing *weigherContribution + for j := range pairContribs { + if pairContribs[j].contribution < -negligibleContributionThreshold { + if opposing == nil || pairContribs[j].contribution < opposing.contribution { + opposing = &pairContribs[j] + } + } + } + if opposing != nil { + fmt.Fprintf(&sb, " %s opposed this ranking (contributed %.2f).\n", + opposing.stepName, opposing.contribution) + } + } + + // --- Negligible-impact weighers --- + if len(negligibleCandidates) > 0 { + names := sortedMapKeys(negligibleCandidates) + fmt.Fprintf(&sb, " %s had negligible impact on top-%d ordering.\n", + strings.Join(names, ", "), topN) + } + + return strings.TrimSpace(sb.String()) +} + +// explainWithoutMultipliers provides a simpler explanation when multiplier +// recovery is not possible (e.g., all activations are zero, or the system is +// under-determined). It reports only the initial weight bias and raw activation +// differentials without exact contribution magnitudes. +func explainWithoutMultipliers(result *v1alpha1.DecisionResult, topHosts []string, _ []v1alpha1.StepResult) string { + topN := len(topHosts) + var sb strings.Builder + + sb.WriteString("Weighing impact on top-") + fmt.Fprintf(&sb, "%d (", topN) + for i, h := range topHosts { + if i > 0 { + sb.WriteString(", ") + } + sb.WriteString(h) + } + sb.WriteString("):\n") + + for rank := range topN - 1 { + higher := topHosts[rank] + lower := topHosts[rank+1] + + initialBias := 0.0 + if result.NormalizedInWeights != nil { + initialBias = result.NormalizedInWeights[higher] - result.NormalizedInWeights[lower] + } + + if math.Abs(initialBias) > negligibleContributionThreshold { + fmt.Fprintf(&sb, " %s is #%d due to initial weight bias (%+.2f).\n", + higher, rank+1, initialBias) + } else { + totalGap := result.AggregatedOutWeights[higher] - result.AggregatedOutWeights[lower] + fmt.Fprintf(&sb, " %s is #%d over %s (gap: %.4f).\n", + higher, rank+1, lower, totalGap) + } + } + + return strings.TrimSpace(sb.String()) +} + +// identifyWeigherSteps returns the subset of step results that represent +// weigher (scoring) steps rather than filter steps. 
A weigher step is one +// whose activation map contains entries for ALL hosts in OrderedHosts — +// filters reduce the host set while weighers score all remaining hosts. +func identifyWeigherSteps(result *v1alpha1.DecisionResult) []v1alpha1.StepResult { + orderedHostSet := make(map[string]struct{}, len(result.OrderedHosts)) + for _, h := range result.OrderedHosts { + orderedHostSet[h] = struct{}{} + } + + var weigherSteps []v1alpha1.StepResult + for _, step := range result.StepResults { + if len(step.Activations) == 0 { + continue + } + isWeigher := true + for h := range orderedHostSet { + if _, exists := step.Activations[h]; !exists { + isWeigher = false + break + } + } + if isWeigher { + weigherSteps = append(weigherSteps, step) + } + } + return weigherSteps +} + +// recoverMultipliers solves for the weigher multipliers using least-squares. +// +// The additive pipeline formula guarantees: +// +// AggregatedOut[h] - NormalizedIn[h] = sum_i(mult_i * tanh(act_i[h])) +// +// for every host h. This forms a linear system A*x = b where: +// - A is an M×N matrix: A[h][i] = tanh(activation_i[h]) +// - b is an M-vector: b[h] = AggregatedOut[h] - NormalizedIn[h] +// - x is the N-vector of unknown multipliers +// +// With M hosts (rows) and N weighers (columns), and typically M >> N, we +// solve the normal equations: (A^T * A) * x = A^T * b. +// +// Returns the recovered multipliers and true on success, or nil and false if +// the system is under-determined or ill-conditioned. +func recoverMultipliers( + weigherSteps []v1alpha1.StepResult, + orderedHosts []string, + normalizedIn map[string]float64, + aggregatedOut map[string]float64, +) ([]float64, bool) { + + M := len(orderedHosts) // number of data points (hosts) + N := len(weigherSteps) // number of unknowns (multipliers) + + if M < N || N == 0 { + return nil, false + } + + // Build the M×N matrix A where A[h][i] = tanh(activation_i[host_h]) + // and the M-vector b where b[h] = aggregatedOut[host_h] - normalizedIn[host_h]. + A := make([][]float64, M) + b := make([]float64, M) + for h, host := range orderedHosts { + A[h] = make([]float64, N) + for i, step := range weigherSteps { + A[h][i] = math.Tanh(step.Activations[host]) + } + b[h] = aggregatedOut[host] - normalizedIn[host] + } + + // Compute A^T * A (N×N symmetric matrix). + ata := make([][]float64, N) + for i := range N { + ata[i] = make([]float64, N) + for j := range N { + sum := 0.0 + for h := range M { + sum += A[h][i] * A[h][j] + } + ata[i][j] = sum + } + } + + // Compute A^T * b (N-vector). + atb := make([]float64, N) + for i := range N { + sum := 0.0 + for h := range M { + sum += A[h][i] * b[h] + } + atb[i] = sum + } + + // Solve the N×N system (A^T A) x = A^T b. + return solveLinearSystem(ata, atb) +} + +// solveLinearSystem solves a square linear system Ax = b using Gaussian +// elimination with partial pivoting. Returns the solution vector and true +// on success, or nil and false if the matrix is singular (pivot < epsilon). +func solveLinearSystem(a [][]float64, b []float64) ([]float64, bool) { + n := len(b) + if n == 0 { + return nil, false + } + + // Create augmented matrix [A|b] to avoid modifying the inputs. + aug := make([][]float64, n) + for i := range n { + aug[i] = make([]float64, n+1) + copy(aug[i][:n], a[i]) + aug[i][n] = b[i] + } + + // Forward elimination with partial pivoting. + for col := range n { + // Find the row with the largest absolute value in this column. 
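+			// Partial pivoting: using the largest available pivot keeps the
+			// elimination factors bounded and the solve numerically stable.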
+		maxRow := col
+		maxVal := math.Abs(aug[col][col])
+		for row := col + 1; row < n; row++ {
+			if v := math.Abs(aug[row][col]); v > maxVal {
+				maxVal = v
+				maxRow = row
+			}
+		}
+
+		if maxVal < singularityEpsilon {
+			return nil, false
+		}
+
+		// Swap rows if needed.
+		if maxRow != col {
+			aug[col], aug[maxRow] = aug[maxRow], aug[col]
+		}
+
+		// Eliminate below the pivot.
+		pivot := aug[col][col]
+		for row := col + 1; row < n; row++ {
+			factor := aug[row][col] / pivot
+			for j := col; j <= n; j++ {
+				aug[row][j] -= factor * aug[col][j]
+			}
+		}
+	}
+
+	// Back substitution.
+	x := make([]float64, n)
+	for i := n - 1; i >= 0; i-- {
+		if math.Abs(aug[i][i]) < singularityEpsilon {
+			return nil, false
+		}
+		sum := aug[i][n]
+		for j := i + 1; j < n; j++ {
+			sum -= aug[i][j] * x[j]
+		}
+		x[i] = sum / aug[i][i]
+	}
+	return x, true
+}
+
+// computeCounterfactualRanking computes what the ranking of the given hosts
+// would be if a single weigher's contributions were removed. It subtracts the
+// weigher's per-host contribution from the aggregated weights and re-sorts.
+func computeCounterfactualRanking(
+	hosts []string,
+	aggregatedOut map[string]float64,
+	weigherContribs map[string]float64,
+) []string {
+	// Compute hypothetical scores without this weigher.
+	hypothetical := make(map[string]float64, len(hosts))
+	for _, h := range hosts {
+		hypothetical[h] = aggregatedOut[h] - weigherContribs[h]
+	}
+
+	// Sort by hypothetical score descending. The stable sort preserves the
+	// incoming order on ties, so removing a weigher with zero contribution
+	// never appears to change the ranking.
+	ranking := make([]string, len(hosts))
+	copy(ranking, hosts)
+	sort.SliceStable(ranking, func(i, j int) bool {
+		return hypothetical[ranking[i]] > hypothetical[ranking[j]]
+	})
+	return ranking
+}
+
+// sortedMapKeys returns the keys of a bool map in sorted order.
+func sortedMapKeys(m map[string]bool) []string {
+	keys := make([]string, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	return keys
+}
diff --git a/internal/scheduling/lib/weighing_explainer_test.go b/internal/scheduling/lib/weighing_explainer_test.go
new file mode 100644
index 000000000..3e63508d7
--- /dev/null
+++ b/internal/scheduling/lib/weighing_explainer_test.go
@@ -0,0 +1,514 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package lib
+
+import (
+	"math"
+	"sort"
+	"strings"
+	"testing"
+
+	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+)
+
+func TestExplainWeighing(t *testing.T) {
+	tests := []struct {
+		name     string
+		result   *v1alpha1.DecisionResult
+		contains []string // substrings that must appear in output
+		excludes []string // substrings that must NOT appear in output
+		empty    bool     // expect empty string
+	}{
+		{
+			name:   "nil result returns empty",
+			result: nil,
+			empty:  true,
+		},
+		{
+			name: "single host returns empty",
+			result: &v1alpha1.DecisionResult{
+				OrderedHosts:         []string{"host-a"},
+				NormalizedInWeights:  map[string]float64{"host-a": 0.5},
+				AggregatedOutWeights: map[string]float64{"host-a": 0.8},
+			},
+			empty: true,
+		},
+		{
+			name: "only filter steps returns empty",
+			result: &v1alpha1.DecisionResult{
+				NormalizedInWeights:  map[string]float64{"host-a": 0.5, "host-b": 0.3, "host-c": 0.1},
+				AggregatedOutWeights: map[string]float64{"host-a": 0.5, "host-b": 0.3},
+				OrderedHosts:         []string{"host-a", "host-b"},
+				StepResults: []v1alpha1.StepResult{
+					// A true filter: activations do NOT include host-b (it was filtered after this step).
+ {StepName: "filter_az", Activations: map[string]float64{"host-a": 1.0}}, + }, + }, + empty: true, + }, + { + name: "two hosts single weigher positive multiplier", + result: func() *v1alpha1.DecisionResult { + // Simulate: multiplier = 2.0, act[a] = 1.0, act[b] = -0.5 + // contrib[a] = 2.0 * tanh(1.0) = 1.5231 + // contrib[b] = 2.0 * tanh(-0.5) = -0.9242 + mult := 2.0 + actA, actB := 1.0, -0.5 + inA, inB := 0.3, 0.3 + outA := inA + mult*math.Tanh(actA) + outB := inB + mult*math.Tanh(actB) + return &v1alpha1.DecisionResult{ + NormalizedInWeights: map[string]float64{"host-a": inA, "host-b": inB}, + AggregatedOutWeights: map[string]float64{"host-a": outA, "host-b": outB}, + OrderedHosts: []string{"host-a", "host-b"}, + StepResults: []v1alpha1.StepResult{ + {StepName: "weigher_cpu", Activations: map[string]float64{"host-a": actA, "host-b": actB}}, + }, + } + }(), + contains: []string{"host-a", "host-b", "weigher_cpu", "is #1 because of"}, + }, + { + name: "two hosts single weigher negative multiplier", + result: func() *v1alpha1.DecisionResult { + // Simulate: multiplier = -1.0 (inverts behavior) + // A host with HIGH raw activation is penalized. + // act[a] = -2.0 (high penalty activates less due to neg mult -> boosts) + // act[b] = 3.0 (high activation penalized by neg mult -> hurts) + mult := -1.0 + actA, actB := -2.0, 3.0 + inA, inB := 0.0, 0.0 + outA := inA + mult*math.Tanh(actA) + outB := inB + mult*math.Tanh(actB) + // outA = 0 + (-1)*tanh(-2) = 0 + (-1)*(-0.964) = +0.964 + // outB = 0 + (-1)*tanh(3) = 0 + (-1)*(0.995) = -0.995 + return &v1alpha1.DecisionResult{ + NormalizedInWeights: map[string]float64{"host-a": inA, "host-b": inB}, + AggregatedOutWeights: map[string]float64{"host-a": outA, "host-b": outB}, + OrderedHosts: []string{"host-a", "host-b"}, + StepResults: []v1alpha1.StepResult{ + {StepName: "kvm_binpack", Activations: map[string]float64{"host-a": actA, "host-b": actB}}, + }, + } + }(), + contains: []string{"host-a", "is #1 because of", "kvm_binpack"}, + }, + { + name: "three hosts two weighers with counterfactual", + result: func() *v1alpha1.DecisionResult { + // weigher_cpu (mult=3.0) strongly favors host-a + // weigher_mem (mult=1.0) slightly favors host-b + // Without weigher_cpu, host-b should be #1. + multCPU, multMem := 3.0, 1.0 + actCPU := map[string]float64{"h1": 1.0, "h2": 0.2, "h3": -0.5} + actMem := map[string]float64{"h1": -0.3, "h2": 0.8, "h3": 0.1} + in := map[string]float64{"h1": 0.0, "h2": 0.0, "h3": 0.0} + out := map[string]float64{} + for _, h := range []string{"h1", "h2", "h3"} { + out[h] = in[h] + multCPU*math.Tanh(actCPU[h]) + multMem*math.Tanh(actMem[h]) + } + // Sort to determine OrderedHosts. + hosts := []string{"h1", "h2", "h3"} + sortByWeight(hosts, out) + return &v1alpha1.DecisionResult{ + NormalizedInWeights: in, + AggregatedOutWeights: out, + OrderedHosts: hosts, + StepResults: []v1alpha1.StepResult{ + {StepName: "weigher_cpu", Activations: actCPU}, + {StepName: "weigher_mem", Activations: actMem}, + }, + } + }(), + contains: []string{"Without weigher_cpu", "would be #1 instead"}, + }, + { + name: "opposing weigher reported", + result: func() *v1alpha1.DecisionResult { + // weigher_boost (mult=2.0) pushes host-a up + // weigher_penalty (mult=-0.5) opposes host-a + // Net: host-a still wins due to weigher_boost dominance. 
+				multBoost, multPenalty := 2.0, -0.5
+				actBoost := map[string]float64{"x": 1.5, "y": -0.5}
+				actPenalty := map[string]float64{"x": 1.0, "y": -1.0}
+				in := map[string]float64{"x": 0.0, "y": 0.0}
+				out := map[string]float64{}
+				for _, h := range []string{"x", "y"} {
+					out[h] = in[h] + multBoost*math.Tanh(actBoost[h]) + multPenalty*math.Tanh(actPenalty[h])
+				}
+				hosts := []string{"x", "y"}
+				sortByWeight(hosts, out)
+				return &v1alpha1.DecisionResult{
+					NormalizedInWeights:  in,
+					AggregatedOutWeights: out,
+					OrderedHosts:         hosts,
+					StepResults: []v1alpha1.StepResult{
+						{StepName: "weigher_boost", Activations: actBoost},
+						{StepName: "weigher_penalty", Activations: actPenalty},
+					},
+				}
+			}(),
+			contains: []string{"opposed this ranking"},
+		},
+		{
+			name: "negligible impact weigher reported",
+			result: func() *v1alpha1.DecisionResult {
+				// weigher_big (mult=5.0) dominates
+				// weigher_tiny (mult=0.001) is negligible
+				multBig, multTiny := 5.0, 0.001
+				actBig := map[string]float64{"a": 2.0, "b": -1.0}
+				actTiny := map[string]float64{"a": 0.5, "b": 0.4}
+				in := map[string]float64{"a": 0.0, "b": 0.0}
+				out := map[string]float64{}
+				for _, h := range []string{"a", "b"} {
+					out[h] = in[h] + multBig*math.Tanh(actBig[h]) + multTiny*math.Tanh(actTiny[h])
+				}
+				hosts := []string{"a", "b"}
+				sortByWeight(hosts, out)
+				return &v1alpha1.DecisionResult{
+					NormalizedInWeights:  in,
+					AggregatedOutWeights: out,
+					OrderedHosts:         hosts,
+					StepResults: []v1alpha1.StepResult{
+						{StepName: "weigher_big", Activations: actBig},
+						{StepName: "weigher_tiny", Activations: actTiny},
+					},
+				}
+			}(),
+			contains: []string{"weigher_tiny", "negligible"},
+		},
+		{
+			name: "initial weight bias dominates when activations are zero",
+			result: &v1alpha1.DecisionResult{
+				NormalizedInWeights:  map[string]float64{"fast": 0.8, "slow": 0.2},
+				AggregatedOutWeights: map[string]float64{"fast": 0.8, "slow": 0.2},
+				OrderedHosts:         []string{"fast", "slow"},
+				StepResults: []v1alpha1.StepResult{
+					{StepName: "weigher_noop", Activations: map[string]float64{"fast": 0.0, "slow": 0.0}},
+				},
+			},
+			// Matrix is singular (all-zero activations), so fallback reports initial bias.
+			contains: []string{"initial weight bias", "+0.60"},
+		},
+		{
+			name: "mixed filter and weigher steps ignores filters",
+			result: func() *v1alpha1.DecisionResult {
+				mult := 1.5
+				actW := map[string]float64{"a": 1.0, "b": -0.5}
+				in := map[string]float64{"a": 0.0, "b": 0.0, "c": 0.0}
+				out := map[string]float64{}
+				for _, h := range []string{"a", "b"} {
+					out[h] = in[h] + mult*math.Tanh(actW[h])
+				}
+				hosts := []string{"a", "b"}
+				sortByWeight(hosts, out)
+				return &v1alpha1.DecisionResult{
+					NormalizedInWeights:  map[string]float64{"a": 0.0, "b": 0.0},
+					AggregatedOutWeights: out,
+					OrderedHosts:         hosts,
+					StepResults: []v1alpha1.StepResult{
+						// Filter step: its activations cover only host "a", not all of
+						// OrderedHosts ([a b]), so identifyWeigherSteps classifies it as
+						// a filter and its name must not appear in the explanation.
+ {StepName: "filter_az", Activations: map[string]float64{"a": 1.0}}, + {StepName: "weigher_cpu", Activations: actW}, + }, + } + }(), + contains: []string{"weigher_cpu"}, + excludes: []string{"filter_az"}, + }, + { + name: "under-determined system falls back to gap-only explanation", + result: &v1alpha1.DecisionResult{ + NormalizedInWeights: map[string]float64{"a": 0.0, "b": 0.0}, + AggregatedOutWeights: map[string]float64{"a": 1.0, "b": 0.5}, + OrderedHosts: []string{"a", "b"}, + StepResults: []v1alpha1.StepResult{ + {StepName: "w1", Activations: map[string]float64{"a": 1.0, "b": 0.5}}, + {StepName: "w2", Activations: map[string]float64{"a": 0.8, "b": 0.3}}, + {StepName: "w3", Activations: map[string]float64{"a": 0.6, "b": 0.1}}, + }, + }, + // 2 hosts, 3 weighers -> M < N -> under-determined, falls back to gap report + contains: []string{"a is #1 over b"}, + }, + { + name: "multiplier recovery accuracy", + result: func() *v1alpha1.DecisionResult { + // Known multipliers: [2.5, -1.0, 0.7] + mults := []float64{2.5, -1.0, 0.7} + acts := []map[string]float64{ + {"h1": 0.8, "h2": -0.3, "h3": 1.2, "h4": -0.7}, + {"h1": 0.5, "h2": 1.0, "h3": -0.2, "h4": 0.3}, + {"h1": -1.0, "h2": 0.6, "h3": 0.4, "h4": -0.1}, + } + hosts := []string{"h1", "h2", "h3", "h4"} + in := map[string]float64{"h1": 0.1, "h2": 0.2, "h3": -0.1, "h4": 0.0} + out := make(map[string]float64) + for _, h := range hosts { + out[h] = in[h] + for i, m := range mults { + out[h] += m * math.Tanh(acts[i][h]) + } + } + sortByWeight(hosts, out) + return &v1alpha1.DecisionResult{ + NormalizedInWeights: in, + AggregatedOutWeights: out, + OrderedHosts: hosts, + StepResults: []v1alpha1.StepResult{ + {StepName: "w_big", Activations: acts[0]}, + {StepName: "w_neg", Activations: acts[1]}, + {StepName: "w_small", Activations: acts[2]}, + }, + } + }(), + contains: []string{"top-3", "is #1 because of"}, + }, + { + name: "counterfactual not reported when removal does not change #1", + result: func() *v1alpha1.DecisionResult { + // Two weighers both favor host-a. Removing either still leaves a as #1. + mult1, mult2 := 2.0, 2.0 + act1 := map[string]float64{"a": 1.0, "b": -1.0} + act2 := map[string]float64{"a": 0.8, "b": -0.8} + in := map[string]float64{"a": 0.0, "b": 0.0} + out := map[string]float64{} + for _, h := range []string{"a", "b"} { + out[h] = in[h] + mult1*math.Tanh(act1[h]) + mult2*math.Tanh(act2[h]) + } + return &v1alpha1.DecisionResult{ + NormalizedInWeights: in, + AggregatedOutWeights: out, + OrderedHosts: []string{"a", "b"}, + StepResults: []v1alpha1.StepResult{ + {StepName: "w1", Activations: act1}, + {StepName: "w2", Activations: act2}, + }, + } + }(), + excludes: []string{"Without", "would be #1"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ExplainWeighing(tt.result) + if tt.empty { + if got != "" { + t.Errorf("expected empty string, got:\n%s", got) + } + return + } + if got == "" { + t.Fatal("expected non-empty explanation, got empty string") + } + for _, want := range tt.contains { + if !strings.Contains(got, want) { + t.Errorf("output missing expected substring %q.\nGot:\n%s", want, got) + } + } + for _, notWant := range tt.excludes { + if strings.Contains(got, notWant) { + t.Errorf("output should NOT contain %q.\nGot:\n%s", notWant, got) + } + } + }) + } +} + +func TestRecoverMultipliers(t *testing.T) { + // Verify that recovered multipliers match the known ground truth. 
+ knownMults := []float64{2.5, -1.0, 0.7} + acts := []map[string]float64{ + {"h1": 0.8, "h2": -0.3, "h3": 1.2, "h4": -0.7, "h5": 0.4}, + {"h1": 0.5, "h2": 1.0, "h3": -0.2, "h4": 0.3, "h5": -0.9}, + {"h1": -1.0, "h2": 0.6, "h3": 0.4, "h4": -0.1, "h5": 0.2}, + } + hosts := []string{"h1", "h2", "h3", "h4", "h5"} + normalizedIn := map[string]float64{"h1": 0.1, "h2": 0.2, "h3": -0.1, "h4": 0.0, "h5": 0.3} + aggregatedOut := make(map[string]float64) + for _, h := range hosts { + aggregatedOut[h] = normalizedIn[h] + for i, m := range knownMults { + aggregatedOut[h] += m * math.Tanh(acts[i][h]) + } + } + + steps := []v1alpha1.StepResult{ + {StepName: "w1", Activations: acts[0]}, + {StepName: "w2", Activations: acts[1]}, + {StepName: "w3", Activations: acts[2]}, + } + + recovered, ok := recoverMultipliers(steps, hosts, normalizedIn, aggregatedOut) + if !ok { + t.Fatal("recoverMultipliers failed unexpectedly") + } + if len(recovered) != len(knownMults) { + t.Fatalf("got %d multipliers, want %d", len(recovered), len(knownMults)) + } + for i, want := range knownMults { + if math.Abs(recovered[i]-want) > 1e-6 { + t.Errorf("multiplier[%d] = %.6f, want %.6f", i, recovered[i], want) + } + } +} + +func TestSolveLinearSystem(t *testing.T) { + // Simple 2x2 system: 2x + y = 5, x + 3y = 10 => x=1, y=3 + a := [][]float64{{2, 1}, {1, 3}} + b := []float64{5, 10} + x, ok := solveLinearSystem(a, b) + if !ok { + t.Fatal("solveLinearSystem failed") + } + if math.Abs(x[0]-1.0) > 1e-10 || math.Abs(x[1]-3.0) > 1e-10 { + t.Errorf("got x=%v, want [1, 3]", x) + } + + // Singular matrix should return false. + singularA := [][]float64{{1, 2}, {2, 4}} + singularB := []float64{3, 6} + _, ok = solveLinearSystem(singularA, singularB) + if ok { + t.Error("expected failure for singular matrix") + } +} + +func TestIdentifyWeigherSteps(t *testing.T) { + result := &v1alpha1.DecisionResult{ + OrderedHosts: []string{"a", "b", "c"}, + StepResults: []v1alpha1.StepResult{ + // Filter: missing host c + {StepName: "filter_x", Activations: map[string]float64{"a": 1.0, "b": 1.0}}, + // Weigher: has all ordered hosts + {StepName: "weigher_y", Activations: map[string]float64{"a": 0.5, "b": 0.3, "c": -0.2}}, + // Empty step: should be excluded + {StepName: "empty_step", Activations: map[string]float64{}}, + // Another weigher + {StepName: "weigher_z", Activations: map[string]float64{"a": 1.0, "b": 0.0, "c": 0.5, "d": 0.1}}, + }, + } + + steps := identifyWeigherSteps(result) + if len(steps) != 2 { + t.Fatalf("got %d weigher steps, want 2", len(steps)) + } + if steps[0].StepName != "weigher_y" { + t.Errorf("step[0] = %q, want weigher_y", steps[0].StepName) + } + if steps[1].StepName != "weigher_z" { + t.Errorf("step[1] = %q, want weigher_z", steps[1].StepName) + } +} + +// sortByWeight sorts hosts in descending order of their weight (for test setup). +func sortByWeight(hosts []string, weights map[string]float64) { + sort.Slice(hosts, func(i, j int) bool { + return weights[hosts[i]] > weights[hosts[j]] + }) +} + +// TestExplainWeighingDemo is a demonstration test that simulates a realistic +// Nova scheduling scenario with multiple weighers (including a negative +// multiplier for balancing) and prints the full explanation output. 
Run with: +// +// go test ./internal/scheduling/lib/ -run TestExplainWeighingDemo -v +func TestExplainWeighingDemo(t *testing.T) { + // Simulates a Nova scheduling pipeline with 5 compute hosts and 3 weighers: + // - kvm_binpack (mult=-1.0): inverted to achieve memory balancing + // - kvm_failover_evacuation (mult=2.0): prefers hosts with fewer VMs to evacuate + // - kvm_prefer_smaller_hosts (mult=0.5): slight preference for smaller hosts + // + // Host nova-compute-01 wins because kvm_failover_evacuation strongly favors it, + // despite kvm_binpack opposing it (since it has high memory usage). + multBinpack := -1.0 + multFailover := 2.0 + multSmaller := 0.5 + + hosts := []string{ + "nova-compute-01", + "nova-compute-02", + "nova-compute-03", + "nova-compute-04", + "nova-compute-05", + } + + // Raw activations (before tanh) — simulating real scheduler outputs. + actBinpack := map[string]float64{ + "nova-compute-01": 0.8, // high memory usage -> high binpack score + "nova-compute-02": 0.3, // moderate + "nova-compute-03": -0.2, // low usage + "nova-compute-04": 0.6, // moderately full + "nova-compute-05": -0.5, // nearly empty + } + actFailover := map[string]float64{ + "nova-compute-01": 1.5, // few VMs to evacuate in failure -> strongly preferred + "nova-compute-02": 0.4, // moderate evacuation risk + "nova-compute-03": -0.1, // slight risk + "nova-compute-04": -0.8, // high evacuation cost + "nova-compute-05": 0.2, // low risk + } + actSmaller := map[string]float64{ + "nova-compute-01": -0.3, // large host + "nova-compute-02": 0.1, // medium + "nova-compute-03": 0.7, // smaller + "nova-compute-04": -0.1, // medium-large + "nova-compute-05": 1.2, // smallest + } + + // Simulate the pipeline: compute aggregated output weights. + normalizedIn := map[string]float64{ + "nova-compute-01": 0.10, + "nova-compute-02": 0.08, + "nova-compute-03": 0.05, + "nova-compute-04": 0.12, + "nova-compute-05": 0.06, + } + aggregatedOut := make(map[string]float64, len(hosts)) + for _, h := range hosts { + aggregatedOut[h] = normalizedIn[h] + + multBinpack*math.Tanh(actBinpack[h]) + + multFailover*math.Tanh(actFailover[h]) + + multSmaller*math.Tanh(actSmaller[h]) + } + + // Sort hosts by aggregated weight to get OrderedHosts. + sortByWeight(hosts, aggregatedOut) + + result := &v1alpha1.DecisionResult{ + NormalizedInWeights: normalizedIn, + AggregatedOutWeights: aggregatedOut, + OrderedHosts: hosts, + TargetHost: &hosts[0], + StepResults: []v1alpha1.StepResult{ + {StepName: "kvm_binpack", Activations: actBinpack}, + {StepName: "kvm_failover_evacuation", Activations: actFailover}, + {StepName: "kvm_prefer_smaller_hosts", Activations: actSmaller}, + }, + } + + explanation := ExplainWeighing(result) + if explanation == "" { + t.Fatal("expected non-empty explanation") + } + + t.Logf("=== Demo: Full scheduling explanation ===\n\n%s\n", explanation) + + // Verify key properties of the explanation. 
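+	// Only stable properties are asserted: the winning host and the weighers
+	// named in the text. Exact scores depend on the recovered multipliers.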
+ if !strings.Contains(explanation, "nova-compute-01") { + t.Error("expected #1 host in explanation") + } + if !strings.Contains(explanation, "kvm_failover_evacuation") { + t.Error("expected dominant weigher in explanation") + } + if !strings.Contains(explanation, "kvm_binpack") { + t.Error("expected opposing weigher in explanation") + } +} diff --git a/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go b/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go index e16c55c27..3ca4fd7f1 100644 --- a/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go +++ b/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go @@ -18,7 +18,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/manila" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" "github.com/sapcc/go-bits/must" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/storage" @@ -487,10 +486,10 @@ func TestFilterWeigherPipelineController_InitPipeline(t *testing.T) { { Name: "netapp_cpu_usage_balancing", Params: []v1alpha1.Parameter{ - {Key: "AvgCPUUsageLowerBound", FloatValue: testlib.Ptr(0.0)}, - {Key: "AvgCPUUsageUpperBound", FloatValue: testlib.Ptr(90.0)}, - {Key: "MaxCPUUsageLowerBound", FloatValue: testlib.Ptr(0.0)}, - {Key: "MaxCPUUsageUpperBound", FloatValue: testlib.Ptr(100.0)}, + {Key: "AvgCPUUsageLowerBound", FloatValue: new(0.0)}, + {Key: "AvgCPUUsageUpperBound", FloatValue: new(90.0)}, + {Key: "MaxCPUUsageLowerBound", FloatValue: new(0.0)}, + {Key: "MaxCPUUsageUpperBound", FloatValue: new(100.0)}, }, }, }, diff --git a/internal/scheduling/nova/plugins/detectors/avoid_high_steal_pct_test.go b/internal/scheduling/nova/plugins/detectors/avoid_high_steal_pct_test.go index 5214b7d6e..df9036674 100644 --- a/internal/scheduling/nova/plugins/detectors/avoid_high_steal_pct_test.go +++ b/internal/scheduling/nova/plugins/detectors/avoid_high_steal_pct_test.go @@ -10,7 +10,6 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -29,7 +28,7 @@ func TestAvoidHighStealPctStep_Init(t *testing.T) { } params := []v1alpha1.Parameter{ - {Key: "maxStealPctOverObservedTimeSpan", FloatValue: testlib.Ptr(80.0)}, + {Key: "maxStealPctOverObservedTimeSpan", FloatValue: new(80.0)}, } tests := []struct { @@ -391,28 +390,28 @@ func TestAvoidHighStealPctStep_Validate(t *testing.T) { { name: "valid params", params: []v1alpha1.Parameter{ - {Key: "maxStealPctOverObservedTimeSpan", FloatValue: testlib.Ptr(80.0)}, + {Key: "maxStealPctOverObservedTimeSpan", FloatValue: new(80.0)}, }, expectError: false, }, { name: "valid params with zero threshold", params: []v1alpha1.Parameter{ - {Key: "maxStealPctOverObservedTimeSpan", FloatValue: testlib.Ptr(0.0)}, + {Key: "maxStealPctOverObservedTimeSpan", FloatValue: new(0.0)}, }, expectError: false, }, { name: "invalid params with negative threshold", params: []v1alpha1.Parameter{ - {Key: "maxStealPctOverObservedTimeSpan", FloatValue: testlib.Ptr(-5.0)}, + {Key: "maxStealPctOverObservedTimeSpan", FloatValue: new(-5.0)}, }, expectError: true, }, { name: "invalid JSON", params: []v1alpha1.Parameter{ - {Key: "invalidJSON", StringValue: testlib.Ptr("{invalid json}")}, + {Key: "invalidJSON", StringValue: new("{invalid json}")}, }, expectError: 
true, }, diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index 88e2f07d5..b97d3e0e5 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -25,6 +25,10 @@ type FilterHasEnoughCapacityOpts struct { // When a reservation type is in this list, its capacity is not blocked. // Default: empty (all reservation types are considered) IgnoredReservationTypes []v1alpha1.ReservationType `json:"ignoredReservationTypes,omitempty"` + + // IgnoreAllocations skips subtracting current VM allocations from host capacity. + // When true, only raw hardware capacity is considered (empty datacenter scenario). + IgnoreAllocations bool `json:"ignoreAllocations,omitempty"` } func (FilterHasEnoughCapacityOpts) Validate() error { return nil } @@ -80,18 +84,20 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa freeResourcesByHost[hv.Name] = hv.Status.EffectiveCapacity } - // Subtract allocated resources. - for resourceName, allocated := range hv.Status.Allocation { - free, ok := freeResourcesByHost[hv.Name][resourceName] - if !ok { - traceLog.Error( - "hypervisor with allocation for unknown resource", - "host", hv.Name, "resource", resourceName, - ) - continue + // Subtract allocated resources (skip when ignoring allocations for empty-datacenter capacity queries). + if !s.Options.IgnoreAllocations { + for resourceName, allocated := range hv.Status.Allocation { + free, ok := freeResourcesByHost[hv.Name][resourceName] + if !ok { + traceLog.Error( + "hypervisor with allocation for unknown resource", + "host", hv.Name, "resource", resourceName, + ) + continue + } + free.Sub(allocated) + freeResourcesByHost[hv.Name][resourceName] = free } - free.Sub(allocated) - freeResourcesByHost[hv.Name][resourceName] = free } } @@ -190,6 +196,10 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa // Oversize spec-only: if a pending VM is larger than the remaining slot, block its full size. var resourcesToBlock map[hv1.ResourceName]resource.Quantity if reservation.Spec.Type == v1alpha1.ReservationTypeCommittedResource && + // When ignoring allocations (empty-datacenter scenario) VM resources are not + // deducted, so the confirmed-VM adjustment would under-block: always use the + // full slot instead. 
+ !s.Options.IgnoreAllocations && // if the reservation is not being migrated, block only unused resources reservation.Spec.TargetHost == reservation.Status.Host && reservation.Spec.CommittedResourceReservation != nil && diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go index 0664e55d4..1aee73b06 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_evacuation_test.go @@ -9,7 +9,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -177,7 +176,7 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newFailoverReservation("failover-1", "host1", false, map[string]string{"instance-123": "original-host"}), }, request: newNovaRequest("instance-123", true, []string{"host1", "host2", "host3"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 1.0, "host2": 0.1, "host3": 0.1}, }, { @@ -186,7 +185,7 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newFailoverReservation("failover-1", "host1", false, map[string]string{"other-instance": "original-host"}), }, request: newNovaRequest("instance-123", true, []string{"host1", "host2", "host3"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 0.1, "host2": 0.1, "host3": 0.1}, }, { @@ -196,14 +195,14 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newFailoverReservation("failover-2", "host3", false, map[string]string{"instance-123": "original-host"}), }, request: newNovaRequest("instance-123", true, []string{"host1", "host2", "host3"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 1.0, "host2": 0.1, "host3": 1.0}, }, { name: "No reservations - all hosts get default weight", reservations: []*v1alpha1.Reservation{}, request: newNovaRequest("instance-123", true, []string{"host1", "host2"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 0.1, "host2": 0.1}, }, { @@ -212,7 +211,7 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newFailoverReservation("failover-1", "host1", false, map[string]string{"instance-123": "original-host"}), }, request: newNovaRequest("instance-123", true, []string{"host1", "host2"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(0.9), DefaultHostWeight: testlib.Ptr(0.05)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(0.9), DefaultHostWeight: new(0.05)}, expectedWeights: map[string]float64{"host1": 0.9, 
"host2": 0.05}, }, { @@ -230,7 +229,7 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newFailoverReservation("failed-failover", "host1", true, map[string]string{"instance-123": "original-host"}), }, request: newNovaRequest("instance-123", true, []string{"host1", "host2"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 0.1, "host2": 0.1}, }, { @@ -239,7 +238,7 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newCommittedReservation("committed-res", "host1"), }, request: newNovaRequest("instance-123", true, []string{"host1", "host2"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 0.1, "host2": 0.1}, }, { @@ -248,7 +247,7 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { newFailoverReservation("failover-1", "host1", false, map[string]string{"instance-123": "original-host"}), }, request: newNovaRequest("instance-123", false, []string{"host1", "host2", "host3"}), - opts: KVMFailoverEvacuationOpts{FailoverHostWeight: testlib.Ptr(1.0), DefaultHostWeight: testlib.Ptr(0.1)}, + opts: KVMFailoverEvacuationOpts{FailoverHostWeight: new(1.0), DefaultHostWeight: new(0.1)}, expectedWeights: map[string]float64{"host1": 0, "host2": 0, "host3": 0}, }, } @@ -285,8 +284,8 @@ func TestKVMFailoverEvacuationStep_Run(t *testing.T) { func TestKVMFailoverEvacuationOpts_Validate(t *testing.T) { opts := KVMFailoverEvacuationOpts{ - FailoverHostWeight: testlib.Ptr(1.0), - DefaultHostWeight: testlib.Ptr(0.1), + FailoverHostWeight: new(1.0), + DefaultHostWeight: new(0.1), } if err := opts.Validate(); err != nil { t.Errorf("expected no error, got %v", err) diff --git a/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go b/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go index 62d69d319..2ba65b436 100644 --- a/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go +++ b/internal/scheduling/nova/plugins/weighers/kvm_failover_reservation_consolidation_test.go @@ -10,7 +10,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -115,7 +114,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { }, // Request for group-D - no same-group on any host request: newFailoverReservationRequest("group-D", []string{"host1", "host2", "host3"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, // T=4, host1: (1/4)*3=0.75, host2: (1/4)*1=0.25, host3: 0 expectedWeights: map[string]float64{"host1": 0.75, "host2": 0.25, "host3": 0}, }, @@ -136,7 +135,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { newFailoverReservationWithGroup("res-10", "host2", "group-D"), }, request: newFailoverReservationRequest("group-A", 
[]string{"host1", "host2", "host3"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, // T=10 // host1: (1/10)*5 - (0.1/10)*0 = 0.5 // host2: (1/10)*5 - (0.1/10)*3 = 0.5 - 0.03 = 0.47 @@ -154,7 +153,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { newFailoverReservationWithGroup("res-5", "host2", "group-D"), }, request: newFailoverReservationRequest("group-A", []string{"host2", "host3"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, // T=5 // host2: (1/5)*5 - (0.1/5)*3 = 1.0 - 0.06 = 0.94 // host3: 0 @@ -164,7 +163,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { name: "no reservations: all hosts get default weight (no effect)", reservations: []*v1alpha1.Reservation{}, request: newFailoverReservationRequest("group-A", []string{"host1", "host2"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, expectedWeights: map[string]float64{"host1": 0, "host2": 0}, }, { @@ -174,7 +173,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { }, // Use a non-failover request (evacuation) request: newNovaRequest("instance-123", true, []string{"host1", "host2"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, expectedWeights: map[string]float64{"host1": 0, "host2": 0}, }, { @@ -184,7 +183,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { }, // Use a non-failover request (no hints = create intent) request: newNovaRequest("instance-123", false, []string{"host1", "host2"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, expectedWeights: map[string]float64{"host1": 0, "host2": 0}, }, { @@ -209,7 +208,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { newCommittedReservation("committed-1", "host2"), }, request: newFailoverReservationRequest("group-A", []string{"host1", "host2", "host3"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, // T=1 (only 1 failover reservation), committed reservation ignored // host1: (1/1)*1 - (0.1/1)*1 = 0.9 // host2: 0 (committed reservation not counted) @@ -223,7 +222,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { newFailoverReservation("failed-res", "host2", true, map[string]string{"vm-1": "h-1"}), }, request: newFailoverReservationRequest("group-A", []string{"host1", "host2"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(1.0), SameSpecPenalty: testlib.Ptr(0.1)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(1.0), SameSpecPenalty: new(0.1)}, 
// T=1 (failed reservation ignored) // host1: (1/1)*1 - (0.1/1)*1 = 0.9 // host2: 0 @@ -237,7 +236,7 @@ func TestKVMFailoverReservationConsolidationStep_Run(t *testing.T) { newFailoverReservationWithGroup("res-3", "host2", "group-B"), }, request: newFailoverReservationRequest("group-A", []string{"host1", "host2"}), - opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: testlib.Ptr(2.0), SameSpecPenalty: testlib.Ptr(0.5)}, + opts: KVMFailoverReservationConsolidationOpts{TotalCountWeight: new(2.0), SameSpecPenalty: new(0.5)}, // T=3, W=2.0, P=0.5 // host1: (2/3)*2 - (0.5/3)*2 = 1.3333 - 0.3333 = 1.0 // host2: (2/3)*1 - (0.5/3)*0 = 0.6667 @@ -294,8 +293,8 @@ func TestKVMFailoverReservationConsolidationOpts_Validate(t *testing.T) { { name: "valid: both set, p < w", opts: KVMFailoverReservationConsolidationOpts{ - TotalCountWeight: testlib.Ptr(2.0), - SameSpecPenalty: testlib.Ptr(0.5), + TotalCountWeight: new(2.0), + SameSpecPenalty: new(0.5), }, }, { @@ -305,36 +304,36 @@ func TestKVMFailoverReservationConsolidationOpts_Validate(t *testing.T) { { name: "valid: both zero", opts: KVMFailoverReservationConsolidationOpts{ - TotalCountWeight: testlib.Ptr(0.0), - SameSpecPenalty: testlib.Ptr(0.0), + TotalCountWeight: new(0.0), + SameSpecPenalty: new(0.0), }, }, { name: "invalid: negative totalCountWeight", opts: KVMFailoverReservationConsolidationOpts{ - TotalCountWeight: testlib.Ptr(-1.0), + TotalCountWeight: new(-1.0), }, wantErr: "totalCountWeight must be non-negative", }, { name: "invalid: negative sameSpecPenalty", opts: KVMFailoverReservationConsolidationOpts{ - SameSpecPenalty: testlib.Ptr(-0.1), + SameSpecPenalty: new(-0.1), }, wantErr: "sameSpecPenalty must be non-negative", }, { name: "invalid: p >= w", opts: KVMFailoverReservationConsolidationOpts{ - TotalCountWeight: testlib.Ptr(1.0), - SameSpecPenalty: testlib.Ptr(1.0), + TotalCountWeight: new(1.0), + SameSpecPenalty: new(1.0), }, wantErr: "sameSpecPenalty must be less than totalCountWeight", }, { name: "invalid: w=0 with p>0 (default penalty with zero weight)", opts: KVMFailoverReservationConsolidationOpts{ - TotalCountWeight: testlib.Ptr(0.0), + TotalCountWeight: new(0.0), // SameSpecPenalty defaults to 0.1 }, wantErr: "sameSpecPenalty must be zero when totalCountWeight is zero", @@ -342,8 +341,8 @@ func TestKVMFailoverReservationConsolidationOpts_Validate(t *testing.T) { { name: "invalid: w=0 with explicit p>0", opts: KVMFailoverReservationConsolidationOpts{ - TotalCountWeight: testlib.Ptr(0.0), - SameSpecPenalty: testlib.Ptr(0.5), + TotalCountWeight: new(0.0), + SameSpecPenalty: new(0.5), }, wantErr: "sameSpecPenalty must be zero when totalCountWeight is zero", }, diff --git a/internal/scheduling/nova/plugins/weighers/vmware_anti_affinity_noisy_projects_test.go b/internal/scheduling/nova/plugins/weighers/vmware_anti_affinity_noisy_projects_test.go index 304ab0612..2d5d1c389 100644 --- a/internal/scheduling/nova/plugins/weighers/vmware_anti_affinity_noisy_projects_test.go +++ b/internal/scheduling/nova/plugins/weighers/vmware_anti_affinity_noisy_projects_test.go @@ -12,7 +12,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -84,19 +83,19 @@ func TestVMwareAntiAffinityNoisyProjectsStep_Init(t *testing.T) { params := 
[]v1alpha1.Parameter{ { Key: "avgCPUUsageLowerBound", - FloatValue: testlib.Ptr(20.0), + FloatValue: new(20.0), }, { Key: "avgCPUUsageUpperBound", - FloatValue: testlib.Ptr(100.0), + FloatValue: new(100.0), }, { Key: "avgCPUUsageActivationLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "avgCPUUsageActivationUpperBound", - FloatValue: testlib.Ptr(-0.5), + FloatValue: new(-0.5), }, } diff --git a/internal/scheduling/nova/plugins/weighers/vmware_avoid_long_term_contended_hosts_test.go b/internal/scheduling/nova/plugins/weighers/vmware_avoid_long_term_contended_hosts_test.go index 5a69cdaf7..0b690d853 100644 --- a/internal/scheduling/nova/plugins/weighers/vmware_avoid_long_term_contended_hosts_test.go +++ b/internal/scheduling/nova/plugins/weighers/vmware_avoid_long_term_contended_hosts_test.go @@ -12,7 +12,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -86,35 +85,35 @@ func TestVMwareAvoidLongTermContendedHostsStep_Init(t *testing.T) { params := []v1alpha1.Parameter{ { Key: "avgCPUContentionLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "avgCPUContentionUpperBound", - FloatValue: testlib.Ptr(100.0), + FloatValue: new(100.0), }, { Key: "avgCPUContentionActivationLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "avgCPUContentionActivationUpperBound", - FloatValue: testlib.Ptr(-1.0), + FloatValue: new(-1.0), }, { Key: "maxCPUContentionLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "maxCPUContentionUpperBound", - FloatValue: testlib.Ptr(100.0), + FloatValue: new(100.0), }, { Key: "maxCPUContentionActivationLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "maxCPUContentionActivationUpperBound", - FloatValue: testlib.Ptr(-1.0), + FloatValue: new(-1.0), }, } diff --git a/internal/scheduling/nova/plugins/weighers/vmware_avoid_short_term_contended_hosts_test.go b/internal/scheduling/nova/plugins/weighers/vmware_avoid_short_term_contended_hosts_test.go index 0dfe280d0..895bee459 100644 --- a/internal/scheduling/nova/plugins/weighers/vmware_avoid_short_term_contended_hosts_test.go +++ b/internal/scheduling/nova/plugins/weighers/vmware_avoid_short_term_contended_hosts_test.go @@ -12,7 +12,6 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -86,35 +85,35 @@ func TestVMwareAvoidShortTermContendedHostsStep_Init(t *testing.T) { params := []v1alpha1.Parameter{ { Key: "avgCPUContentionLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "avgCPUContentionUpperBound", - FloatValue: testlib.Ptr(100.0), + FloatValue: new(100.0), }, { Key: "avgCPUContentionActivationLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "avgCPUContentionActivationUpperBound", - FloatValue: testlib.Ptr(-1.0), + FloatValue: new(-1.0), }, { Key: "maxCPUContentionLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, 
{ Key: "maxCPUContentionUpperBound", - FloatValue: testlib.Ptr(100.0), + FloatValue: new(100.0), }, { Key: "maxCPUContentionActivationLowerBound", - FloatValue: testlib.Ptr(0.0), + FloatValue: new(0.0), }, { Key: "maxCPUContentionActivationUpperBound", - FloatValue: testlib.Ptr(-1.0), + FloatValue: new(-1.0), }, } diff --git a/internal/scheduling/reservations/capacity/config.go b/internal/scheduling/reservations/capacity/config.go new file mode 100644 index 000000000..dc134e887 --- /dev/null +++ b/internal/scheduling/reservations/capacity/config.go @@ -0,0 +1,53 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Config holds configuration for the capacity controller. +type Config struct { + // ReconcileInterval is how often the controller probes the scheduler and updates CRDs. + ReconcileInterval metav1.Duration `json:"capacityReconcileInterval"` + + // TotalPipeline is the scheduler pipeline used for the empty-state probe. + // This pipeline should ignore current VM allocations (e.g. kvm-report-capacity). + TotalPipeline string `json:"capacityTotalPipeline"` + + // PlaceablePipeline is the scheduler pipeline used for the current-state probe. + // This pipeline considers current VM allocations to determine remaining placement capacity. + PlaceablePipeline string `json:"capacityPlaceablePipeline"` + + // SchedulerURL is the endpoint of the nova external scheduler. + SchedulerURL string `json:"schedulerURL"` +} + +// ApplyDefaults fills in any unset values with defaults. +func (c *Config) ApplyDefaults() { + defaults := DefaultConfig() + if c.ReconcileInterval.Duration == 0 { + c.ReconcileInterval = defaults.ReconcileInterval + } + if c.TotalPipeline == "" { + c.TotalPipeline = defaults.TotalPipeline + } + if c.PlaceablePipeline == "" { + c.PlaceablePipeline = defaults.PlaceablePipeline + } + if c.SchedulerURL == "" { + c.SchedulerURL = defaults.SchedulerURL + } +} + +func DefaultConfig() Config { + return Config{ + ReconcileInterval: metav1.Duration{Duration: 5 * time.Minute}, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose-load-balancing", + SchedulerURL: "http://localhost:8080/scheduler/nova/external", + } +} diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go new file mode 100644 index 000000000..eba8a9fec --- /dev/null +++ b/internal/scheduling/reservations/capacity/controller.go @@ -0,0 +1,340 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "context" + "fmt" + "hash/fnv" + "sort" + "strings" + "time" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/google/uuid" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + schedulerapi "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +var log = ctrl.Log.WithName("capacity-controller").WithValues("module", "capacity") + +// Controller reconciles FlavorGroupCapacity CRDs on a fixed interval. 
+// For each (flavor group × AZ) pair it probes all flavors in the group and updates the CRD status. +type Controller struct { + client client.Client + schedulerClient *reservations.SchedulerClient + config Config +} + +func NewController(c client.Client, config Config) *Controller { + return &Controller{ + client: c, + schedulerClient: reservations.NewSchedulerClient(config.SchedulerURL), + config: config, + } +} + +// Start runs the periodic reconcile loop. Implements manager.Runnable. +func (c *Controller) Start(ctx context.Context) error { + timer := time.NewTimer(0) // fire immediately on start + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-timer.C: + if err := c.reconcileAll(ctx); err != nil { + log.Error(err, "reconcile cycle failed") + } + timer.Reset(c.config.ReconcileInterval.Duration) + } + } +} + +// reconcileAll iterates all flavor groups × AZs and upserts FlavorGroupCapacity CRDs. +func (c *Controller) reconcileAll(ctx context.Context) error { + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + return fmt.Errorf("failed to get flavor groups: %w", err) + } + + var hvList hv1.HypervisorList + if err := c.client.List(ctx, &hvList); err != nil { + return fmt.Errorf("failed to list hypervisors: %w", err) + } + + hvByName := make(map[string]hv1.Hypervisor, len(hvList.Items)) + for _, hv := range hvList.Items { + hvByName[hv.Name] = hv + } + + azs := availabilityZones(hvList.Items) + + for groupName, groupData := range flavorGroups { + for _, az := range azs { + if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, hvList.Items); err != nil { + log.Error(err, "failed to reconcile flavor group capacity", + "flavorGroup", groupName, "az", az) + // Continue with other pairs rather than aborting the whole cycle. + } + } + } + return nil +} + +// reconcileOne updates the FlavorGroupCapacity CRD for one (group × AZ) pair. +func (c *Controller) reconcileOne( + ctx context.Context, + groupName string, + groupData compute.FlavorGroupFeature, + az string, + hvByName map[string]hv1.Hypervisor, + allHVs []hv1.Hypervisor, +) error { + + smallestFlavorBytes := int64(groupData.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec + if smallestFlavorBytes <= 0 { + return fmt.Errorf("smallest flavor %q has invalid memory %d MB", + groupData.SmallestFlavor.Name, groupData.SmallestFlavor.MemoryMB) + } + + crdName := crdNameFor(groupName, az) + + var existing v1alpha1.FlavorGroupCapacity + err := c.client.Get(ctx, types.NamespacedName{Name: crdName}, &existing) + if apierrors.IsNotFound(err) { + existing = v1alpha1.FlavorGroupCapacity{ + ObjectMeta: metav1.ObjectMeta{Name: crdName}, + Spec: v1alpha1.FlavorGroupCapacitySpec{ + FlavorGroup: groupName, + AvailabilityZone: az, + }, + } + if createErr := c.client.Create(ctx, &existing); createErr != nil { + return fmt.Errorf("failed to create FlavorGroupCapacity %s: %w", crdName, createErr) + } + } else if err != nil { + return fmt.Errorf("failed to get FlavorGroupCapacity %s: %w", crdName, err) + } + + // Build a lookup of existing per-flavor data so we can preserve stale values on probe failure. + existingByName := make(map[string]v1alpha1.FlavorCapacityStatus, len(existing.Status.Flavors)) + for _, f := range existing.Status.Flavors { + existingByName[f.FlavorName] = f + } + + // Probe all flavors in the group. Sort for stable CRD output. 
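+	// (A deterministic flavor order keeps Status.Flavors stable across
+	// reconcile cycles and avoids spurious status patches.)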
+ flavors := make([]compute.FlavorInGroup, len(groupData.Flavors)) + copy(flavors, groupData.Flavors) + sort.Slice(flavors, func(i, j int) bool { return flavors[i].Name < flavors[j].Name }) + + allFresh := true + newFlavors := make([]v1alpha1.FlavorCapacityStatus, 0, len(flavors)) + for _, flavor := range flavors { + cur := existingByName[flavor.Name] + cur.FlavorName = flavor.Name + + totalVMSlots, totalHosts, totalErr := c.probeScheduler(ctx, flavor, az, c.config.TotalPipeline, hvByName) + placeableVMs, placeableHosts, placeableErr := c.probeScheduler(ctx, flavor, az, c.config.PlaceablePipeline, hvByName) + + if totalErr != nil { + allFresh = false + } else { + cur.TotalCapacityVMSlots = totalVMSlots + cur.TotalCapacityHosts = totalHosts + } + if placeableErr != nil { + allFresh = false + } else { + cur.PlaceableVMs = placeableVMs + cur.PlaceableHosts = placeableHosts + } + newFlavors = append(newFlavors, cur) + } + + // Count total instances and committed capacity (always available regardless of probe results). + totalInstances := countInstancesInAZ(allHVs, az) + committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) + if committedErr != nil { + log.Error(committedErr, "failed to sum committed capacity", "flavorGroup", groupName, "az", az) + committedCapacity = 0 + } + + patch := client.MergeFrom(existing.DeepCopy()) + existing.Status.Flavors = newFlavors + existing.Status.TotalInstances = totalInstances + existing.Status.CommittedCapacity = committedCapacity + existing.Status.LastReconcileAt = metav1.Now() + + freshCondition := metav1.Condition{ + Type: v1alpha1.FlavorGroupCapacityConditionReady, + ObservedGeneration: existing.Generation, + } + if allFresh { + freshCondition.Status = metav1.ConditionTrue + freshCondition.Reason = "ReconcileSucceeded" + freshCondition.Message = "capacity data is up-to-date" + } else { + freshCondition.Status = metav1.ConditionFalse + freshCondition.Reason = "ReconcileFailed" + freshCondition.Message = "one or more flavor probes failed" + } + meta.SetStatusCondition(&existing.Status.Conditions, freshCondition) + + if patchErr := c.client.Status().Patch(ctx, &existing, patch); patchErr != nil { + return fmt.Errorf("failed to patch FlavorGroupCapacity %s status: %w", crdName, patchErr) + } + return nil +} + +// probeScheduler calls the scheduler with the given pipeline and returns VM slots + host count. +// Capacity is computed as sum of floor(hostMemory / flavorMemory) across returned hosts. +func (c *Controller) probeScheduler( + ctx context.Context, + flavor compute.FlavorInGroup, + az, pipeline string, + hvByName map[string]hv1.Hypervisor, +) (capacity, hosts int64, err error) { + + flavorBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec + if flavorBytes <= 0 { + return 0, 0, fmt.Errorf("flavor %q has invalid memory %d MB", flavor.Name, flavor.MemoryMB) + } + + // Build EligibleHosts from all known hypervisors so that novaLimitHostsToRequest + // (which filters the response to hosts present in the request) does not zero out + // the result. The AZ filter in the pipeline handles narrowing to the correct AZ. 
+ eligibleHosts := make([]schedulerapi.ExternalSchedulerHost, 0, len(hvByName)) + for name := range hvByName { + eligibleHosts = append(eligibleHosts, schedulerapi.ExternalSchedulerHost{ComputeHost: name}) + } + + resp, err := c.schedulerClient.ScheduleReservation(ctx, reservations.ScheduleReservationRequest{ + InstanceUUID: uuid.New().String(), + ProjectID: "cortex-capacity-probe", + FlavorName: flavor.Name, + MemoryMB: flavor.MemoryMB, + VCPUs: flavor.VCPUs, + FlavorExtraSpecs: flavor.ExtraSpecs, + AvailabilityZone: az, + Pipeline: pipeline, + EligibleHosts: eligibleHosts, + }) + if err != nil { + return 0, 0, fmt.Errorf("scheduler call failed (pipeline=%s): %w", pipeline, err) + } + + hosts = int64(len(resp.Hosts)) + for _, hostName := range resp.Hosts { + hv, ok := hvByName[hostName] + if !ok { + continue + } + effectiveCap := hv.Status.EffectiveCapacity + if effectiveCap == nil { + effectiveCap = hv.Status.Capacity + } + if effectiveCap == nil { + continue + } + memCap, ok := effectiveCap[hv1.ResourceMemory] + if !ok { + continue + } + if capBytes := memCap.Value(); capBytes > 0 { + capacity += capBytes / flavorBytes + } + } + return capacity, hosts, nil +} + +// sumCommittedCapacity sums AcceptedSpec.Amount (or Spec.Amount as fallback) across all +// CommittedResource CRDs for the given (flavorGroup, az) pair with an active state +// (guaranteed or confirmed) and resource type memory. Returns the total in slots. +func (c *Controller) sumCommittedCapacity(ctx context.Context, groupName, az string, smallestFlavorBytes int64) (int64, error) { + var list v1alpha1.CommittedResourceList + if err := c.client.List(ctx, &list); err != nil { + return 0, fmt.Errorf("failed to list CommittedResources: %w", err) + } + + var total int64 + for _, cr := range list.Items { + if cr.Spec.FlavorGroupName != groupName { + continue + } + if cr.Spec.AvailabilityZone != az { + continue + } + if cr.Spec.ResourceType != v1alpha1.CommittedResourceTypeMemory { + continue + } + if cr.Spec.State != v1alpha1.CommitmentStatusGuaranteed && cr.Spec.State != v1alpha1.CommitmentStatusConfirmed { + continue + } + amount := cr.Spec.Amount + if cr.Status.AcceptedSpec != nil { + amount = cr.Status.AcceptedSpec.Amount + } + if bytes := amount.Value(); bytes > 0 { + total += bytes / smallestFlavorBytes + } + } + return total, nil +} + +// availabilityZones returns a sorted, deduplicated list of AZs from Hypervisor CRD labels. +func availabilityZones(hvs []hv1.Hypervisor) []string { + azSet := make(map[string]struct{}) + for _, hv := range hvs { + if az, ok := hv.Labels["topology.kubernetes.io/zone"]; ok && az != "" { + azSet[az] = struct{}{} + } + } + azs := make([]string, 0, len(azSet)) + for az := range azSet { + azs = append(azs, az) + } + sort.Strings(azs) + return azs +} + +// countInstancesInAZ counts total VM instances across all hypervisors in the given AZ. +func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 { + var total int64 + for _, hv := range hvs { + if hv.Labels["topology.kubernetes.io/zone"] != az { + continue + } + total += int64(len(hv.Status.Instances)) + } + return total +} + +// crdNameFor produces a collision-safe DNS label for a (flavorGroup, az) pair. +// A 6-hex-char FNV-1a hash of the raw inputs is appended so that pairs differing only +// by characters that sanitise identically (e.g. "." vs "-") still get unique names. 
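+// For example, ("hana.v2", "qa-de-1a") and ("hana-v2", "qa-de-1a") both
+// sanitise to the prefix "hana-v2-qa-de-1a-" but receive different suffixes.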
+func crdNameFor(flavorGroup, az string) string { + h := fnv.New32a() + _, _ = h.Write([]byte(flavorGroup + "\x00" + az)) + suffix := fmt.Sprintf("%06x", h.Sum32()&0xFFFFFF) + + prefix := strings.ToLower(flavorGroup + "-" + az) + prefix = strings.ReplaceAll(prefix, "_", "-") + prefix = strings.ReplaceAll(prefix, ".", "-") + if len(prefix) > 56 { // 56 + "-" + 6 = 63 chars (DNS label limit) + prefix = prefix[:56] + } + return prefix + "-" + suffix +} diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go new file mode 100644 index 000000000..69a4e80bb --- /dev/null +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -0,0 +1,607 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "regexp" + "sort" + "testing" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + schedulerapi "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +// newTestScheme returns a runtime.Scheme with all required types registered. +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := v1alpha1.AddToScheme(s); err != nil { + t.Fatalf("failed to add v1alpha1 scheme: %v", err) + } + if err := hv1.AddToScheme(s); err != nil { + t.Fatalf("failed to add hypervisor scheme: %v", err) + } + return s +} + +// newFlavorGroupKnowledge creates a ready Knowledge CRD with a single flavor group. +func newFlavorGroupKnowledge(t *testing.T, groupName string, smallestMemoryMB uint64) *v1alpha1.Knowledge { + t.Helper() + smallestFlavor := compute.FlavorInGroup{ + Name: groupName + "-small", + MemoryMB: smallestMemoryMB, + VCPUs: 2, + ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, + } + features := []compute.FlavorGroupFeature{ + { + Name: groupName, + SmallestFlavor: smallestFlavor, + Flavors: []compute.FlavorInGroup{smallestFlavor}, + }, + } + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("failed to box features: %v", err) + } + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: raw, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "ExtractorSucceeded", + }, + }, + }, + } +} + +// newHypervisor creates a Hypervisor CRD with a topology AZ label and effective capacity. 
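+// Passing memoryBytes = 0 leaves EffectiveCapacity unset, which is convenient
+// for tests that only need the AZ label or instance counts.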
+func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *hv1.Hypervisor { + hv := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{"topology.kubernetes.io/zone": az}, + }, + } + if memoryBytes > 0 { + qty := resource.NewQuantity(memoryBytes, resource.BinarySI) + hv.Status.EffectiveCapacity = map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *qty, + } + } + for _, id := range instanceIDs { + hv.Status.Instances = append(hv.Status.Instances, hv1.Instance{ID: id}) + } + return hv +} + +// newMockSchedulerServer creates an httptest server that always returns the given host list. +func newMockSchedulerServer(t *testing.T, hosts []string) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + resp := schedulerapi.ExternalSchedulerResponse{Hosts: hosts} + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("mock scheduler: failed to encode response: %v", err) + } + })) +} + +// --- unit tests for pure helper functions --- + +var ( + dnsLabelRE = regexp.MustCompile(`^[a-z0-9][a-z0-9-]{0,61}[a-z0-9]$`) + hashSuffixRE = regexp.MustCompile(`^[0-9a-f]{6}$`) +) + +func TestCrdNameFor(t *testing.T) { + tests := []struct { + group, az string + wantPrefix string + }{ + {"hana-v2", "qa-de-1a", "hana-v2-qa-de-1a-"}, + {"My_Group", "eu.west.1", "my-group-eu-west-1-"}, + {"G", "AZ_1", "g-az-1-"}, + } + for _, tt := range tests { + got := crdNameFor(tt.group, tt.az) + // Must be a valid DNS label (lowercase, hyphens, ≤63 chars). + if len(got) > 63 { + t.Errorf("crdNameFor(%q, %q) = %q (len=%d > 63)", tt.group, tt.az, got, len(got)) + } + if !dnsLabelRE.MatchString(got) { + t.Errorf("crdNameFor(%q, %q) = %q is not a valid DNS label", tt.group, tt.az, got) + } + // Must start with the expected sanitised prefix followed by a 6-hex-char hash suffix. + if len(got) < len(tt.wantPrefix)+6 || got[:len(tt.wantPrefix)] != tt.wantPrefix { + t.Errorf("crdNameFor(%q, %q) = %q, want prefix %q + 6 hex chars", tt.group, tt.az, got, tt.wantPrefix) + } + hashPart := got[len(tt.wantPrefix):] + if !hashSuffixRE.MatchString(hashPart) { + t.Errorf("crdNameFor(%q, %q) hash suffix %q is not 6 hex chars", tt.group, tt.az, hashPart) + } + } + + // Inputs that differ only by "." vs "-" must produce different CRD names. 
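+	// (Both sanitise to the prefix "hana-v2-qa-de-1a-", so only the FNV hash
+	// suffix distinguishes them.)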
+ dotName := crdNameFor("hana.v2", "qa-de-1a") + dashName := crdNameFor("hana-v2", "qa-de-1a") + if dotName == dashName { + t.Errorf("crdNameFor collision: hana.v2 and hana-v2 both produced %q", dotName) + } +} + +func TestAvailabilityZones(t *testing.T) { + hvs := []hv1.Hypervisor{ + *newHypervisor("h1", "az-a", 0), + *newHypervisor("h2", "az-b", 0), + *newHypervisor("h3", "az-a", 0), // duplicate + {ObjectMeta: metav1.ObjectMeta{Name: "h4"}}, // no label + } + got := availabilityZones(hvs) + want := []string{"az-a", "az-b"} + if len(got) != len(want) { + t.Fatalf("availabilityZones() = %v, want %v", got, want) + } + sort.Strings(got) + for i := range want { + if got[i] != want[i] { + t.Errorf("availabilityZones()[%d] = %q, want %q", i, got[i], want[i]) + } + } +} + +func TestCountInstancesInAZ(t *testing.T) { + hvs := []hv1.Hypervisor{ + *newHypervisor("h1", "az-a", 0, "vm1", "vm2"), + *newHypervisor("h2", "az-a", 0, "vm3"), + *newHypervisor("h3", "az-b", 0, "vm4"), + } + if got := countInstancesInAZ(hvs, "az-a"); got != 3 { + t.Errorf("countInstancesInAZ(az-a) = %d, want 3", got) + } + if got := countInstancesInAZ(hvs, "az-b"); got != 1 { + t.Errorf("countInstancesInAZ(az-b) = %d, want 1", got) + } + if got := countInstancesInAZ(hvs, "az-c"); got != 0 { + t.Errorf("countInstancesInAZ(az-c) = %d, want 0", got) + } +} + +// --- integration-style tests for reconcileOne --- + +func TestReconcileOne_CreatesCRD(t *testing.T) { + const ( + groupName = "hana-v2" + az = "qa-de-1a" + memMB = 4096 // 4 GiB + memBytes = int64(memMB) * 1024 * 1024 + ) + + scheme := newTestScheme(t) + hv := newHypervisor("host-1", az, memBytes, "vm1") + knowledge := newFlavorGroupKnowledge(t, groupName, memMB) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, hv). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). 
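+		// Registering the status subresource lets code under test call Status().Update()
+		// against the fake client, mirroring real apiserver behaviour.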
+ Build() + + // Both probes return host-1 so capacity = floor(4GiB/4GiB) = 1 + schedulerServer := newMockSchedulerServer(t, []string{"host-1"}) + defer schedulerServer.Close() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: schedulerServer.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + smallFlavor := compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB, VCPUs: 2} + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: smallFlavor, + Flavors: []compute.FlavorInGroup{smallFlavor}, + } + hvByName := map[string]hv1.Hypervisor{"host-1": *hv} + + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}); err != nil { + t.Fatalf("reconcileOne failed: %v", err) + } + + var crd v1alpha1.FlavorGroupCapacity + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdNameFor(groupName, az)}, &crd); err != nil { + t.Fatalf("failed to get CRD: %v", err) + } + if len(crd.Status.Flavors) != 1 { + t.Fatalf("len(Status.Flavors) = %d, want 1", len(crd.Status.Flavors)) + } + f := crd.Status.Flavors[0] + if f.FlavorName != groupName+"-small" { + t.Errorf("FlavorName = %q, want %q", f.FlavorName, groupName+"-small") + } + if f.TotalCapacityVMSlots != 1 { + t.Errorf("TotalCapacityVMSlots = %d, want 1", f.TotalCapacityVMSlots) + } + if f.TotalCapacityHosts != 1 { + t.Errorf("TotalCapacityHosts = %d, want 1", f.TotalCapacityHosts) + } + if f.PlaceableVMs != 1 { + t.Errorf("PlaceableVMs = %d, want 1", f.PlaceableVMs) + } + if f.PlaceableHosts != 1 { + t.Errorf("PlaceableHosts = %d, want 1", f.PlaceableHosts) + } + if crd.Status.TotalInstances != 1 { + t.Errorf("TotalInstances = %d, want 1", crd.Status.TotalInstances) + } +} + +func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { + const ( + groupName = "hana-v2" + az = "qa-de-1a" + memMB = 2048 + ) + + scheme := newTestScheme(t) + knowledge := newFlavorGroupKnowledge(t, groupName, memMB) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). 
+ Build() + + // Scheduler returns 500 to simulate error + failServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer failServer.Close() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: failServer.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + smallFlavor := compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB, VCPUs: 2} + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: smallFlavor, + Flavors: []compute.FlavorInGroup{smallFlavor}, + } + + // reconcileOne returns no error itself (it continues on probe failure), but sets Ready=False + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, []hv1.Hypervisor{}); err != nil { + t.Fatalf("reconcileOne failed: %v", err) + } + + var crd v1alpha1.FlavorGroupCapacity + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdNameFor(groupName, az)}, &crd); err != nil { + t.Fatalf("failed to get CRD: %v", err) + } + + var freshStatus metav1.ConditionStatus + for _, c := range crd.Status.Conditions { + if c.Type == v1alpha1.FlavorGroupCapacityConditionReady { + freshStatus = c.Status + } + } + if freshStatus != metav1.ConditionFalse { + t.Errorf("Ready condition = %q, want %q", freshStatus, metav1.ConditionFalse) + } +} + +func TestReconcileOne_IdempotentUpdate(t *testing.T) { + const ( + groupName = "hana-v2" + az = "qa-de-1a" + memMB = 2048 + memBytes = int64(memMB) * 1024 * 1024 + ) + + scheme := newTestScheme(t) + hv := newHypervisor("host-1", az, memBytes) + knowledge := newFlavorGroupKnowledge(t, groupName, memMB) + crdName := crdNameFor(groupName, az) + + // Pre-create the CRD to test the update path (not create path) + existing := &v1alpha1.FlavorGroupCapacity{ + ObjectMeta: metav1.ObjectMeta{Name: crdName}, + Spec: v1alpha1.FlavorGroupCapacitySpec{ + FlavorGroup: groupName, + AvailabilityZone: az, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, hv, existing). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). 
+ Build() + + schedulerServer := newMockSchedulerServer(t, []string{"host-1"}) + defer schedulerServer.Close() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: schedulerServer.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + smallFlavor := compute.FlavorInGroup{Name: groupName + "-small", MemoryMB: memMB, VCPUs: 2} + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: smallFlavor, + Flavors: []compute.FlavorInGroup{smallFlavor}, + } + hvByName := map[string]hv1.Hypervisor{"host-1": *hv} + + // First call + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}); err != nil { + t.Fatalf("first reconcileOne failed: %v", err) + } + // Second call — should not error on the already-existing CRD + if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}); err != nil { + t.Fatalf("second reconcileOne failed: %v", err) + } + + var crd v1alpha1.FlavorGroupCapacity + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdName}, &crd); err != nil { + t.Fatalf("failed to get CRD: %v", err) + } + if len(crd.Status.Flavors) != 1 { + t.Fatalf("len(Status.Flavors) = %d, want 1", len(crd.Status.Flavors)) + } + if crd.Status.Flavors[0].TotalCapacityVMSlots != 1 { + t.Errorf("TotalCapacityVMSlots = %d, want 1", crd.Status.Flavors[0].TotalCapacityVMSlots) + } +} + +func TestReconcileAll_SkipsGroupsWithNoAZs(t *testing.T) { + scheme := newTestScheme(t) + knowledge := newFlavorGroupKnowledge(t, "hana-v2", 2048) + + // No hypervisors → no AZs → reconcileAll returns without error + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). 
+ Build() + + ctrl := NewController(fakeClient, Config{ + SchedulerURL: "http://localhost:9999", // unreachable; not called + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + if err := ctrl.reconcileAll(context.Background()); err != nil { + t.Errorf("reconcileAll with no hypervisors returned error: %v", err) + } + + var list v1alpha1.FlavorGroupCapacityList + if err := fakeClient.List(context.Background(), &list); err != nil { + t.Fatalf("failed to list CRDs: %v", err) + } + if len(list.Items) != 0 { + t.Errorf("expected 0 CRDs, got %d", len(list.Items)) + } +} + +func TestProbeScheduler_CapacityCalculation(t *testing.T) { + const memMB = 4096 + const memBytes = int64(memMB) * 1024 * 1024 + + scheme := newTestScheme(t) + hv1Obj := newHypervisor("host-1", "az-a", memBytes) + hv2Obj := newHypervisor("host-2", "az-a", memBytes*2) // 2x memory + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + + // Scheduler returns both hosts + srv := newMockSchedulerServer(t, []string{"host-1", "host-2"}) + defer srv.Close() + + c := NewController(fakeClient, Config{SchedulerURL: srv.URL}) + hvByName := map[string]hv1.Hypervisor{ + "host-1": *hv1Obj, + "host-2": *hv2Obj, + } + flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} + + capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName) + if err != nil { + t.Fatalf("probeScheduler failed: %v", err) + } + if hosts != 2 { + t.Errorf("hosts = %d, want 2", hosts) + } + // host-1 = 1 slot (4GiB/4GiB), host-2 = 2 slots (8GiB/4GiB) + if capacity != 3 { + t.Errorf("capacity = %d, want 3", capacity) + } +} + +func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { + scheme := newTestScheme(t) + + const memMB = 2048 + const memBytes = int64(memMB) * 1024 * 1024 + + // Two AZs, two hypervisors + hv1Obj := newHypervisor("h1", "az-a", memBytes) + hv2Obj := newHypervisor("h2", "az-b", memBytes) + knowledge := newFlavorGroupKnowledge(t, "2152", memMB) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, hv1Obj, hv2Obj). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). 
+ Build() + + srv := newMockSchedulerServer(t, []string{}) + defer srv.Close() + + c := NewController(fakeClient, Config{ + SchedulerURL: srv.URL, + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + if err := c.reconcileAll(context.Background()); err != nil { + t.Fatalf("reconcileAll failed: %v", err) + } + + // Expect one CRD per AZ for the single group + var list v1alpha1.FlavorGroupCapacityList + if err := fakeClient.List(context.Background(), &list); err != nil { + t.Fatalf("failed to list CRDs: %v", err) + } + if len(list.Items) != 2 { + names := make([]string, len(list.Items)) + for i, item := range list.Items { + names[i] = item.Name + } + t.Errorf("expected 2 CRDs (one per AZ), got %d: %v", len(list.Items), names) + } +} + +func TestReconcileAll_FlavorGroupsKnowledgeNotReady(t *testing.T) { + scheme := newTestScheme(t) + + // Knowledge CRD exists but is not Ready + knowledge := &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + }, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(&v1alpha1.Knowledge{}). + Build() + + c := NewController(fakeClient, Config{ + SchedulerURL: "http://localhost:9999", + TotalPipeline: "kvm-report-capacity", + PlaceablePipeline: "kvm-general-purpose", + }) + + // Should return an error when knowledge is not ready + if err := c.reconcileAll(context.Background()); err == nil { + t.Error("reconcileAll should fail when flavor groups knowledge is not ready") + } +} + +func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) { + scheme := newTestScheme(t) + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + c := NewController(fakeClient, Config{}) + + groupData := compute.FlavorGroupFeature{ + SmallestFlavor: compute.FlavorInGroup{Name: "bad-flavor", MemoryMB: 0}, + } + err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil) + if err == nil { + t.Error("expected error for zero-memory flavor") + } +} + +// Verify that the module-level log variable from reservations package doesn't +// collide with the one in this package. 
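+// The call's result is discarded; the test only needs this package and the
+// reservations package to compile and link together.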
+func TestPackageLogVar(t *testing.T) { + _ = reservations.NewSchedulerClient("http://localhost") +} + +func TestSumCommittedCapacity(t *testing.T) { + const ( + groupName = "hana-v2" + az = "qa-de-1a" + memMB = 4096 + memBytes = int64(memMB) * 1024 * 1024 + ) + + newCR := func(name, group, zone string, state v1alpha1.CommitmentStatus, resType v1alpha1.CommittedResourceType, amount string, acceptedAmount string) *v1alpha1.CommittedResource { + qty := resource.MustParse(amount) + cr := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.CommittedResourceSpec{ + FlavorGroupName: group, + AvailabilityZone: zone, + State: state, + ResourceType: resType, + Amount: qty, + }, + } + if acceptedAmount != "" { + accepted := resource.MustParse(acceptedAmount) + cr.Status.AcceptedSpec = &v1alpha1.CommittedResourceSpec{ + Amount: accepted, + } + } + return cr + } + + scheme := newTestScheme(t) + objects := []client.Object{ + // Should count: confirmed, memory, right group+AZ, AcceptedAmount set + newCR("cr1", groupName, az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "8Gi", "8Gi"), + // Should count: guaranteed, memory, right group+AZ, no AcceptedAmount → falls back to Spec.Amount + newCR("cr2", groupName, az, v1alpha1.CommitmentStatusGuaranteed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + // Should NOT count: wrong state + newCR("cr3", groupName, az, v1alpha1.CommitmentStatusPlanned, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + // Should NOT count: wrong resource type + newCR("cr4", groupName, az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeCores, "4Gi", ""), + // Should NOT count: wrong AZ + newCR("cr5", groupName, "other-az", v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + // Should NOT count: wrong flavor group + newCR("cr6", "other-group", az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + Build() + + c := NewController(fakeClient, Config{}) + // smallestFlavorBytes = 4GiB → cr1 = 8GiB/4GiB = 2 slots, cr2 = 4GiB/4GiB = 1 slot → total = 3 + got, err := c.sumCommittedCapacity(context.Background(), groupName, az, memBytes) + if err != nil { + t.Fatalf("sumCommittedCapacity failed: %v", err) + } + if got != 3 { + t.Errorf("sumCommittedCapacity = %d, want 3", got) + } +} diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go new file mode 100644 index 000000000..bd13ca7ca --- /dev/null +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -0,0 +1,118 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package capacity + +import ( + "context" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var ( + capacityLabels = []string{"flavor_group", "az"} + capacityFlavorLabels = []string{"flavor_group", "az", "flavor_name"} +) + +// Monitor provides Prometheus metrics for FlavorGroupCapacity CRDs. +// It implements prometheus.Collector and reads CRD status on each Collect call. 
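+// Listing at scrape time keeps the gauges consistent with the cluster without a
+// separate refresh loop, at the cost of one List call per scrape.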
+type Monitor struct { + client client.Client + totalCapacityVMSlots *prometheus.GaugeVec + placeableVMs *prometheus.GaugeVec + totalCapacityHosts *prometheus.GaugeVec + placeableHosts *prometheus.GaugeVec + totalInstances *prometheus.GaugeVec + committedCapacity *prometheus.GaugeVec +} + +// NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. +func NewMonitor(c client.Client) Monitor { + return Monitor{ + client: c, + totalCapacityVMSlots: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_total", + Help: "Total schedulable slots in an empty-datacenter scenario per flavor.", + }, capacityFlavorLabels), + placeableVMs: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_placeable", + Help: "Schedulable slots remaining given current VM allocations per flavor.", + }, capacityFlavorLabels), + totalCapacityHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_hosts_total", + Help: "Number of hosts eligible for this flavor in the empty-state probe.", + }, capacityFlavorLabels), + placeableHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_hosts_placeable", + Help: "Number of hosts still able to accept a new VM of this flavor.", + }, capacityFlavorLabels), + totalInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_instances", + Help: "Total VM instances running on hypervisors in this AZ (not filtered by flavor group).", + }, capacityLabels), + committedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_committed", + Help: "Sum of AcceptedAmount across Ready CommittedResource CRDs for this flavor group and AZ.", + }, capacityLabels), + } +} + +// Describe implements prometheus.Collector. +func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { + m.totalCapacityVMSlots.Describe(ch) + m.placeableVMs.Describe(ch) + m.totalCapacityHosts.Describe(ch) + m.placeableHosts.Describe(ch) + m.totalInstances.Describe(ch) + m.committedCapacity.Describe(ch) +} + +// Collect implements prometheus.Collector — lists all FlavorGroupCapacity CRDs and exports gauges. +func (m *Monitor) Collect(ch chan<- prometheus.Metric) { + var list v1alpha1.FlavorGroupCapacityList + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := m.client.List(ctx, &list); err != nil { + log.Error(err, "failed to list FlavorGroupCapacity CRDs for metrics") + return + } + + // Reset all gauges so deleted CRDs don't linger. 
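+	// Reset drops every labelled series; the loop below re-populates only those
+	// whose CRDs still exist.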
+ m.totalCapacityVMSlots.Reset() + m.placeableVMs.Reset() + m.totalCapacityHosts.Reset() + m.placeableHosts.Reset() + m.totalInstances.Reset() + m.committedCapacity.Reset() + + for _, crd := range list.Items { + groupAZLabels := prometheus.Labels{ + "flavor_group": crd.Spec.FlavorGroup, + "az": crd.Spec.AvailabilityZone, + } + m.totalInstances.With(groupAZLabels).Set(float64(crd.Status.TotalInstances)) + m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) + + for _, f := range crd.Status.Flavors { + flavorLabels := prometheus.Labels{ + "flavor_group": crd.Spec.FlavorGroup, + "az": crd.Spec.AvailabilityZone, + "flavor_name": f.FlavorName, + } + m.totalCapacityVMSlots.With(flavorLabels).Set(float64(f.TotalCapacityVMSlots)) + m.placeableVMs.With(flavorLabels).Set(float64(f.PlaceableVMs)) + m.totalCapacityHosts.With(flavorLabels).Set(float64(f.TotalCapacityHosts)) + m.placeableHosts.With(flavorLabels).Set(float64(f.PlaceableHosts)) + } + } + + m.totalCapacityVMSlots.Collect(ch) + m.placeableVMs.Collect(ch) + m.totalCapacityHosts.Collect(ch) + m.placeableHosts.Collect(ch) + m.totalInstances.Collect(ch) + m.committedCapacity.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api/change_commitments.go b/internal/scheduling/reservations/commitments/api/change_commitments.go index b7783b599..fd821799b 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments.go @@ -24,6 +24,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -40,6 +41,13 @@ func sortedKeys[K ~string, V any](m map[K]V) []K { return keys } +// crWatch pairs a CRD name with the generation written by the API so the polling loop +// can skip cache reads that have not yet reflected the write (stale-cache guard). +type crWatch struct { + name string + generation int64 +} + // crSnapshot captures a CommittedResource CRD's prior state for batch rollback. // prevSpec is nil when the CRD was newly created (i.e. did not exist before the batch). // wasDeleted is true when the batch operation deleted the CRD; rollback must re-create it. @@ -156,7 +164,7 @@ func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.Respons allowRejection := req.RequiresConfirmation() var ( - toWatch []string // CRD names to poll for terminal conditions (upserts only) + toWatch []crWatch // CRD names + expected generations to poll for terminal conditions (upserts only) snapshots []crSnapshot // ordered list for deterministic rollback failedReason string rollback bool @@ -199,7 +207,7 @@ ProcessLoop: isDelete := commitment.NewStatus.IsNone() crName := "commitment-" + string(commitment.UUID) - logger.V(1).Info("processing commitment", + logger.Info("processing commitment", "commitmentUUID", commitment.UUID, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none"), @@ -223,15 +231,20 @@ ProcessLoop: if isDelete { // Limes is removing this commitment; delete the CRD if it exists. 
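+			// Record the snapshot before deleting so rollback can re-create the CRD
+			// even when the batch fails partway through.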
snap.wasDeleted = true + snapshots = append(snapshots, snap) if snap.prevSpec != nil { if err := api.client.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { failedReason = fmt.Sprintf("commitment %s: failed to delete CommittedResource CRD: %v", commitment.UUID, err) rollback = true break ProcessLoop } + if err := commitments.DeleteChildReservations(ctx, api.client, existing); err != nil { + failedReason = fmt.Sprintf("commitment %s: failed to delete child reservations: %v", commitment.UUID, err) + rollback = true + break ProcessLoop + } logger.V(1).Info("deleted CommittedResource CRD", "name", crName) } - snapshots = append(snapshots, snap) continue } @@ -246,6 +259,10 @@ ProcessLoop: cr := &v1alpha1.CommittedResource{} cr.Name = crName if _, err := controllerutil.CreateOrUpdate(ctx, api.client, cr, func() error { + if cr.Spec.AvailabilityZone != "" && cr.Spec.AvailabilityZone != stateDesired.AvailabilityZone { + return fmt.Errorf("cannot change availability zone of commitment %s: current=%q requested=%q", + commitment.UUID, cr.Spec.AvailabilityZone, stateDesired.AvailabilityZone) + } applyCRSpec(cr, stateDesired, allowRejection) if cr.Annotations == nil { cr.Annotations = make(map[string]string) @@ -258,7 +275,7 @@ ProcessLoop: break ProcessLoop } - toWatch = append(toWatch, crName) + toWatch = append(toWatch, crWatch{name: crName, generation: cr.Generation}) snapshots = append(snapshots, snap) logger.V(1).Info("upserted CommittedResource CRD", "name", crName) } @@ -289,9 +306,9 @@ ProcessLoop: case len(rejected) > 0: var b strings.Builder fmt.Fprintf(&b, "%d commitment(s) failed to apply:", len(rejected)) - for _, crName := range toWatch { // iterate toWatch for deterministic order - if reason, ok := rejected[crName]; ok { - fmt.Fprintf(&b, "\n- commitment %s: %s", strings.TrimPrefix(crName, "commitment-"), reason) + for _, w := range toWatch { // iterate toWatch for deterministic order + if reason, ok := rejected[w.name]; ok { + fmt.Fprintf(&b, "\n- commitment %s: %s", strings.TrimPrefix(w.name, "commitment-"), reason) } } failedReason = b.String() @@ -326,26 +343,32 @@ ProcessLoop: // - Ready=False, Reason=Planned — success; controller reserves capacity at activation time // - Ready=False, Reason=Rejected — failure; reason reported to caller // +// Each entry in watches carries the generation written by the API. The polling loop skips any +// cache read whose generation is older than that value, preventing a stale Ready=True (or +// Ready=False/Rejected) condition from a prior reconcile cycle from being mistaken for the +// outcome of the current write. +// // Returns a map of crName → rejection reason for failed CRDs, and any polling errors (e.g. timeout). func watchCRsUntilReady( ctx context.Context, logger logr.Logger, k8sClient client.Client, - crNames []string, + watches []crWatch, timeout time.Duration, pollInterval time.Duration, ) (rejected map[string]string, errs []error) { - if len(crNames) == 0 { + if len(watches) == 0 { return nil, nil } rejected = make(map[string]string) deadline := time.Now().Add(timeout) - pending := make(map[string]struct{}, len(crNames)) - for _, name := range crNames { - pending[name] = struct{}{} + // pending maps CR name → the minimum generation the cache must show before we trust conditions. 
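+	// Generation only ever increases, so a cache read at or above this value is
+	// guaranteed to reflect the spec we wrote.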
+ pending := make(map[string]int64, len(watches)) + for _, w := range watches { + pending[w.name] = w.generation } for { @@ -354,23 +377,49 @@ func watchCRsUntilReady( return rejected, errs } - for name := range pending { + for name, expectedGen := range pending { cr := &v1alpha1.CommittedResource{} if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, cr); err != nil { continue // transient; keep waiting } + // The informer cache may not have caught up with the spec write yet. Until the + // cache reflects at least the generation we wrote, any condition we read belongs + // to an older spec version and must not be treated as terminal. + if cr.Generation < expectedGen { + logger.V(1).Info("cache not yet reflecting write, skipping", + "name", name, + "cacheGeneration", cr.Generation, + "expectedGeneration", expectedGen, + ) + continue + } + cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) if cond == nil { continue // controller hasn't reconciled yet } + // Skip conditions stamped by a prior reconcile: ObservedGeneration < Generation means + // the condition reflects an older spec version and must not be treated as terminal. + if cond.ObservedGeneration < cr.Generation { + logger.V(1).Info("skipping stale condition on CommittedResource", + "name", name, + "generation", cr.Generation, + "conditionObservedGeneration", cond.ObservedGeneration, + "reason", cond.Reason, + ) + continue + } switch { case cond.Status == metav1.ConditionTrue: + logger.Info("CommittedResource accepted", "name", name) delete(pending, name) case cond.Status == metav1.ConditionFalse && cond.Reason == v1alpha1.CommittedResourceReasonPlanned: + logger.Info("CommittedResource planned (will reserve at activation)", "name", name) delete(pending, name) // planned = accepted; controller will reserve at activation case cond.Status == metav1.ConditionFalse && cond.Reason == v1alpha1.CommittedResourceReasonRejected: + logger.Info("CommittedResource rejected", "name", name, "reason", cond.Message) delete(pending, name) rejected[name] = cond.Message // Reason=Reserving: controller is placing slots; keep waiting. @@ -418,15 +467,21 @@ func rollbackCR(ctx context.Context, logger logr.Logger, k8sClient client.Client return } - cr := &v1alpha1.CommittedResource{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: snap.crName}, cr); err != nil { - logger.Error(err, "failed to fetch CommittedResource CRD for rollback", "name", snap.crName) - return - } - cr.Spec = *snap.prevSpec - if err := k8sClient.Update(ctx, cr); err != nil { + // The controller may write status (bumping resourceVersion) between our Get and Update. + // RetryOnConflict retries with exponential backoff when that race occurs. + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + cr := &v1alpha1.CommittedResource{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: snap.crName}, cr); err != nil { + return err + } + cr.Spec = *snap.prevSpec + return k8sClient.Update(ctx, cr) + }) + if err != nil { logger.Error(err, "failed to restore CommittedResource CRD spec during rollback", "name", snap.crName) + return } + logger.V(1).Info("restored CommittedResource CRD spec during rollback", "name", snap.crName) } // applyCRSpec writes CommitmentState fields into a CommittedResource CRD spec. 
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go index 3f19b4857..c0be4ad7a 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go @@ -174,7 +174,7 @@ func (e *e2eEnv) driveReconciles(ctx context.Context) { } // reconcileAll drives one round of reconciles: -// 1. CR pass 1 — adds finalizer and creates Reservation CRDs. +// 1. CR pass 1 — creates Reservation CRDs based on current state. // 2. Reservation pass — calls the scheduler, sets TargetHost (first reconcile) then Ready=True (second). // 3. CR pass 2 — re-fetches each CR and picks up Reservation outcomes (placed or rejected). // @@ -225,11 +225,9 @@ func (e *e2eEnv) reconcileAll(ctx context.Context) { // e2eIsTerminalCR returns true for states the API polling loop treats as final: // Accepted (Ready=True), Rejected, or Planned. -// CRs with DeletionTimestamp are never terminal here: they need one more reconcile to remove -// their finalizer (set by the controller on first reconcile) so the fake client can delete them. func e2eIsTerminalCR(cr v1alpha1.CommittedResource) bool { if !cr.DeletionTimestamp.IsZero() { - return false + return true } cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) if cond == nil { @@ -243,7 +241,6 @@ func e2eIsTerminalCR(cr v1alpha1.CommittedResource) bool { } // waitForCRAbsent polls until the named CommittedResource no longer exists or the 1s deadline passes. -// Used after rollback calls because the finalizer removal happens asynchronously in the background reconcile loop. func (e *e2eEnv) waitForCRAbsent(t *testing.T, crName string) { t.Helper() deadline := time.Now().Add(1 * time.Second) diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_test.go index a98e840aa..999ddd240 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_test.go @@ -130,6 +130,23 @@ func TestHandleChangeCommitments(t *testing.T) { ExpectedAPIResponse: newAPIResponse("uuid-b: not sufficient capacity"), ExpectedDeletedCRs: []string{"commitment-uuid-a", "commitment-uuid-b"}, }, + // --- AZ immutability --- + { + // AZ is immutable once set on a CommittedResource. Attempting to change it via + // change-commitments must be rejected immediately, before any polling or controller + // interaction, and the CR must remain at its original spec. + Name: "AZ change on existing CR: must be rejected", + Flavors: []*TestFlavor{m1Small}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-az-stale", State: v1alpha1.CommitmentStatusConfirmed, + AmountMiB: 1024, ProjectID: "project-A", AZ: "az-old", ReadyCondition: true}, + }, + CommitmentRequest: newCommitmentRequest("az-new", false, 1234, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-az-stale", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse("cannot change availability zone"), + // CR spec must not have changed. 
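+			// 1024 MiB from ExistingCRs, expressed in bytes.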
+ ExpectedCRSpecs: map[string]int64{"commitment-uuid-az-stale": 1024 * 1024 * 1024}, + }, // --- Timeout --- { Name: "Timeout: no condition set → rollback and timeout error", @@ -457,6 +474,10 @@ type TestCR struct { AmountMiB int64 ProjectID string AZ string + // ReadyCondition pre-sets Ready=True (Generation=1, ObservedGeneration=1) on the CR to simulate + // a CR that was previously accepted. Use together with NoCondition to test that the polling loop + // does not treat this stale condition as a valid outcome for a subsequent spec update. + ReadyCondition bool } type CommitmentChangeRequest struct { @@ -591,6 +612,9 @@ type fakeControllerClient struct { } func (c *fakeControllerClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { + if cr, ok := obj.(*v1alpha1.CommittedResource); ok { + cr.Generation = 1 // k8s sets generation=1 on first creation + } if err := c.Client.Create(ctx, obj, opts...); err != nil { return err } @@ -601,6 +625,14 @@ func (c *fakeControllerClient) Create(ctx context.Context, obj client.Object, op } func (c *fakeControllerClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + if cr, ok := obj.(*v1alpha1.CommittedResource); ok { + // k8s increments generation on each spec change; simulate that here so the + // polling loop can detect stale conditions from a prior generation. + existing := &v1alpha1.CommittedResource{} + if err := c.Get(ctx, client.ObjectKeyFromObject(cr), existing); err == nil { + cr.Generation = existing.Generation + 1 + } + } if err := c.Client.Update(ctx, obj, opts...); err != nil { return err } @@ -620,36 +652,40 @@ func (c *fakeControllerClient) setConditionFor(ctx context.Context, crName strin return } + cr := &v1alpha1.CommittedResource{} + if err := c.Get(ctx, client.ObjectKey{Name: crName}, cr); err != nil { + return + } + var cond metav1.Condition switch { case !hasOutcome || outcome == "": // Default: controller accepts. 
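+		// ObservedGeneration is stamped from the freshly fetched CR so the polling
+		// loop can match the condition against the spec generation it wrote.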
cond = metav1.Condition{ - Type: v1alpha1.CommittedResourceConditionReady, - Status: metav1.ConditionTrue, - Reason: v1alpha1.CommittedResourceReasonAccepted, - Message: "accepted", + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + Reason: v1alpha1.CommittedResourceReasonAccepted, + Message: "accepted", + ObservedGeneration: cr.Generation, } case outcome == v1alpha1.CommittedResourceReasonPlanned: cond = metav1.Condition{ - Type: v1alpha1.CommittedResourceConditionReady, - Status: metav1.ConditionFalse, - Reason: v1alpha1.CommittedResourceReasonPlanned, - Message: "commitment is not yet active", + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonPlanned, + Message: "commitment is not yet active", + ObservedGeneration: cr.Generation, } default: cond = metav1.Condition{ - Type: v1alpha1.CommittedResourceConditionReady, - Status: metav1.ConditionFalse, - Reason: v1alpha1.CommittedResourceReasonRejected, - Message: outcome, + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonRejected, + Message: outcome, + ObservedGeneration: cr.Generation, } } - cr := &v1alpha1.CommittedResource{} - if err := c.Get(ctx, client.ObjectKey{Name: crName}, cr); err != nil { - return - } meta.SetStatusCondition(&cr.Status.Conditions, cond) if err := c.Client.Status().Update(ctx, cr); err != nil { return // best-effort: if the update races with another write, the polling loop retries @@ -695,6 +731,13 @@ func newCRTestEnv(t *testing.T, tc CommitmentChangeTestCase) *CRTestEnv { WithScheme(scheme). WithObjects(objects...). WithStatusSubresource(&v1alpha1.CommittedResource{}, &v1alpha1.Knowledge{}). + WithIndex(&v1alpha1.Reservation{}, "spec.committedResourceReservation.commitmentUUID", func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }). 
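+		// Assumed to mirror the field index the manager registers at runtime, so the
+		// fake client can serve list-by-commitmentUUID lookups when child reservations
+		// are deleted.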
Build() noCondition := make(map[string]struct{}) @@ -830,7 +873,7 @@ func (env *CRTestEnv) VerifyCRAmountBytes(crName string, wantBytes int64) { func (tc *TestCR) toCommittedResource() *v1alpha1.CommittedResource { amount := resource.NewQuantity(tc.AmountMiB*1024*1024, resource.BinarySI) - return &v1alpha1.CommittedResource{ + cr := &v1alpha1.CommittedResource{ ObjectMeta: metav1.ObjectMeta{ Name: "commitment-" + tc.CommitmentUUID, }, @@ -844,6 +887,18 @@ func (tc *TestCR) toCommittedResource() *v1alpha1.CommittedResource { State: tc.State, }, } + if tc.ReadyCondition { + cr.Generation = 1 + meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + Reason: v1alpha1.CommittedResourceReasonAccepted, + Message: "accepted", + LastTransitionTime: metav1.Now(), + ObservedGeneration: 1, + }) + } + return cr } // ============================================================================ diff --git a/internal/scheduling/reservations/commitments/api/handler.go b/internal/scheduling/reservations/commitments/api/handler.go index 051a82fa2..ce9a876ec 100644 --- a/internal/scheduling/reservations/commitments/api/handler.go +++ b/internal/scheduling/reservations/commitments/api/handler.go @@ -26,6 +26,7 @@ type HTTPAPI struct { usageMonitor ReportUsageAPIMonitor capacityMonitor ReportCapacityAPIMonitor infoMonitor InfoAPIMonitor + quotaMonitor QuotaAPIMonitor // Mutex to serialize change-commitments requests changeMutex sync.Mutex } @@ -44,6 +45,7 @@ func NewAPIWithConfig(k8sClient client.Client, config commitments.APIConfig, usa usageMonitor: NewReportUsageAPIMonitor(), capacityMonitor: NewReportCapacityAPIMonitor(), infoMonitor: NewInfoAPIMonitor(), + quotaMonitor: NewQuotaAPIMonitor(), } } @@ -52,6 +54,7 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log registry.MustRegister(&api.usageMonitor) registry.MustRegister(&api.capacityMonitor) registry.MustRegister(&api.infoMonitor) + registry.MustRegister(&api.quotaMonitor) mux.HandleFunc("/commitments/v1/change-commitments", api.HandleChangeCommitments) mux.HandleFunc("/commitments/v1/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/commitments/v1/info", api.HandleInfo) diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go index cd8846101..c9576008f 100644 --- a/internal/scheduling/reservations/commitments/api/info.go +++ b/internal/scheduling/reservations/commitments/api/info.go @@ -147,6 +147,12 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w", errInternalServiceInfo, groupName, err) } + // Determine topology: AZSeparatedTopology only for groups that accept commitments + // (AZSeparatedTopology means quota is also AZ-aware, required when HasQuota=true) + ramTopology := liquid.AZAwareTopology + if resCfg.RAM.HandlesCommitments { + ramTopology = liquid.AZSeparatedTopology + } resources[ramResourceName] = liquid.ResourceInfo{ DisplayName: fmt.Sprintf( "multiples of %d MiB (usable by: %s)", @@ -154,7 +160,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l flavorListStr, ), Unit: ramUnit, - Topology: liquid.AZAwareTopology, + Topology: ramTopology, NeedsResourceDemand: false, HasCapacity: resCfg.RAM.HasCapacity, HasQuota: resCfg.RAM.HasQuota, diff --git 
a/internal/scheduling/reservations/commitments/api/info_test.go b/internal/scheduling/reservations/commitments/api/info_test.go index 60426a2aa..514ebc752 100644 --- a/internal/scheduling/reservations/commitments/api/info_test.go +++ b/internal/scheduling/reservations/commitments/api/info_test.go @@ -194,11 +194,11 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) { WithObjects(knowledge). Build() - // hana_fixed: ram accepts commitments; v2_variable: nothing accepts commitments + // hana_fixed: ram accepts commitments and has quota; v2_variable: nothing accepts commitments cfg := commitments.DefaultAPIConfig() cfg.FlavorGroupResourceConfig = map[string]commitments.FlavorGroupResourcesConfig{ "hana_fixed": { - RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasCapacity: true}, + RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasCapacity: true, HasQuota: true}, Cores: commitments.ResourceTypeConfig{HasCapacity: true}, Instances: commitments.ResourceTypeConfig{HasCapacity: true}, }, @@ -230,6 +230,7 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) { t.Fatalf("expected 6 resources (3 per flavor group), got %d", len(serviceInfo.Resources)) } + // Test RAM resource: hw_version_hana_fixed_ram (fixed ratio → commitments + quota) ramResource, ok := serviceInfo.Resources["hw_version_hana_fixed_ram"] if !ok { t.Fatal("expected hw_version_hana_fixed_ram resource to exist") @@ -240,7 +241,14 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) { if !ramResource.HandlesCommitments { t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (set in config)") } + if ramResource.Topology != liquid.AZSeparatedTopology { + t.Errorf("hw_version_hana_fixed_ram: expected Topology=%q, got %q", liquid.AZSeparatedTopology, ramResource.Topology) + } + if !ramResource.HasQuota { + t.Error("hw_version_hana_fixed_ram: expected HasQuota=true (fixed ratio groups accept quotas)") + } + // Test Cores resource: hw_version_hana_fixed_cores (always AZAwareTopology, no quota) coresResource, ok := serviceInfo.Resources["hw_version_hana_fixed_cores"] if !ok { t.Fatal("expected hw_version_hana_fixed_cores resource to exist") @@ -251,7 +259,14 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) { if coresResource.HandlesCommitments { t.Error("hw_version_hana_fixed_cores: expected HandlesCommitments=false") } + if coresResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_hana_fixed_cores: expected Topology=%q, got %q", liquid.AZAwareTopology, coresResource.Topology) + } + if coresResource.HasQuota { + t.Error("hw_version_hana_fixed_cores: expected HasQuota=false") + } + // Test Instances resource: hw_version_hana_fixed_instances (always AZAwareTopology, no quota) instancesResource, ok := serviceInfo.Resources["hw_version_hana_fixed_instances"] if !ok { t.Fatal("expected hw_version_hana_fixed_instances resource to exist") @@ -262,6 +277,12 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) { if instancesResource.HandlesCommitments { t.Error("hw_version_hana_fixed_instances: expected HandlesCommitments=false") } + if instancesResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_hana_fixed_instances: expected Topology=%q, got %q", liquid.AZAwareTopology, instancesResource.Topology) + } + if instancesResource.HasQuota { + t.Error("hw_version_hana_fixed_instances: expected HasQuota=false") + } // v2_variable is covered by "*" wildcard: HasCapacity=true, HandlesCommitments=false v2RamResource, ok := 
serviceInfo.Resources["hw_version_v2_variable_ram"]
@@ -274,6 +295,12 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) {
 	if v2RamResource.HandlesCommitments {
 		t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (not in config)")
 	}
+	if v2RamResource.Topology != liquid.AZAwareTopology {
+		t.Errorf("hw_version_v2_variable_ram: expected Topology=%q, got %q", liquid.AZAwareTopology, v2RamResource.Topology)
+	}
+	if v2RamResource.HasQuota {
+		t.Error("hw_version_v2_variable_ram: expected HasQuota=false (variable ratio)")
+	}
 
 	v2CoresResource, ok := serviceInfo.Resources["hw_version_v2_variable_cores"]
 	if !ok {
@@ -282,6 +309,15 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) {
 	if !v2CoresResource.HasCapacity {
 		t.Error("hw_version_v2_variable_cores: expected HasCapacity=true")
 	}
+	if v2CoresResource.HandlesCommitments {
+		t.Error("hw_version_v2_variable_cores: expected HandlesCommitments=false")
+	}
+	if v2CoresResource.Topology != liquid.AZAwareTopology {
+		t.Errorf("hw_version_v2_variable_cores: expected Topology=%q, got %q", liquid.AZAwareTopology, v2CoresResource.Topology)
+	}
+	if v2CoresResource.HasQuota {
+		t.Error("hw_version_v2_variable_cores: expected HasQuota=false")
+	}
 
 	v2InstancesResource, ok := serviceInfo.Resources["hw_version_v2_variable_instances"]
 	if !ok {
@@ -290,4 +326,13 @@ func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) {
 	if !v2InstancesResource.HasCapacity {
 		t.Error("hw_version_v2_variable_instances: expected HasCapacity=true")
 	}
+	if v2InstancesResource.HandlesCommitments {
+		t.Error("hw_version_v2_variable_instances: expected HandlesCommitments=false")
+	}
+	if v2InstancesResource.Topology != liquid.AZAwareTopology {
+		t.Errorf("hw_version_v2_variable_instances: expected Topology=%q, got %q", liquid.AZAwareTopology, v2InstancesResource.Topology)
+	}
+	if v2InstancesResource.HasQuota {
+		t.Error("hw_version_v2_variable_instances: expected HasQuota=false")
+	}
 }
diff --git a/internal/scheduling/reservations/commitments/api/quota.go b/internal/scheduling/reservations/commitments/api/quota.go
index c77fdf1a6..9c34e879c 100644
--- a/internal/scheduling/reservations/commitments/api/quota.go
+++ b/internal/scheduling/reservations/commitments/api/quota.go
@@ -4,19 +4,36 @@ package api
 
 import (
+	"encoding/json"
+	"fmt"
+	"math"
 	"net/http"
+	"strconv"
+	"time"
 
+	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
 	"github.com/google/uuid"
+	"github.com/sapcc/go-api-declarations/liquid"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/util/retry"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
+// projectQuotaCRDName returns the CRD object name for a given project UUID.
+// Convention: "quota-<project UUID>".
+func projectQuotaCRDName(projectID string) string {
+	return "quota-" + projectID
+}
+
 // HandleQuota implements PUT /commitments/v1/projects/:project_id/quota from Limes LIQUID API.
 // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
 //
-// This is a no-op endpoint that accepts quota requests but doesn't store them.
-// Cortex does not enforce quotas for committed resources - quota enforcement
-// happens through commitment validation at change-commitments time.
-// The endpoint exists for API compatibility with the LIQUID specification.
+// This endpoint receives quota requests from Limes and persists them as ProjectQuota CRDs.
+// One CRD per project, named "quota-<project UUID>".
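+//
+// A minimal request sketch in the shapes the tests use (values illustrative):
+//
+//	req := liquid.ServiceQuotaRequest{Resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{
+//		"hw_version_hana_1_ram": {Quota: 100, PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{"az-a": {Quota: 60}}},
+//	}}
+//	// PUT /commitments/v1/projects/:project_id/quota with json.Marshal(req) as the body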
func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() + // Extract or generate request ID for tracing requestID := r.Header.Get("X-Request-ID") if requestID == "" { @@ -27,14 +44,150 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { log := apiLog.WithValues("requestID", requestID, "endpoint", "quota") if r.Method != http.MethodPut { - http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + api.quotaError(w, http.StatusMethodNotAllowed, "Method not allowed", startTime) + return + } + + // Check if quota API is enabled + if !api.config.EnableQuotaAPI { + api.quotaError(w, http.StatusServiceUnavailable, "Quota API is disabled", startTime) + return + } + + // Extract project UUID from URL path + projectID, err := extractProjectIDFromPath(r.URL.Path) + if err != nil { + log.Error(err, "failed to extract project ID from path") + api.quotaError(w, http.StatusBadRequest, "Invalid URL path: "+err.Error(), startTime) return } - // No-op: Accept the quota request but don't store it - // Cortex handles capacity through commitments, not quotas - log.V(1).Info("received quota request (no-op)", "path", r.URL.Path) + // Parse request body + var req liquid.ServiceQuotaRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + log.Error(err, "failed to decode quota request body") + api.quotaError(w, http.StatusBadRequest, "Invalid request body: "+err.Error(), startTime) + return + } + + // Extract project/domain metadata if available + var projectName, domainID, domainName string + if meta, ok := req.ProjectMetadata.Unpack(); ok { + // Consistency check: metadata UUID must match URL path UUID + if meta.UUID != "" && meta.UUID != projectID { + log.Info("project UUID mismatch", "urlProjectID", projectID, "metadataUUID", meta.UUID) + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Project UUID mismatch: URL has %q but metadata has %q", projectID, meta.UUID), startTime) + return + } + projectName = meta.Name + domainID = meta.Domain.UUID + domainName = meta.Domain.Name + } + + if domainID == "" { + api.quotaError(w, http.StatusBadRequest, "missing domain UUID in project metadata", startTime) + return + } + + // Build the spec quota map from the liquid request. + // liquid API uses uint64; our CRD uses int64 (K8s convention). + // Guard against overflow: uint64 values > MaxInt64 would wrap to negative. + specQuota := make(map[string]v1alpha1.ResourceQuota, len(req.Resources)) + for resourceName, resQuota := range req.Resources { + if resQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q exceeds int64 max", resourceName), startTime) + return + } + rq := v1alpha1.ResourceQuota{ + Quota: int64(resQuota.Quota), + } + if len(resQuota.PerAZ) > 0 { + rq.PerAZ = make(map[string]int64, len(resQuota.PerAZ)) + for az, azQuota := range resQuota.PerAZ { + if azQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q in AZ %q exceeds int64 max", resourceName, az), startTime) + return + } + rq.PerAZ[string(az)] = int64(azQuota.Quota) + } + } + specQuota[string(resourceName)] = rq + } + + // Create or update ProjectQuota CRD with retry-on-conflict to handle + // concurrent status updates from the quota controller. 
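+	// RetryOnConflict re-runs the closure with backoff whenever a write fails with a
+	// resourceVersion conflict; each attempt re-reads the latest object first.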
+	crdName := projectQuotaCRDName(projectID)
+	ctx := r.Context()
+
+	err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+		var existing v1alpha1.ProjectQuota
+		getErr := api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing)
+		if getErr != nil {
+			if !apierrors.IsNotFound(getErr) {
+				return getErr
+			}
+			// Not found -- create new
+			pq := &v1alpha1.ProjectQuota{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: crdName,
+				},
+				Spec: v1alpha1.ProjectQuotaSpec{
+					ProjectID:   projectID,
+					ProjectName: projectName,
+					DomainID:    domainID,
+					DomainName:  domainName,
+					Quota:       specQuota,
+				},
+			}
+			if createErr := api.client.Create(ctx, pq); createErr != nil {
+				// A concurrent request may have just created it. RetryOnConflict only
+				// retries Conflict errors, so an IsAlreadyExists from that race is
+				// surfaced to the caller rather than retried here.
+				return createErr
+			}
+			log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota))
+			return nil
+		}
+
+		// Update existing (re-fetched on each retry to get fresh resourceVersion)
+		existing.Spec.Quota = specQuota
+		if projectName != "" {
+			existing.Spec.ProjectName = projectName
+		}
+		if domainID != "" {
+			existing.Spec.DomainID = domainID
+		}
+		if domainName != "" {
+			existing.Spec.DomainName = domainName
+		}
+		if updateErr := api.client.Update(ctx, &existing); updateErr != nil {
+			return updateErr
+		}
+		log.V(1).Info("updated ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota))
+		return nil
+	})
+	if err != nil {
+		log.Error(err, "failed to create/update ProjectQuota", "name", crdName)
+		api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to persist quota: %v", err), startTime)
+		return
+	}
 
 	// Return 204 No Content as expected by the LIQUID API
 	w.WriteHeader(http.StatusNoContent)
+	api.recordQuotaMetrics(http.StatusNoContent, startTime)
+}
+
+// quotaError writes an HTTP error response and records metrics. Used for error paths in HandleQuota.
+func (api *HTTPAPI) quotaError(w http.ResponseWriter, statusCode int, msg string, startTime time.Time) {
+	http.Error(w, msg, statusCode)
+	api.recordQuotaMetrics(statusCode, startTime)
+}
+
+// recordQuotaMetrics records Prometheus metrics for a quota API request.
+func (api *HTTPAPI) recordQuotaMetrics(statusCode int, startTime time.Time) {
+	duration := time.Since(startTime).Seconds()
+	statusCodeStr := strconv.Itoa(statusCode)
+	api.quotaMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
+	api.quotaMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
+}
diff --git a/internal/scheduling/reservations/commitments/api/quota_monitor.go b/internal/scheduling/reservations/commitments/api/quota_monitor.go
new file mode 100644
index 000000000..c06d4b788
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/api/quota_monitor.go
@@ -0,0 +1,47 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package api
+
+import "github.com/prometheus/client_golang/prometheus"
+
+// QuotaAPIMonitor provides metrics for the CR quota API.
+type QuotaAPIMonitor struct {
+	requestCounter  *prometheus.CounterVec
+	requestDuration *prometheus.HistogramVec
+}
+
+// NewQuotaAPIMonitor creates a new monitor with Prometheus metrics.
+// Metrics are pre-initialized with zero values for common HTTP status codes
+// to ensure they appear in Prometheus before the first request.
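+// (A labelled series is otherwise absent from scrape output until it is first used.)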
+func NewQuotaAPIMonitor() QuotaAPIMonitor { + m := QuotaAPIMonitor{ + requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_quota_api_requests_total", + Help: "Total number of quota API requests by status code.", + }, []string{"status_code"}), + requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "cortex_committed_resource_quota_api_request_duration_seconds", + Help: "Duration of quota API requests in seconds.", + Buckets: prometheus.DefBuckets, + }, []string{"status_code"}), + } + // Pre-initialize common status codes so they appear in Prometheus before the first request + for _, statusCode := range []string{"204", "400", "405", "500"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + return m +} + +// Describe implements prometheus.Collector. +func (m *QuotaAPIMonitor) Describe(ch chan<- *prometheus.Desc) { + m.requestCounter.Describe(ch) + m.requestDuration.Describe(ch) +} + +// Collect implements prometheus.Collector. +func (m *QuotaAPIMonitor) Collect(ch chan<- prometheus.Metric) { + m.requestCounter.Collect(ch) + m.requestDuration.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api/quota_test.go b/internal/scheduling/reservations/commitments/api/quota_test.go new file mode 100644 index 000000000..8632d2af4 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api/quota_test.go @@ -0,0 +1,372 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package api + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + "github.com/majewsky/gg/option" + "github.com/sapcc/go-api-declarations/liquid" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// newTestScheme returns a scheme with v1alpha1 types registered. +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + return scheme +} + +// marshalQuotaReq marshals a ServiceQuotaRequest, failing the test on error. 
+func marshalQuotaReq(t *testing.T, req liquid.ServiceQuotaRequest) []byte { + t.Helper() + body, err := json.Marshal(req) + if err != nil { + t.Fatalf("failed to marshal request: %v", err) + } + return body +} + +func TestHandleQuota_ErrorCases(t *testing.T) { + tests := []struct { + name string + method string + path string + body []byte + metadata *liquid.ProjectMetadata + enableQuota *bool // nil = default (enabled) + expectedStatus int + }{ + { + name: "MethodNotAllowed_GET", + method: http.MethodGet, + path: "/commitments/v1/projects/project-abc/quota", + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + }, + { + name: "MethodNotAllowed_POST", + method: http.MethodPost, + path: "/commitments/v1/projects/project-abc/quota", + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + }, + { + name: "DisabledAPI", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte(`{"resources":{}}`), + enableQuota: boolPtr(false), + expectedStatus: http.StatusServiceUnavailable, + }, + { + name: "InvalidBody", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte("{invalid"), + expectedStatus: http.StatusBadRequest, + }, + { + name: "EmptyBody", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte(""), + expectedStatus: http.StatusBadRequest, + }, + { + name: "UUIDMismatch", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + metadata: &liquid.ProjectMetadata{ + UUID: "different-uuid", + Name: "my-project", + Domain: liquid.DomainMetadata{UUID: "domain-123", Name: "my-domain"}, + }, + expectedStatus: http.StatusBadRequest, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := newTestScheme(t) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).Build() + + var httpAPI *HTTPAPI + if tc.enableQuota != nil && !*tc.enableQuota { + config := commitments.DefaultAPIConfig() + config.EnableQuotaAPI = false + httpAPI = NewAPIWithConfig(k8sClient, config, nil) + } else { + httpAPI = NewAPI(k8sClient) + } + + // Build body: use provided bytes or construct from metadata + var bodyReader *bytes.Reader + switch { + case tc.body != nil: + bodyReader = bytes.NewReader(tc.body) + case tc.metadata != nil: + quotaReq := liquid.ServiceQuotaRequest{ + Resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 100}, + }, + } + quotaReq.ProjectMetadata = option.Some(*tc.metadata) + bodyReader = bytes.NewReader(marshalQuotaReq(t, quotaReq)) + default: + bodyReader = bytes.NewReader([]byte{}) + } + + req := httptest.NewRequest(tc.method, tc.path, bodyReader) + w := httptest.NewRecorder() + + httpAPI.HandleQuota(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != tc.expectedStatus { + t.Errorf("expected status %d, got %d", tc.expectedStatus, resp.StatusCode) + } + }) + } +} + +func TestHandleQuota_CreateAndUpdate(t *testing.T) { + tests := []struct { + name string + // existing is a pre-existing CRD to seed (nil = create, non-nil = update) + existing *v1alpha1.ProjectQuota + projectID string + resources map[liquid.ResourceName]liquid.ResourceQuotaRequest + metadata *liquid.ProjectMetadata + expectQuota map[string]int64 // resource name → expected total quota + expectPerAZ map[string]map[string]int64 // resource name → az → expected quota + expectName string + expectDomain string + expectDomName string + }{ + { + name: "Create_WithPerAZ", + projectID: 
"project-abc-123", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + Quota: 100, + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 60}, + "az-b": {Quota: 40}, + }, + }, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-abc-123", + Domain: liquid.DomainMetadata{UUID: "domain-1"}, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 100}, + expectPerAZ: map[string]map[string]int64{ + "hw_version_hana_1_ram": {"az-a": 60, "az-b": 40}, + }, + expectDomain: "domain-1", + }, + { + name: "Create_EmptyResources", + projectID: "project-empty", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, + metadata: &liquid.ProjectMetadata{ + UUID: "project-empty", + Domain: liquid.DomainMetadata{UUID: "domain-1"}, + }, + expectQuota: map[string]int64{}, + expectDomain: "domain-1", + }, + { + name: "Create_WithMetadata", + projectID: "project-meta-test", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 50}, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-meta-test", + Name: "my-project-name", + Domain: liquid.DomainMetadata{ + UUID: "domain-uuid-456", + Name: "my-domain-name", + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 50}, + expectName: "my-project-name", + expectDomain: "domain-uuid-456", + expectDomName: "my-domain-name", + }, + { + name: "Update_QuotaValues", + existing: &v1alpha1.ProjectQuota{ + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-xyz", + DomainID: "original-domain", + DomainName: "original-domain-name", + ProjectName: "original-project-name", + Quota: map[string]v1alpha1.ResourceQuota{ + "hw_version_hana_1_ram": {Quota: 50, PerAZ: map[string]int64{"az-a": 50}}, + }, + }, + }, + projectID: "project-xyz", + metadata: &liquid.ProjectMetadata{ + UUID: "project-xyz", + Name: "original-project-name", + Domain: liquid.DomainMetadata{ + UUID: "original-domain", + Name: "original-domain-name", + }, + }, + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + Quota: 200, + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 120}, + "az-b": {Quota: 80}, + }, + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 200}, + expectPerAZ: map[string]map[string]int64{ + "hw_version_hana_1_ram": {"az-a": 120, "az-b": 80}, + }, + // Metadata should be preserved when not provided in update + expectDomain: "original-domain", + expectDomName: "original-domain-name", + expectName: "original-project-name", + }, + { + name: "Update_WithNewMetadata", + existing: &v1alpha1.ProjectQuota{ + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-update-meta", + DomainID: "old-domain", + DomainName: "old-domain-name", + ProjectName: "old-project-name", + Quota: map[string]v1alpha1.ResourceQuota{ + "hw_version_hana_1_ram": {Quota: 10}, + }, + }, + }, + projectID: "project-update-meta", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 99}, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-update-meta", + Name: "new-project-name", + Domain: liquid.DomainMetadata{ + UUID: "new-domain", + Name: "new-domain-name", + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 99}, + expectName: "new-project-name", + expectDomain: "new-domain", + expectDomName: "new-domain-name", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := 
newTestScheme(t) + builder := fake.NewClientBuilder().WithScheme(scheme) + + if tc.existing != nil { + tc.existing.Name = projectQuotaCRDName(tc.projectID) + builder = builder.WithObjects(tc.existing) + } + k8sClient := builder.Build() + httpAPI := NewAPI(k8sClient) + + quotaReq := liquid.ServiceQuotaRequest{ + Resources: tc.resources, + } + if tc.metadata != nil { + quotaReq.ProjectMetadata = option.Some(*tc.metadata) + } + body := marshalQuotaReq(t, quotaReq) + + path := "/commitments/v1/projects/" + tc.projectID + "/quota" + req := httptest.NewRequest(http.MethodPut, path, bytes.NewReader(body)) + w := httptest.NewRecorder() + + httpAPI.HandleQuota(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNoContent { + t.Fatalf("expected status %d (No Content), got %d", http.StatusNoContent, resp.StatusCode) + } + + // Verify the ProjectQuota CRD + var pq v1alpha1.ProjectQuota + crdName := projectQuotaCRDName(tc.projectID) + if err := k8sClient.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + t.Fatalf("failed to get ProjectQuota CRD %q: %v", crdName, err) + } + + if pq.Spec.ProjectID != tc.projectID { + t.Errorf("expected ProjectID %q, got %q", tc.projectID, pq.Spec.ProjectID) + } + + // Verify quota totals + for resName, expectedTotal := range tc.expectQuota { + actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("expected resource %q in quota spec", resName) + continue + } + if actual.Quota != expectedTotal { + t.Errorf("resource %q: expected quota %d, got %d", resName, expectedTotal, actual.Quota) + } + } + + // Verify per-AZ quotas + for resName, azMap := range tc.expectPerAZ { + actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("expected resource %q in quota spec for per-AZ check", resName) + continue + } + for az, expectedAZ := range azMap { + if actual.PerAZ[az] != expectedAZ { + t.Errorf("resource %q AZ %q: expected %d, got %d", resName, az, expectedAZ, actual.PerAZ[az]) + } + } + } + + // Verify metadata + if tc.expectName != "" && pq.Spec.ProjectName != tc.expectName { + t.Errorf("expected ProjectName %q, got %q", tc.expectName, pq.Spec.ProjectName) + } + if tc.expectDomain != "" && pq.Spec.DomainID != tc.expectDomain { + t.Errorf("expected DomainID %q, got %q", tc.expectDomain, pq.Spec.DomainID) + } + if tc.expectDomName != "" && pq.Spec.DomainName != tc.expectDomName { + t.Errorf("expected DomainName %q, got %q", tc.expectDomName, pq.Spec.DomainName) + } + }) + } +} + +func boolPtr(b bool) *bool { + return &b +} diff --git a/internal/scheduling/reservations/commitments/api/report_capacity_test.go b/internal/scheduling/reservations/commitments/api/report_capacity_test.go index 8f6029438..9972c6681 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity_test.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity_test.go @@ -177,7 +177,7 @@ func TestCapacityCalculator(t *testing.T) { }) t.Run("CalculateCapacity returns perAZ entries for all AZs from request", func(t *testing.T) { - flavorGroupKnowledge := createTestFlavorGroupKnowledge(t, "test-group") + flavorGroupKnowledge := createTestFlavorGroupKnowledge(t) fakeClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(flavorGroupKnowledge). 
@@ -203,7 +203,7 @@ func TestCapacityCalculator(t *testing.T) { }) t.Run("CalculateCapacity with empty AllAZs returns empty perAZ maps", func(t *testing.T) { - flavorGroupKnowledge := createTestFlavorGroupKnowledge(t, "test-group") + flavorGroupKnowledge := createTestFlavorGroupKnowledge(t) fakeClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(flavorGroupKnowledge). @@ -228,7 +228,7 @@ func TestCapacityCalculator(t *testing.T) { }) t.Run("CalculateCapacity responds to different AZ sets correctly", func(t *testing.T) { - flavorGroupKnowledge := createTestFlavorGroupKnowledge(t, "test-group") + flavorGroupKnowledge := createTestFlavorGroupKnowledge(t) fakeClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(flavorGroupKnowledge). @@ -260,6 +260,106 @@ func TestCapacityCalculator(t *testing.T) { verifyPerAZMatchesRequest(t, res, req2.AllAZs) } }) + + t.Run("CalculateCapacity reads capacity and usage from Ready CRD", func(t *testing.T) { + knowledge := createTestFlavorGroupKnowledge(t) + crd := createTestFlavorGroupCapacity("test-group", "az-one", "test_c8_m32", 1000, 800, true) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, crd). + WithStatusSubresource(crd). + Build() + + calculator := commitments.NewCapacityCalculator(fakeClient) + req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}} + report, err := calculator.CalculateCapacity(context.Background(), req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + ramRes := report.Resources["hw_version_test-group_ram"] + if ramRes == nil { + t.Fatal("expected hw_version_test-group_ram resource") + } + azReport := ramRes.PerAZ["az-one"] + if azReport == nil { + t.Fatal("expected az-one entry") + } + if azReport.Capacity != 1000 { + t.Errorf("expected capacity=1000, got %d", azReport.Capacity) + } + if !azReport.Usage.IsSome() { + t.Fatal("expected usage to be set for Ready CRD") + } + // usage = capacity - placeable = 1000 - 800 = 200 + if usage := azReport.Usage.UnwrapOr(0); usage != 200 { + t.Errorf("expected usage=200 (1000-800), got %d", usage) + } + }) + + t.Run("CalculateCapacity returns zero capacity for missing CRD", func(t *testing.T) { + knowledge := createTestFlavorGroupKnowledge(t) + // CRD exists only for az-one; az-two has no CRD + crd := createTestFlavorGroupCapacity("test-group", "az-one", "test_c8_m32", 500, 400, true) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, crd). + WithStatusSubresource(crd). + Build() + + calculator := commitments.NewCapacityCalculator(fakeClient) + req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}} + report, err := calculator.CalculateCapacity(context.Background(), req) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + ramRes := report.Resources["hw_version_test-group_ram"] + if ramRes == nil { + t.Fatal("expected hw_version_test-group_ram resource") + } + azTwo := ramRes.PerAZ["az-two"] + if azTwo == nil { + t.Fatal("expected az-two entry even without CRD") + } + if azTwo.Capacity != 0 { + t.Errorf("expected capacity=0 for missing CRD, got %d", azTwo.Capacity) + } + }) + + t.Run("CalculateCapacity omits usage for stale CRD (Ready=False)", func(t *testing.T) { + knowledge := createTestFlavorGroupKnowledge(t) + crd := createTestFlavorGroupCapacity("test-group", "az-one", "test_c8_m32", 1000, 800, false) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge, crd). 
+			WithStatusSubresource(crd).
+			Build()
+
+		calculator := commitments.NewCapacityCalculator(fakeClient)
+		req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}}
+		report, err := calculator.CalculateCapacity(context.Background(), req)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+
+		ramRes := report.Resources["hw_version_test-group_ram"]
+		if ramRes == nil {
+			t.Fatal("expected hw_version_test-group_ram resource")
+		}
+		azReport := ramRes.PerAZ["az-one"]
+		if azReport == nil {
+			t.Fatal("expected az-one entry")
+		}
+		// Stale CRD: last-known capacity is still reported
+		if azReport.Capacity != 1000 {
+			t.Errorf("expected last-known capacity=1000 for stale CRD, got %d", azReport.Capacity)
+		}
+		// Stale CRD: usage must be absent (None)
+		if azReport.Usage.IsSome() {
+			t.Error("expected usage to be absent (None) for stale CRD")
+		}
+	})
 }
 
 // verifyPerAZMatchesRequest checks that perAZ entries match exactly the requested AZs.
@@ -318,14 +418,48 @@ func createEmptyFlavorGroupKnowledge() *v1alpha1.Knowledge {
 	}
 }
 
-// createTestFlavorGroupKnowledge creates a test Knowledge CRD with flavor group data
+// createTestFlavorGroupCapacity creates a FlavorGroupCapacity CRD for testing.
+// totalSlots and placeableSlots are for the named smallest flavor entry.
+// ready controls whether the Ready condition is True or False.
+func createTestFlavorGroupCapacity(group, az, smallestFlavorName string, totalSlots, placeableSlots int64, ready bool) *v1alpha1.FlavorGroupCapacity {
+	conditionStatus := v1.ConditionTrue
+	if !ready {
+		conditionStatus = v1.ConditionFalse
+	}
+	return &v1alpha1.FlavorGroupCapacity{
+		ObjectMeta: v1.ObjectMeta{
+			Name: group + "-" + az,
+		},
+		Spec: v1alpha1.FlavorGroupCapacitySpec{
+			FlavorGroup:      group,
+			AvailabilityZone: az,
+		},
+		Status: v1alpha1.FlavorGroupCapacityStatus{
+			Flavors: []v1alpha1.FlavorCapacityStatus{
+				{
+					FlavorName:           smallestFlavorName,
+					TotalCapacityVMSlots: totalSlots,
+					PlaceableVMs:         placeableSlots,
+				},
+			},
+			Conditions: []v1.Condition{
+				{
+					Type:   v1alpha1.FlavorGroupCapacityConditionReady,
+					Status: conditionStatus,
+				},
+			},
+		},
+	}
+}
+
+// createTestFlavorGroupKnowledge creates a test Knowledge CRD with flavor group data
 // that accepts commitments (has fixed RAM/core ratio)
-func createTestFlavorGroupKnowledge(t *testing.T, groupName string) *v1alpha1.Knowledge {
+func createTestFlavorGroupKnowledge(t *testing.T) *v1alpha1.Knowledge {
 	t.Helper()
 	features := []map[string]interface{}{
 		{
-			"name": groupName,
+			"name": "test-group",
 			"flavors": []map[string]interface{}{
 				{
 					"name": "test_c8_m32",
diff --git a/internal/scheduling/reservations/commitments/api/report_usage_test.go b/internal/scheduling/reservations/commitments/api/report_usage_test.go
index 719a7bbb1..a867122ff 100644
--- a/internal/scheduling/reservations/commitments/api/report_usage_test.go
+++ b/internal/scheduling/reservations/commitments/api/report_usage_test.go
@@ -25,6 +25,8 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -427,6 +429,7 @@ type TestVMUsage struct {
 	AZ        string
 	Host      string
 	CreatedAt time.Time
+	OSType    string // pre-computed os_type, e.g.
"windows8Server64Guest" or "unknown" } func newTestVMUsage(uuid string, flavor *TestFlavor, projectID, az, host string, createdAt time.Time) *TestVMUsage { @@ -465,6 +468,7 @@ type ExpectedVMUsage struct { CommitmentID string // Empty string = PAYG MemoryMB uint64 // For verification VideoRAMMiB *uint64 // nil = expect field absent + OSType string // Empty string = skip check } // ============================================================================ @@ -497,6 +501,10 @@ func (m *mockUsageDBClient) addVM(vm *TestVMUsage) { extraSpecs["hw_video:ram_max_mb"] = strconv.FormatUint(*vm.Flavor.VideoRAMMiB, 10) } extrasJSON, _ := json.Marshal(extraSpecs) //nolint:errcheck // test helper, always valid + osType := vm.OSType + if osType == "" { + osType = "unknown" + } row := commitments.VMRow{ ID: vm.UUID, Name: vm.UUID, @@ -509,6 +517,7 @@ func (m *mockUsageDBClient) addVM(vm *TestVMUsage) { FlavorVCPUs: uint64(vm.Flavor.VCPUs), //nolint:gosec FlavorDisk: vm.Flavor.DiskGB, FlavorExtras: string(extrasJSON), + OSType: osType, } m.rows[vm.ProjectID] = append(m.rows[vm.ProjectID], row) } @@ -562,15 +571,43 @@ func newUsageTestEnv( knowledgeCRD := createKnowledgeCRD(flavorGroups) k8sReservations = append(k8sReservations, knowledgeCRD) + // Create CommittedResource CRDs (one per unique commitment). + // The usage reconciler writes assignment results into these; CalculateUsage reads them back. + seenCommitments := make(map[string]bool) + var crObjects []client.Object + for _, tr := range reservations { + if seenCommitments[tr.CommitmentID] { + continue + } + seenCommitments[tr.CommitmentID] = true + crObjects = append(crObjects, tr.toCommittedResourceCRD()) + } + + k8sReservations = append(k8sReservations, crObjects...) k8sClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(k8sReservations...). WithStatusSubresource(&v1alpha1.Reservation{}). WithStatusSubresource(&v1alpha1.Knowledge{}). + WithStatusSubresource(&v1alpha1.CommittedResource{}). WithIndex(&v1alpha1.Reservation{}, "spec.type", func(obj client.Object) []string { res := obj.(*v1alpha1.Reservation) return []string{string(res.Spec.Type)} }). + WithIndex(&v1alpha1.CommittedResource{}, "spec.commitmentUUID", func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }). + WithIndex(&v1alpha1.CommittedResource{}, "spec.projectID", func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.ProjectID == "" { + return nil + } + return []string{cr.Spec.ProjectID} + }). Build() // Create mock DB client with VMs @@ -579,6 +616,25 @@ func newUsageTestEnv( dbClient.addVM(vm) } + // Run usage reconciler to populate CommittedResource.Status with VM assignments. + // CalculateUsage reads from this status, so the API returns the correct commitment assignments. 
+ if len(crObjects) > 0 { + rec := &commitments.UsageReconciler{ + Client: k8sClient, + Conf: commitments.UsageReconcilerConfig{CooldownInterval: metav1.Duration{Duration: 0}}, + UsageDB: dbClient, + Monitor: commitments.NewUsageReconcilerMonitor(), + } + ctx := context.Background() + for _, obj := range crObjects { + cr := obj.(*v1alpha1.CommittedResource) + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + if _, err := rec.Reconcile(ctx, req); err != nil { + t.Fatalf("usage reconciler failed for %s: %v", cr.Name, err) + } + } + } + // Create API with mock DB client api := NewAPIWithConfig(k8sClient, commitments.DefaultAPIConfig(), dbClient) mux := http.NewServeMux() @@ -806,6 +862,12 @@ func verifyUsageReport(t *testing.T, tc UsageReportTestCase, actual liquid.Servi } } + // Verify os_type when specified + if expectedVM.OSType != "" && actualVM.OSType != expectedVM.OSType { + t.Errorf("Resource %s AZ %s VM %s: expected os_type %q, got %q", + instancesResourceName, azName, expectedVM.UUID, expectedVM.OSType, actualVM.OSType) + } + // Assert HWVersion is absent from the serialized output (must not appear per LIQUID schema) if rawFlavor, ok := actualRawVMs[expectedVM.UUID]; ok { if flavorRaw, ok := rawFlavor["flavor"]; ok { @@ -848,6 +910,43 @@ type vmFlavorAttrs struct { // Helper Functions // ============================================================================ +// toCommittedResourceCRD creates a minimal CommittedResource CRD for this commitment. +// Used by the test setup to pre-populate the CR objects that the usage reconciler writes status into. +func (tr *UsageTestReservation) toCommittedResourceCRD() *v1alpha1.CommittedResource { + amount := resource.MustParse(strconv.FormatInt(tr.Flavor.MemoryMB*int64(tr.Count), 10) + "Mi") + spec := v1alpha1.CommittedResourceSpec{ + CommitmentUUID: tr.CommitmentID, + ProjectID: tr.ProjectID, + DomainID: "test-domain", + AvailabilityZone: tr.AZ, + FlavorGroupName: tr.Flavor.Group, + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusConfirmed, + Amount: amount, + } + if !tr.StartTime.IsZero() { + spec.StartTime = &metav1.Time{Time: tr.StartTime} + } + return &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "cr-" + tr.CommitmentID}, + Spec: spec, + Status: v1alpha1.CommittedResourceStatus{ + AcceptedSpec: &spec, + // Simulate the CR controller having accepted the current generation (0 for fake client). + // Without this, the usage reconciler's readiness gate blocks usage calculation. + Conditions: []metav1.Condition{ + { + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + Reason: v1alpha1.CommittedResourceReasonAccepted, + ObservedGeneration: 0, + LastTransitionTime: metav1.Now(), + }, + }, + }, + } +} + // toK8sReservation converts a UsageTestReservation to a K8s Reservation. 
func (tr *UsageTestReservation) toK8sReservation(number int) *v1alpha1.Reservation { name := fmt.Sprintf("commitment-%s-%d", tr.CommitmentID, number) diff --git a/internal/scheduling/reservations/commitments/api/usage_test.go b/internal/scheduling/reservations/commitments/api/usage_test.go index 71fed90bb..d15967d16 100644 --- a/internal/scheduling/reservations/commitments/api/usage_test.go +++ b/internal/scheduling/reservations/commitments/api/usage_test.go @@ -19,6 +19,8 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/log" @@ -119,6 +121,13 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { k8sClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). + WithIndex(&v1alpha1.CommittedResource{}, "spec.projectID", func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.ProjectID == "" { + return nil + } + return []string{cr.Spec.ProjectID} + }). Build() // Setup mock Nova client @@ -301,6 +310,21 @@ func TestUsageCalculator_ExpiredAndFutureCommitments(t *testing.T) { k8sClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). + WithStatusSubresource(&v1alpha1.CommittedResource{}). + WithIndex(&v1alpha1.CommittedResource{}, "spec.commitmentUUID", func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }). + WithIndex(&v1alpha1.CommittedResource{}, "spec.projectID", func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.ProjectID == "" { + return nil + } + return []string{cr.Spec.ProjectID} + }). Build() dbClient := &mockUsageDBClient{ @@ -309,6 +333,65 @@ func TestUsageCalculator_ExpiredAndFutureCommitments(t *testing.T) { }, } + // Create CommittedResource CRDs and run the usage reconciler so that + // CalculateUsage can read pre-computed assignments from CRD status. 
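+	// One CR is created per unique CommitmentUUID; duplicate reservations are skipped below.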
+ seen := make(map[string]bool) + for _, r := range tt.reservations { + if r.Spec.CommittedResourceReservation == nil { + continue + } + uuid := r.Spec.CommittedResourceReservation.CommitmentUUID + if seen[uuid] { + continue + } + seen[uuid] = true + amount := resource.MustParse("4Gi") + spec := v1alpha1.CommittedResourceSpec{ + CommitmentUUID: uuid, + ProjectID: r.Spec.CommittedResourceReservation.ProjectID, + DomainID: "test-domain", + AvailabilityZone: r.Spec.AvailabilityZone, + FlavorGroupName: r.Spec.CommittedResourceReservation.ResourceGroup, + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusConfirmed, + Amount: amount, + StartTime: r.Spec.StartTime, + EndTime: r.Spec.EndTime, + } + cr := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "cr-" + uuid}, + Spec: spec, + } + if err := k8sClient.Create(ctx, cr); err != nil { + t.Fatalf("failed to create CommittedResource %s: %v", uuid, err) + } + cr.Status = v1alpha1.CommittedResourceStatus{ + AcceptedSpec: &spec, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + Reason: v1alpha1.CommittedResourceReasonAccepted, + ObservedGeneration: 0, + LastTransitionTime: metav1.Now(), + }, + }, + } + if err := k8sClient.Status().Update(ctx, cr); err != nil { + t.Fatalf("failed to update CommittedResource status %s: %v", uuid, err) + } + rec := &commitments.UsageReconciler{ + Client: k8sClient, + Conf: commitments.UsageReconcilerConfig{CooldownInterval: metav1.Duration{Duration: 0}}, + UsageDB: dbClient, + Monitor: commitments.NewUsageReconcilerMonitor(), + } + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + if _, err := rec.Reconcile(ctx, req); err != nil { + t.Fatalf("usage reconciler failed for %s: %v", uuid, err) + } + } + calc := commitments.NewUsageCalculator(k8sClient, dbClient) logger := log.FromContext(ctx) report, err := calc.CalculateUsage(ctx, logger, tt.projectID, tt.allAZs) @@ -465,6 +548,13 @@ func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { k8sClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). + WithIndex(&v1alpha1.CommittedResource{}, "spec.projectID", func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.ProjectID == "" { + return nil + } + return []string{cr.Spec.ProjectID} + }). Build() dbClient := &mockUsageDBClient{ diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go index 8cd3a7159..076428fa6 100644 --- a/internal/scheduling/reservations/commitments/capacity.go +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -7,11 +7,13 @@ import ( "context" "fmt" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" . "github.com/majewsky/gg/option" "github.com/sapcc/go-api-declarations/liquid" + apimeta "k8s.io/apimachinery/pkg/api/meta" "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" ) // CapacityCalculator computes capacity reports for Limes LIQUID API. @@ -25,54 +27,93 @@ func NewCapacityCalculator(client client.Client) *CapacityCalculator { // CalculateCapacity computes per-AZ capacity for all flavor groups. // For each flavor group, three resources are reported: _ram, _cores, _instances. 
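+// (e.g. flavor group "test-group" is reported as hw_version_test-group_ram,
+// hw_version_test-group_cores and hw_version_test-group_instances).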
-// All flavor groups are included, not just those with fixed RAM/core ratio. -// The request provides the list of all AZs from Limes that must be included in the report. +// Capacity and usage are read from FlavorGroupCapacity CRDs pre-computed by the capacity controller. func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.ServiceCapacityRequest) (liquid.ServiceCapacityReport, error) { - // Get all flavor groups from Knowledge CRDs + // Get all flavor groups from Knowledge CRDs (needed for smallest-flavor lookup). knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to get flavor groups: %w", err) } - // Get version from Knowledge CRD (same as info API version) + // Get version from Knowledge CRD (same as info API version). var infoVersion int64 = -1 if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { infoVersion = knowledgeCRD.Status.LastContentChange.Unix() } - // Build capacity report for all flavor groups + // List all FlavorGroupCapacity CRDs and index by (flavorGroup, az). + var capacityList v1alpha1.FlavorGroupCapacityList + if err := c.client.List(ctx, &capacityList); err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to list FlavorGroupCapacity CRDs: %w", err) + } + type groupAZKey struct{ group, az string } + crdByKey := make(map[groupAZKey]*v1alpha1.FlavorGroupCapacity, len(capacityList.Items)) + for i := range capacityList.Items { + crd := &capacityList.Items[i] + crdByKey[groupAZKey{crd.Spec.FlavorGroup, crd.Spec.AvailabilityZone}] = crd + } + + // Build capacity report for all flavor groups. report := liquid.ServiceCapacityReport{ InfoVersion: infoVersion, Resources: make(map[liquid.ResourceName]*liquid.ResourceCapacityReport), } + logger := LoggerFromContext(ctx) for groupName, groupData := range flavorGroups { - // All flavor groups are included in capacity reporting (not just those with fixed ratio). - - // Calculate per-AZ capacity (placeholder: capacity=0 for all resources) - azCapacity := c.calculateAZCapacity(groupName, groupData, req.AllAZs) + smallestFlavorName := groupData.SmallestFlavor.Name + + azCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs)) + for _, az := range req.AllAZs { + crd, ok := crdByKey[groupAZKey{groupName, string(az)}] + if !ok { + // No CRD for this (group, AZ) pair — report zero. + azCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0} + continue + } + + // If the CRD data is stale, report last-known capacity but omit usage. + ready := apimeta.IsStatusConditionTrue(crd.Status.Conditions, v1alpha1.FlavorGroupCapacityConditionReady) + if !ready { + logger.Info("FlavorGroupCapacity CRD is stale, reporting capacity without usage", + "flavorGroup", groupName, "az", az) + } + + // Find the smallest-flavor entry in the CRD status. 
+ var smallest *v1alpha1.FlavorCapacityStatus + for i := range crd.Status.Flavors { + if crd.Status.Flavors[i].FlavorName == smallestFlavorName { + smallest = &crd.Status.Flavors[i] + break + } + } + if smallest == nil { + azCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0} + continue + } + + capacity := uint64(smallest.TotalCapacityVMSlots) //nolint:gosec + azEntry := &liquid.AZResourceCapacityReport{Capacity: capacity} + if ready { + placeable := uint64(smallest.PlaceableVMs) //nolint:gosec + var usage uint64 + if capacity > placeable { + usage = capacity - placeable + } + azEntry.Usage = Some[uint64](usage) + } + azCapacity[az] = azEntry + } - // === 1. RAM Resource === - ramResourceName := liquid.ResourceName(ResourceNameRAM(groupName)) - report.Resources[ramResourceName] = &liquid.ResourceCapacityReport{ + // All three resources share the same capacity units (multiples of smallest flavor). + report.Resources[liquid.ResourceName(ResourceNameRAM(groupName))] = &liquid.ResourceCapacityReport{ PerAZ: azCapacity, } - - // === 2. Cores Resource === - // NOTE: Copying RAM capacity is only valid while capacity=0 (placeholder). - // When real capacity is implemented, derive cores capacity with unit conversion - // (e.g., cores = RAM / ramCoreRatio). See calculateAZCapacity for details. - coresResourceName := liquid.ResourceName(ResourceNameCores(groupName)) - report.Resources[coresResourceName] = &liquid.ResourceCapacityReport{ + report.Resources[liquid.ResourceName(ResourceNameCores(groupName))] = &liquid.ResourceCapacityReport{ PerAZ: c.copyAZCapacity(azCapacity), } - - // === 3. Instances Resource === - // NOTE: Same as cores - copying is only valid while capacity=0 (placeholder). - // When real capacity is implemented, derive instances capacity appropriately. - instancesResourceName := liquid.ResourceName(ResourceNameInstances(groupName)) - report.Resources[instancesResourceName] = &liquid.ResourceCapacityReport{ + report.Resources[liquid.ResourceName(ResourceNameInstances(groupName))] = &liquid.ResourceCapacityReport{ PerAZ: c.copyAZCapacity(azCapacity), } } @@ -81,7 +122,7 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.S } // copyAZCapacity creates a deep copy of the AZ capacity map. -// This is needed because each resource needs its own map instance. +// Each resource needs its own map instance. func (c *CapacityCalculator) copyAZCapacity( src map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, ) map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport { @@ -95,31 +136,3 @@ func (c *CapacityCalculator) copyAZCapacity( } return result } - -func (c *CapacityCalculator) calculateAZCapacity( - _ string, // groupName - reserved for future use - _ compute.FlavorGroupFeature, // groupData - reserved for future use - allAZs []liquid.AvailabilityZone, // list of all AZs from Limes request -) map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport { - - // Create report entry for each AZ with placeholder capacity=0. - // - // NOTE: When implementing real capacity calculation here, you MUST also update - // the copying logic in CalculateCapacity() for _cores and _instances resources. 
- // Those resources use different units (vCPUs and VM count) than _ram (memory multiples), - // so the capacity values cannot be simply copied - they require unit conversion: - // - _cores capacity = RAM capacity / ramCoreRatio - // - _instances capacity = needs its own derivation logic - // - // TODO: Calculate actual capacity from Reservation CRDs or host resources - // TODO: Calculate actual usage from VM allocations - result := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport) - for _, az := range allAZs { - result[az] = &liquid.AZResourceCapacityReport{ - Capacity: 0, // Placeholder: capacity=0 until actual calculation is implemented - Usage: Some[uint64](0), // Placeholder: usage=0 until actual calculation is implemented - } - } - - return result -} diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go index 2389440e3..a74c82f30 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go @@ -6,6 +6,7 @@ package commitments import ( "context" "fmt" + "time" "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/api/meta" @@ -15,7 +16,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -23,7 +23,13 @@ import ( "github.com/cobaltcore-dev/cortex/pkg/multicluster" ) -const crFinalizer = "committed-resource.reservations.cortex.cloud/cleanup" +const ( + // maxReservingAgeForSlowdown is how long a CR can be continuously in Reserving state + // before the Reservation watch stops re-enqueuing it. Beyond this point only the + // RequeueAfter backoff timer drives retries, preventing reservation churn from a broken + // rollback from flooding the reconcile queue. + maxReservingAgeForSlowdown = 30 * time.Minute +) // CommittedResourceController reconciles CommittedResource CRDs and owns all child Reservation CRUD. type CommittedResourceController struct { @@ -49,15 +55,14 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Re ) if !cr.DeletionTimestamp.IsZero() { - return r.reconcileDeletion(ctx, logger, &cr) + return ctrl.Result{}, nil } - if !controllerutil.ContainsFinalizer(&cr, crFinalizer) { - controllerutil.AddFinalizer(&cr, crFinalizer) - if err := r.Update(ctx, &cr); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to add finalizer: %w", err) - } - return ctrl.Result{}, nil + // Treat time-expired CRs as inactive regardless of Spec.State. + // The syncer updates State to expired on its own schedule (e.g. hourly); routing directly + // to reconcileInactive here ensures reservation slots are deleted as soon as EndTime passes. + if cr.Spec.EndTime != nil && cr.Spec.EndTime.Time.Before(time.Now()) { + return r.reconcileInactive(ctx, logger, &cr) } switch cr.Spec.State { @@ -78,8 +83,21 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Re // reconcilePending handles a confirmation attempt (Limes state: pending). // If AllowRejection=true (API path), placement failure marks the CR Rejected so the HTTP API // can report the outcome back to Limes. 
If AllowRejection=false (syncer path), the controller -// retries indefinitely — Limes does not require confirmation for these transitions. +// retries with exponential backoff — Limes does not require confirmation for these transitions. func (r *CommittedResourceController) reconcilePending(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) { + logger.Info("reconciling pending resource", + "generation", cr.Generation, + "az", cr.Spec.AvailabilityZone, + "amount", cr.Spec.Amount.String(), + "allowRejection", cr.Spec.AllowRejection, + ) + // If this spec generation was already rejected, don't re-apply. + // Without this guard the controller oscillates: apply bad spec → delete reservations → + // Reservation watch re-enqueues → apply bad spec again → loop. + if isRejectedForGeneration(cr) { + logger.V(1).Info("spec already rejected for current generation", "generation", cr.Generation) + return ctrl.Result{}, nil + } result, applyErr := r.applyReservationState(ctx, logger, cr) if applyErr != nil { if cr.Spec.AllowRejection { @@ -89,8 +107,9 @@ func (r *CommittedResourceController) reconcilePending(ctx context.Context, logg } return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, applyErr.Error()) } - logger.Error(applyErr, "pending commitment placement failed, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, applyErr.Error()) + delay := r.retryDelay(cr) + logger.Error(applyErr, "pending commitment placement failed, will retry", "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, r.setNotReadyRetry(ctx, cr, applyErr.Error()) } allReady, anyFailed, failReason, err := r.checkChildReservationStatus(ctx, cr, result.TotalSlots) if err != nil { @@ -104,17 +123,34 @@ func (r *CommittedResourceController) reconcilePending(ctx context.Context, logg } return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, failReason) } - logger.Info("pending commitment placement failed, will retry", "reason", failReason, "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, failReason) + delay := r.retryDelay(cr) + logger.Info("pending commitment placement failed, will retry", "reason", failReason, "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, r.setNotReadyRetry(ctx, cr, failReason) } if !allReady { // Reservation controller hasn't processed all slots yet; Reservation watch will re-enqueue. - return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement") + // Reset the retry timer: applyReservationState just succeeded, so the watch suppression + // gate should not fire while we wait for slots to become ready. 
+ return ctrl.Result{}, r.patchNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement", true) } + logger.Info("committed resource accepted", "generation", cr.Generation, "amount", cr.Spec.Amount.String()) return ctrl.Result{}, r.setAccepted(ctx, cr) } func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) { + logger.Info("reconciling committed resource", + "generation", cr.Generation, + "az", cr.Spec.AvailabilityZone, + "amount", cr.Spec.Amount.String(), + "allowRejection", cr.Spec.AllowRejection, + ) + // If this spec generation was already rejected, maintain rollback state without re-applying. + // Without this guard the controller oscillates: apply bad spec → rollback → + // Reservation watch re-enqueues → apply bad spec again → loop. + if isRejectedForGeneration(cr) { + logger.V(1).Info("spec already rejected for current generation, maintaining rollback state", "generation", cr.Generation) + return ctrl.Result{}, r.rollbackToAccepted(ctx, logger, cr) + } // Spec errors are permanent regardless of AllowRejection — a bad spec won't fix itself. if _, err := FromCommittedResource(*cr); err != nil { logger.Error(err, "invalid commitment spec, rejecting") @@ -123,14 +159,15 @@ func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, lo result, applyErr := r.applyReservationState(ctx, logger, cr) if applyErr != nil { if cr.Spec.AllowRejection { - logger.Error(applyErr, "committed placement failed, rolling back to accepted amount") + logger.Error(applyErr, "committed placement failed, rolling back to accepted spec") if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil { return ctrl.Result{}, rollbackErr } return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, applyErr.Error()) } - logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, applyErr.Error()) + delay := r.retryDelay(cr) + logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, r.setNotReadyRetry(ctx, cr, applyErr.Error()) } allReady, anyFailed, failReason, err := r.checkChildReservationStatus(ctx, cr, result.TotalSlots) if err != nil { @@ -138,19 +175,23 @@ func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, lo } if anyFailed { if cr.Spec.AllowRejection { - logger.Info("committed placement failed, rolling back to accepted amount", "reason", failReason) + logger.Info("committed placement failed, rolling back to accepted spec", "reason", failReason) if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil { return ctrl.Result{}, rollbackErr } return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, failReason) } - logger.Info("committed placement failed, will retry", "reason", failReason, "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, failReason) + delay := r.retryDelay(cr) + logger.Info("committed placement failed, will retry", "reason", failReason, "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, 
r.setNotReadyRetry(ctx, cr, failReason) } if !allReady { // Reservation controller hasn't processed all slots yet; Reservation watch will re-enqueue. - return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement") + // Reset the retry timer: applyReservationState just succeeded, so the watch suppression + // gate should not fire while we wait for slots to become ready. + return ctrl.Result{}, r.patchNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement", true) } + logger.Info("committed resource accepted", "generation", cr.Generation, "amount", cr.Spec.Amount.String()) return ctrl.Result{}, r.setAccepted(ctx, cr) } @@ -230,8 +271,8 @@ func (r *CommittedResourceController) checkChildReservationStatus(ctx context.Co func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alpha1.CommittedResource) error { now := metav1.Now() old := cr.DeepCopy() - acceptedAmount := cr.Spec.Amount.DeepCopy() - cr.Status.AcceptedAmount = &acceptedAmount + specCopy := cr.Spec.DeepCopy() + cr.Status.AcceptedSpec = specCopy cr.Status.AcceptedAt = &now meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{ Type: v1alpha1.CommittedResourceConditionReady, @@ -239,6 +280,7 @@ func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alp Reason: v1alpha1.CommittedResourceReasonAccepted, Message: "commitment successfully reserved", LastTransitionTime: now, + ObservedGeneration: cr.Generation, }) if err := r.Status().Patch(ctx, cr, client.MergeFrom(old)); err != nil { return client.IgnoreNotFound(err) @@ -254,23 +296,17 @@ func (r *CommittedResourceController) reconcileInactive(ctx context.Context, log return ctrl.Result{}, r.setNotReady(ctx, cr, string(cr.Spec.State), "commitment is no longer active") } -func (r *CommittedResourceController) reconcileDeletion(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) { - if err := r.deleteChildReservations(ctx, cr); err != nil { - return ctrl.Result{}, err - } - controllerutil.RemoveFinalizer(cr, crFinalizer) - if err := r.Update(ctx, cr); err != nil { - return ctrl.Result{}, client.IgnoreNotFound(err) - } - logger.Info("committed resource deleted, child reservations cleaned up") - return ctrl.Result{}, nil -} - // deleteChildReservations deletes all Reservation CRDs owned by this CommittedResource, // identified by matching CommitmentUUID in the reservation spec. func (r *CommittedResourceController) deleteChildReservations(ctx context.Context, cr *v1alpha1.CommittedResource) error { + return DeleteChildReservations(ctx, r.Client, cr) +} + +// DeleteChildReservations deletes all Reservation CRDs belonging to cr, matched by CommitmentUUID. +// Called both by the controller on inactive/rollback transitions and by the API handler on CR deletion. 
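+// Matching uses the reservation-type label together with the idxReservationByCommitmentUUID
+// field index, so only reservations created for this commitment are deleted.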
+func DeleteChildReservations(ctx context.Context, k8sClient client.Client, cr *v1alpha1.CommittedResource) error { var list v1alpha1.ReservationList - if err := r.List(ctx, &list, + if err := k8sClient.List(ctx, &list, client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource}, client.MatchingFields{idxReservationByCommitmentUUID: cr.Spec.CommitmentUUID}, ); err != nil { @@ -278,33 +314,39 @@ func (r *CommittedResourceController) deleteChildReservations(ctx context.Contex } for i := range list.Items { res := &list.Items[i] - if err := r.Delete(ctx, res); client.IgnoreNotFound(err) != nil { + if err := k8sClient.Delete(ctx, res); client.IgnoreNotFound(err) != nil { return fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) } } return nil } -// rollbackToAccepted restores child Reservations to match Status.AcceptedAmount. -// If AcceptedAmount is nil (new CR that was never accepted), all child Reservations are deleted. +// rollbackToAccepted restores child Reservations to match Status.AcceptedSpec. +// AcceptedSpec is a full snapshot of the spec at the last successful reconcile, so rollback always +// targets the correct AZ, amount, project, domain — even when the current spec has been mutated. +// If AcceptedSpec is nil (CR was never accepted), all child Reservations are deleted. func (r *CommittedResourceController) rollbackToAccepted(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) error { - if cr.Status.AcceptedAmount == nil { + if cr.Status.AcceptedSpec == nil { return r.deleteChildReservations(ctx, cr) } knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { // Can't compute the rollback target — fall back to full delete rather than leaving - // a partial state that's inconsistent with the unknown AcceptedAmount. + // a partial state that's inconsistent with the unknown accepted state. logger.Error(err, "flavor knowledge unavailable during rollback, deleting all child reservations") return r.deleteChildReservations(ctx, cr) } - state, err := FromCommittedResource(*cr) + + var state *CommitmentState + // Use the full accepted spec snapshot: ensures rollback targets the exact previously-accepted + // placement (AZ, amount, project, domain) even if the current spec has been mutated. + tempCR := v1alpha1.CommittedResource{Spec: *cr.Status.AcceptedSpec} + state, err = FromCommittedResource(tempCR) if err != nil { logger.Error(err, "invalid spec during rollback, deleting all child reservations") return r.deleteChildReservations(ctx, cr) } - state.TotalMemoryBytes = cr.Status.AcceptedAmount.Value() state.NamePrefix = cr.Name + "-" state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx) state.ParentGeneration = cr.Generation @@ -314,27 +356,115 @@ func (r *CommittedResourceController) rollbackToAccepted(ctx context.Context, lo return nil } +// isRejectedForGeneration returns true when the CR's Ready condition is already Rejected +// for the current spec generation. Used to short-circuit re-applying a spec that was +// already tried and rejected in a previous reconcile cycle. 
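+// Example: a spec change at generation 3 fails placement and Ready becomes False/Rejected
+// with ObservedGeneration=3; every later reconcile at generation 3 short-circuits here until
+// the spec changes again and the generation advances.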
+func isRejectedForGeneration(cr *v1alpha1.CommittedResource) bool {
+	cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	return cond != nil &&
+		cond.Status == metav1.ConditionFalse &&
+		cond.Reason == v1alpha1.CommittedResourceReasonRejected &&
+		cond.ObservedGeneration == cr.Generation
+}
+
+// retryDelay computes an exponential backoff interval for the AllowRejection=false retry paths.
+// The exponent is derived from the time already spent in Reserving state: each doubling period
+// advances one step, giving the same base→2*base→4*base→… sequence as a counter would, but
+// without storing a raw count in status.
+// Example: with base=30s, a CR that has spent 2m in Reserving gets exp=3, i.e. a 4m delay.
+// The delay is capped at MaxRequeueInterval.
+func (r *CommittedResourceController) retryDelay(cr *v1alpha1.CommittedResource) time.Duration {
+	base := r.Conf.RequeueIntervalRetry.Duration
+	cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	if cond == nil || cond.Reason != v1alpha1.CommittedResourceReasonReserving {
+		return base
+	}
+	elapsed := time.Since(cond.LastTransitionTime.Time)
+	var exp uint
+	for step := base; elapsed >= step && exp < 6; exp++ {
+		step *= 2
+	}
+	delay := base * time.Duration(uint64(1)<<exp)
+	maxDelay := r.Conf.MaxRequeueInterval.Duration
+	if maxDelay > 0 && delay > maxDelay {
+		return maxDelay
+	}
+	return delay
+}
+
 // setNotReady patches Ready=False on CommittedResource status.
 func (r *CommittedResourceController) setNotReady(ctx context.Context, cr *v1alpha1.CommittedResource, reason, message string) error {
+	return r.patchNotReady(ctx, cr, reason, message, false)
+}
+
+// setNotReadyRetry patches Ready=False/Reserving for the AllowRejection=false retry paths.
+// The retry timer is not reset so that the elapsed time in Reserving state continues to
+// drive the exponential backoff in retryDelay.
+func (r *CommittedResourceController) setNotReadyRetry(ctx context.Context, cr *v1alpha1.CommittedResource, message string) error {
+	return r.patchNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, message, false)
+}
+
+// patchNotReady patches Ready=False with reason/message.
+// resetTimer=true forces LastTransitionTime to be refreshed even if reason is unchanged —
+// use this on the "apply succeeded, waiting for slots" path so the watch suppression gate
+// does not fire while placement is working.
+func (r *CommittedResourceController) patchNotReady(ctx context.Context, cr *v1alpha1.CommittedResource, reason, message string, resetTimer bool) error {
 	old := cr.DeepCopy()
-	meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{
+	setReadyConditionFalse(&cr.Status.Conditions, reason, message, cr.Generation, resetTimer)
+	if err := r.Status().Patch(ctx, cr, client.MergeFrom(old)); err != nil {
+		return client.IgnoreNotFound(err)
+	}
+	return nil
+}
+
+// setReadyConditionFalse sets Ready=False with the given reason/message.
+// Unlike meta.SetStatusCondition, it refreshes LastTransitionTime whenever Status OR Reason
+// changes, so retryDelay and the watch suppression gate always measure time-in-current-reason.
+// resetTimer forces a refresh even when reason is unchanged (use on the "apply succeeded,
+// waiting for slots" path to clear the failure history).
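+// Example: Reserving→Reserving with the same reason keeps the previous LastTransitionTime
+// (the backoff clock keeps running); Rejected→Reserving, or any call with resetTimer=true,
+// stamps a fresh LastTransitionTime.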
+func setReadyConditionFalse(conditions *[]metav1.Condition, reason, message string, generation int64, resetTimer bool) { + now := metav1.Now() + for i, c := range *conditions { + if c.Type != v1alpha1.CommittedResourceConditionReady { + continue + } + newCond := metav1.Condition{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: reason, + Message: message, + ObservedGeneration: generation, + } + if !resetTimer && c.Status == metav1.ConditionFalse && c.Reason == reason { + newCond.LastTransitionTime = c.LastTransitionTime + } else { + newCond.LastTransitionTime = now + } + (*conditions)[i] = newCond + return + } + *conditions = append(*conditions, metav1.Condition{ Type: v1alpha1.CommittedResourceConditionReady, Status: metav1.ConditionFalse, Reason: reason, Message: message, - LastTransitionTime: metav1.Now(), + ObservedGeneration: generation, + LastTransitionTime: now, }) - if err := r.Status().Patch(ctx, cr, client.MergeFrom(old)); err != nil { - return client.IgnoreNotFound(err) - } - return nil } // SetupWithManager sets up the controller with the Manager. func (r *CommittedResourceController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { ctx := context.Background() - if err := IndexFields(ctx, mcl); err != nil { - return fmt.Errorf("failed to set up field indexes: %w", err) + if err := indexReservationByCommitmentUUID(ctx, mcl); err != nil { + return fmt.Errorf("failed to set up reservation field index: %w", err) + } + // Also register idxCommittedResourceByUUID here: the Reservation watch handler uses it to map + // Reservation→CR. The UsageReconciler registers the same index, but it may not be set up when + // commitmentsUsageDB is unconfigured. The once-guard in field_index.go makes this idempotent. + if err := indexCommittedResourceByUUID(ctx, mcl); err != nil { + return fmt.Errorf("failed to set up committed resource field index: %w", err) + } + if err := indexCommittedResourceByProjectID(ctx, mcl); err != nil { + return fmt.Errorf("failed to set up committed resource project index: %w", err) } bldr := multicluster.BuildController(mcl, mgr) @@ -347,6 +477,10 @@ func (r *CommittedResourceController) SetupWithManager(mgr ctrl.Manager, mcl *mu return err } // Re-enqueue the parent CommittedResource when a child Reservation changes (e.g. external deletion). + // Suppressed when the CR has been continuously in Reserving state for longer than + // maxReservingAgeForSlowdown: a broken rollback creates reservation churn that would bypass + // RequeueAfter and keep the controller in a tight loop. The Reservation watch is the fast path + // for normal "waiting for placement" transitions; the RequeueAfter backoff handles retry. bldr, err = bldr.WatchesMulticluster( &v1alpha1.Reservation{}, handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request { @@ -363,7 +497,21 @@ func (r *CommittedResourceController) SetupWithManager(mgr ctrl.Manager, mcl *mu if len(crList.Items) == 0 { return nil } - return []ctrl.Request{{NamespacedName: types.NamespacedName{Name: crList.Items[0].Name}}} + cr := &crList.Items[0] + // Suppress fast-path re-enqueues only when the reservation belongs to the current + // generation AND the CR has been in Reserving state for too long. A new spec (higher + // generation) gets a fresh start. 
+ readyCond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if readyCond != nil && + readyCond.Reason == v1alpha1.CommittedResourceReasonReserving && + readyCond.ObservedGeneration == cr.Generation && + res.Spec.CommittedResourceReservation.ParentGeneration == cr.Generation && + time.Since(readyCond.LastTransitionTime.Time) > maxReservingAgeForSlowdown { + LoggerFromContext(ctx).V(1).Info("Reserving state age exceeded threshold, watch re-enqueues suppressed — retrying via backoff timer only", + "name", cr.Name, "reservingAge", time.Since(readyCond.LastTransitionTime.Time).Round(time.Second), "threshold", maxReservingAgeForSlowdown) + return nil + } + return []ctrl.Request{{NamespacedName: types.NamespacedName{Name: cr.Name}}} }), ) if err != nil { diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go index 1029ec997..e5c243478 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go @@ -28,13 +28,10 @@ import ( // ============================================================================ // newTestCommittedResource returns a CommittedResource with sensible defaults. -// The finalizer is pre-populated so tests can call Reconcile once without a -// separate finalizer-add round-trip. func newTestCommittedResource(name string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource { return &v1alpha1.CommittedResource{ ObjectMeta: metav1.ObjectMeta{ - Name: name, - Finalizers: []string{crFinalizer}, + Name: name, }, Spec: v1alpha1.CommittedResourceSpec{ CommitmentUUID: "test-uuid-1234", @@ -121,6 +118,13 @@ func newCRTestClient(scheme *runtime.Scheme, objects ...client.Object) client.Cl } return []string{cr.Spec.CommitmentUUID} }). + WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByProjectID, func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.ProjectID == "" { + return nil + } + return []string{cr.Spec.ProjectID} + }). Build() } @@ -283,8 +287,11 @@ func TestCommittedResourceController_Reconcile(t *testing.T) { if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &updated); err != nil { t.Fatalf("get CR: %v", err) } - if updated.Status.AcceptedAmount == nil { - t.Errorf("expected AcceptedAmount to be set on acceptance") + if updated.Status.AcceptedSpec == nil { + t.Errorf("expected AcceptedSpec to be set on acceptance") + } else if updated.Status.AcceptedSpec.AvailabilityZone != cr.Spec.AvailabilityZone { + t.Errorf("AcceptedSpec.AvailabilityZone: want %q, got %q", + cr.Spec.AvailabilityZone, updated.Status.AcceptedSpec.AvailabilityZone) } } }) @@ -410,11 +417,11 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) { func TestCommittedResourceController_Rollback(t *testing.T) { scheme := newCRTestScheme(t) - // CR at generation 2; AcceptedAmount reflects what was accepted at generation 1. + // CR at generation 2; AcceptedSpec reflects what was accepted at generation 1. 
cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) cr.Generation = 2 - accepted := resource.MustParse("4Gi") - cr.Status.AcceptedAmount = &accepted + acceptedSpec := cr.Spec + cr.Status.AcceptedSpec = &acceptedSpec // Existing reservation with stale ParentGeneration from the previous generation. existing := &v1alpha1.Reservation{ @@ -458,12 +465,309 @@ func TestCommittedResourceController_Rollback(t *testing.T) { } } +// TestCommittedResourceController_RollbackUsesAcceptedSpecAZ verifies that rollbackToAccepted +// targets the AZ from AcceptedSpec, not from the current (mutated) Spec. This is the core fix +// for the oscillation bug where a failed AZ change left the CR stuck placing reservations in +// the wrong AZ on every retry. +func TestCommittedResourceController_RollbackUsesAcceptedSpecAZ(t *testing.T) { + scheme := newCRTestScheme(t) + + // Spec has been mutated to a new AZ that failed placement. + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Spec.AvailabilityZone = "new-az" // the failed AZ + cr.Generation = 2 + + acceptedSpec := cr.Spec + acceptedSpec.AvailabilityZone = "accepted-az" // last successfully placed AZ + cr.Status.AcceptedSpec = &acceptedSpec + + // Existing reservation was placed in the wrong AZ by the failed rollback. + existing := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cr-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + AvailabilityZone: "new-az", // wrong AZ + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("4Gi"), + hv1.ResourceCPU: resource.MustParse("2"), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: "test-uuid-1234", + ProjectID: "test-project", + DomainID: "test-domain", + ResourceGroup: "test-group", + ParentGeneration: 2, + }, + }, + } + + k8sClient := newCRTestClient(scheme, cr, existing, newTestFlavorKnowledge()) + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} + + if err := controller.rollbackToAccepted(context.Background(), logr.Discard(), cr); err != nil { + t.Fatalf("rollbackToAccepted: %v", err) + } + + // The reservation manager deletes the wrong-AZ reservation and creates a new one + // with the accepted AZ from AcceptedSpec. + var list v1alpha1.ReservationList + if err := k8sClient.List(context.Background(), &list, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + if len(list.Items) != 1 { + t.Fatalf("expected 1 reservation after rollback, got %d", len(list.Items)) + } + if got := list.Items[0].Spec.AvailabilityZone; got != "accepted-az" { + t.Errorf("rollback: reservation AZ: want %q (from AcceptedSpec), got %q (wrong: from current Spec)", "accepted-az", got) + } +} + +// TestCommittedResourceController_RollbackNilAcceptedSpec verifies that when AcceptedSpec is +// absent (pre-dates the field), rollbackToAccepted deletes child reservations rather than +// attempting a rollback with stale/wrong placement data. The controller repairs state on +// the next reconcile via ApplyCommitmentState. 
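+// With no accepted baseline there is nothing safe to roll back to, so deleting the children
+// and letting the next reconcile re-place from scratch is the conservative choice.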
+func TestCommittedResourceController_RollbackNilAcceptedSpec(t *testing.T) {
+	scheme := newCRTestScheme(t)
+
+	cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed)
+	cr.Generation = 2
+	// AcceptedSpec intentionally nil — simulates a CR that was never successfully accepted.
+
+	existing := &v1alpha1.Reservation{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "test-cr-0",
+			Labels: map[string]string{
+				v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+			},
+		},
+		Spec: v1alpha1.ReservationSpec{
+			Type: v1alpha1.ReservationTypeCommittedResource,
+			CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{
+				CommitmentUUID: "test-uuid-1234",
+				ProjectID:      "test-project",
+			},
+		},
+	}
+
+	k8sClient := newCRTestClient(scheme, cr, existing, newTestFlavorKnowledge())
+	controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
+
+	if err := controller.rollbackToAccepted(context.Background(), logr.Discard(), cr); err != nil {
+		t.Fatalf("rollbackToAccepted: %v", err)
+	}
+
+	var list v1alpha1.ReservationList
+	if err := k8sClient.List(context.Background(), &list, client.MatchingLabels{
+		v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+	}); err != nil {
+		t.Fatalf("list reservations: %v", err)
+	}
+	if len(list.Items) != 0 {
+		t.Errorf("expected all reservations deleted when AcceptedSpec is nil, got %d", len(list.Items))
+	}
+}
+
+// TestCommittedResourceController_RejectedStaysRejected verifies that a CR rejected on one
+// reconcile cycle stays rejected on subsequent cycles triggered by Reservation watch events,
+// without re-applying the bad spec. This is the oscillation regression test: without the
+// isRejectedForGeneration guard the controller would re-apply the bad spec on every
+// Reservation watch re-enqueue, undoing the rollback each time.
+func TestCommittedResourceController_RejectedStaysRejected(t *testing.T) {
+	tests := []struct {
+		name  string
+		state v1alpha1.CommitmentStatus
+	}{
+		{name: "confirmed", state: v1alpha1.CommitmentStatusConfirmed},
+		{name: "pending", state: v1alpha1.CommitmentStatusPending},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			scheme := newCRTestScheme(t)
+
+			// CR was previously accepted at AZ "accepted-az".
+			cr := newTestCommittedResource("test-cr", tt.state)
+			cr.Spec.AllowRejection = true
+			cr.Spec.AvailabilityZone = "bad-az" // spec was mutated to a failing AZ
+			cr.Generation = 2
+			acceptedSpec := cr.Spec.DeepCopy()
+			acceptedSpec.AvailabilityZone = "accepted-az"
+			cr.Status.AcceptedSpec = acceptedSpec
+
+			// No Knowledge → placement always fails.
+			k8sClient := newCRTestClient(scheme, cr)
+			controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
+
+			// Reconcile 1: applies bad spec → fails → rollback + Rejected.
+			if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
+				t.Fatalf("reconcile 1: %v", err)
+			}
+			assertCondition(t, k8sClient, cr.Name, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected)
+
+			// Reconcile 2: simulates Reservation watch re-enqueue after rollback.
+			// Must stay Rejected without re-applying the bad spec.
+ if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { + t.Fatalf("reconcile 2: %v", err) + } + assertCondition(t, k8sClient, cr.Name, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) + + // Reconcile 3: another watch event — still stable. + if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { + t.Fatalf("reconcile 3: %v", err) + } + assertCondition(t, k8sClient, cr.Name, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) + + // For committed state: rollback reservations should be in accepted-az, not bad-az. + if tt.state == v1alpha1.CommitmentStatusConfirmed { + var list v1alpha1.ReservationList + if err := k8sClient.List(context.Background(), &list, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + for _, res := range list.Items { + if res.Spec.AvailabilityZone == "bad-az" { + t.Errorf("rollback reservation still points to bad-az after %d reconciles — oscillation not fixed", 3) + } + } + } + }) + } +} + +func TestCommittedResourceController_RetryBackoff(t *testing.T) { + scheme := newCRTestScheme(t) + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Spec.AllowRejection = false + base := 30 * time.Second + k8sClient := newCRTestClient(scheme, cr) // no Knowledge → placement fails + controller := &CommittedResourceController{ + Client: k8sClient, + Scheme: scheme, + Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: base}}, + } + + getCR := func() v1alpha1.CommittedResource { + t.Helper() + var updated v1alpha1.CommittedResource + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &updated); err != nil { + t.Fatalf("get CR: %v", err) + } + return updated + } + + // First failure: Reserving condition does not exist yet → delay = base * 2^0. + result1, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("reconcile 1: %v", err) + } + if result1.RequeueAfter != base { + t.Errorf("after failure 1: RequeueAfter want %v, got %v", base, result1.RequeueAfter) + } + cond1 := meta.FindStatusCondition(getCR().Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond1 == nil || cond1.Reason != v1alpha1.CommittedResourceReasonReserving { + t.Fatalf("after failure 1: expected Ready=False/Reserving condition") + } + + // Second failure: simulate base seconds elapsed by back-dating the condition's LastTransitionTime. + cr2 := getCR() + old2 := cr2.DeepCopy() + for i, c := range cr2.Status.Conditions { + if c.Type == v1alpha1.CommittedResourceConditionReady { + cr2.Status.Conditions[i].LastTransitionTime = metav1.NewTime(time.Now().Add(-base)) + } + } + if err := k8sClient.Status().Patch(context.Background(), &cr2, client.MergeFrom(old2)); err != nil { + t.Fatalf("back-date condition: %v", err) + } + result2, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("reconcile 2: %v", err) + } + if result2.RequeueAfter != 2*base { + t.Errorf("after failure 2: RequeueAfter want %v, got %v", 2*base, result2.RequeueAfter) + } + + // Add Knowledge so placement succeeds; simulate reservation controller marking ready. 
+ if err := k8sClient.Create(context.Background(), newTestFlavorKnowledge()); err != nil { + t.Fatalf("create knowledge: %v", err) + } + if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { + t.Fatalf("reconcile 3 (apply): %v", err) + } + setChildReservationsReady(t, k8sClient, cr.Spec.CommitmentUUID) + if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { + t.Fatalf("reconcile 4 (accept): %v", err) + } + + // After acceptance the Reserving condition is gone (replaced by Ready=True/Accepted). + acceptedCond := meta.FindStatusCondition(getCR().Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if acceptedCond == nil || acceptedCond.Status != metav1.ConditionTrue { + t.Errorf("after acceptance: expected Ready=True condition") + } +} + +func TestRetryDelay(t *testing.T) { + base := 30 * time.Second + maxDelay := 30 * time.Minute + controller := &CommittedResourceController{ + Conf: CommittedResourceControllerConfig{ + RequeueIntervalRetry: metav1.Duration{Duration: base}, + MaxRequeueInterval: metav1.Duration{Duration: maxDelay}, + }, + } + tests := []struct { + elapsed time.Duration + want time.Duration + }{ + // Windows with base=30s: [0,30s)→30s, [30s,60s)→60s, [60s,120s)→120s, + // [120s,240s)→240s, [240s,480s)→480s, [480s,960s)→960s(16m), [960s,∞)→capped 30m. + // Use mid-window values to avoid boundary flakiness from time.Since epsilon. + {0, 30 * time.Second}, // start of first window + {15 * time.Second, base}, // mid [0s,30s) + {45 * time.Second, 2 * base}, // mid [30s,60s) + {90 * time.Second, 4 * base}, // mid [60s,120s) + {3 * time.Minute, 8 * base}, // mid [120s,240s) + {6 * time.Minute, 16 * base}, // mid [240s,480s) + {12 * time.Minute, 32 * base}, // mid [480s,960s) = 16m + {20 * time.Minute, 30 * time.Minute}, // [960s,∞) → capped + {60 * time.Minute, 30 * time.Minute}, // well beyond cap + } + for _, tt := range tests { + ltt := metav1.NewTime(time.Now().Add(-tt.elapsed)) + cr := &v1alpha1.CommittedResource{ + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{ + { + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonReserving, + LastTransitionTime: ltt, + }, + }, + }, + } + if got := controller.retryDelay(cr); got != tt.want { + t.Errorf("elapsed=%v: want %v, got %v", tt.elapsed, tt.want, got) + } + } +} + func TestCommittedResourceController_BadSpec(t *testing.T) { scheme := newCRTestScheme(t) cr := &v1alpha1.CommittedResource{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-cr", - Finalizers: []string{crFinalizer}, + Name: "test-cr", }, Spec: v1alpha1.CommittedResourceSpec{ CommitmentUUID: "x", // too short, fails commitmentUUIDPattern @@ -593,39 +897,3 @@ func TestCheckChildReservationStatus_GenerationGuard(t *testing.T) { }) } } - -func TestCommittedResourceController_Deletion(t *testing.T) { - scheme := newCRTestScheme(t) - cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) - child := &v1alpha1.Reservation{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-cr-0", - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }, - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - CommitmentUUID: "test-uuid-1234", - }, - }, - } - k8sClient := newCRTestClient(scheme, cr, child) - controller := 
&CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}}
-
-	if err := k8sClient.Delete(context.Background(), cr); err != nil {
-		t.Fatalf("delete CR: %v", err)
-	}
-	if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil {
-		t.Fatalf("reconcile: %v", err)
-	}
-
-	if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 0 {
-		t.Errorf("expected 0 child reservations after deletion, got %d", got)
-	}
-	var deleted v1alpha1.CommittedResource
-	if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &deleted); err == nil {
-		t.Errorf("expected CR to be gone after deletion, but it still exists with finalizers=%v", deleted.Finalizers)
-	}
-}
diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go
index c30c87953..e4005df47 100644
--- a/internal/scheduling/reservations/commitments/config.go
+++ b/internal/scheduling/reservations/commitments/config.go
@@ -15,13 +15,37 @@ import (
 type Config struct {
 	ReservationController       ReservationControllerConfig       `json:"committedResourceReservationController"`
 	CommittedResourceController CommittedResourceControllerConfig `json:"committedResourceController"`
+	UsageReconciler             UsageReconcilerConfig             `json:"committedResourceUsageReconciler"`
 	API                         APIConfig                         `json:"committedResourceAPI"`
 
 	// DatasourceName is the name of the Datasource CRD that provides database
-	// connection info. Used to construct the UsageDBClient for report-usage.
+	// connection info. Used to construct the UsageDBClient for report-usage and the usage reconciler.
 	DatasourceName string `json:"datasourceName,omitempty"`
 }
 
+// UsageReconcilerConfig holds tuning knobs for the usage reconciler.
+type UsageReconcilerConfig struct {
+	// CooldownInterval is the minimum time between usage reconcile runs for the same CommittedResource.
+	// If a reconcile ran within this window, the next trigger is deferred until the window expires.
+	// This interval also acts as the periodic fallback: every successful reconcile schedules the
+	// next run after this duration so that changes not caught by watches are still picked up.
+	CooldownInterval metav1.Duration `json:"cooldownInterval"`
+}
+
+func DefaultUsageReconcilerConfig() UsageReconcilerConfig {
+	return UsageReconcilerConfig{
+		CooldownInterval: metav1.Duration{Duration: 5 * time.Minute},
+	}
+}
+
+// ApplyDefaults fills in zero-value fields from the defaults, leaving explicitly configured values intact.
+func (c *UsageReconcilerConfig) ApplyDefaults() {
+	d := DefaultUsageReconcilerConfig()
+	if c.CooldownInterval.Duration == 0 {
+		c.CooldownInterval = d.CooldownInterval
+	}
+}
+
 // ReservationControllerConfig holds tuning knobs for the Reservation CRD controller.
 type ReservationControllerConfig struct {
 	// RequeueIntervalActive is how often to re-verify a healthy Reservation CRD.
@@ -45,8 +69,32 @@ type ReservationControllerConfig struct {
 
 // CommittedResourceControllerConfig holds tuning knobs for the CommittedResource CRD controller.
 type CommittedResourceControllerConfig struct {
-	// RequeueIntervalRetry is the back-off interval when placement is pending or failed.
+	// RequeueIntervalRetry is the base back-off interval when placement fails (AllowRejection=false path).
+	// The actual delay doubles with each consecutive failure: base * 2^min(failures, 6), capped at MaxRequeueInterval.
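+	// With the defaults (30s base, 30m cap) the retry sequence is:
+	// 30s, 1m, 2m, 4m, 8m, 16m, then 30m for every retry thereafter.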
+ // If zero (unconfigured), backoff is disabled and the controller retries immediately on every failure. RequeueIntervalRetry metav1.Duration `json:"requeueIntervalRetry"` + + // MaxRequeueInterval caps the exponential backoff delay. + // Once this ceiling is reached, every subsequent retry fires after exactly this interval. + MaxRequeueInterval metav1.Duration `json:"maxRequeueInterval"` +} + +func DefaultCommittedResourceControllerConfig() CommittedResourceControllerConfig { + return CommittedResourceControllerConfig{ + RequeueIntervalRetry: metav1.Duration{Duration: 30 * time.Second}, + MaxRequeueInterval: metav1.Duration{Duration: 30 * time.Minute}, + } +} + +// ApplyDefaults fills in zero-value fields from the defaults, leaving explicitly configured values intact. +func (c *CommittedResourceControllerConfig) ApplyDefaults() { + d := DefaultCommittedResourceControllerConfig() + if c.RequeueIntervalRetry.Duration == 0 { + c.RequeueIntervalRetry = d.RequeueIntervalRetry + } + if c.MaxRequeueInterval.Duration == 0 { + c.MaxRequeueInterval = d.MaxRequeueInterval + } } // ResourceTypeConfig holds per-resource flags for a single resource type within a flavor group. @@ -72,6 +120,9 @@ type APIConfig struct { EnableReportUsage bool `json:"enableReportUsage"` // EnableReportCapacity controls whether the report-capacity endpoint is active. EnableReportCapacity bool `json:"enableReportCapacity"` + // EnableQuotaAPI controls whether the quota API endpoint is active. + // When false, the endpoint will return HTTP 503 Service Unavailable. + EnableQuotaAPI bool `json:"enableQuota"` // WatchTimeout is how long the change-commitments handler polls CommittedResource // CRD conditions before giving up and rolling back. WatchTimeout metav1.Duration `json:"watchTimeout"` @@ -101,6 +152,7 @@ func DefaultAPIConfig() APIConfig { EnableChangeCommitments: true, EnableReportUsage: true, EnableReportCapacity: true, + EnableQuotaAPI: true, WatchTimeout: metav1.Duration{Duration: 10 * time.Second}, WatchPollInterval: metav1.Duration{Duration: 500 * time.Millisecond}, } diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go index cd4b15d05..dbdbef40e 100644 --- a/internal/scheduling/reservations/commitments/e2e_checks.go +++ b/internal/scheduling/reservations/commitments/e2e_checks.go @@ -24,7 +24,7 @@ const ( // This should match the service name in the helm chart. defaultCommitmentsAPIURL = "http://cortex-nova-scheduler:8080" - // defaultE2EProjectUUID is a well-known fake project UUID used when no TestProjectID is configured. + // defaultE2EProjectUUID is a well-known fake project UUID used when no ProjectID is configured. // It is intentionally not a real OpenStack project — commitments created under it self-expire. defaultE2EProjectUUID = "00000000-0000-0000-0000-000000000e2e" ) @@ -33,12 +33,23 @@ const ( type E2EChecksConfig struct { // BaseURL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL. BaseURL string `json:"baseURL"` - // RoundTripCheck holds optional overrides for the round-trip check. - // If nil, defaults are used: testProjectID = defaultE2EProjectUUID, az = "". + // NoCleanup prevents test commitments from being deleted after a successful run. + // Useful for local inspection with Tilt. Zero value (false) means cleanup runs normally. + NoCleanup bool `json:"noCleanup,omitempty"` + // ProjectID is the OpenStack project UUID for all e2e test commitments. 
+ // If empty, falls back to RoundTripCheck.TestProjectID, then defaultE2EProjectUUID. + ProjectID string `json:"projectID,omitempty"` + // AZs is the list of availability zones to test. If empty, falls back to + // RoundTripCheck.AZ, then uses "" (any AZ). + AZs []string `json:"azs,omitempty"` + // RoundTripCheck holds optional overrides for backward compatibility. + // Prefer the top-level ProjectID and AZs fields for new configurations. RoundTripCheck *E2ERoundTripConfig `json:"roundTripCheck,omitempty"` } // E2ERoundTripConfig holds optional overrides for the create→delete round-trip e2e check. +// +// Deprecated: use E2EChecksConfig.ProjectID and E2EChecksConfig.AZs instead. type E2ERoundTripConfig struct { // AZ is the availability zone to use (e.g. "qa-de-1d"). Defaults to "" if not set. AZ string `json:"az"` @@ -47,6 +58,33 @@ type E2ERoundTripConfig struct { TestProjectID string `json:"testProjectID"` } +// e2eProjectID returns the effective project UUID for e2e tests. +func e2eProjectID(config E2EChecksConfig) liquid.ProjectUUID { + if config.ProjectID != "" { + return liquid.ProjectUUID(config.ProjectID) + } + if rt := config.RoundTripCheck; rt != nil && rt.TestProjectID != "" { + return liquid.ProjectUUID(rt.TestProjectID) + } + return liquid.ProjectUUID(defaultE2EProjectUUID) +} + +// e2eAZs returns the effective AZ list for e2e tests. +// Falls back to RoundTripCheck.AZ (single AZ), then to [""] (any AZ). +func e2eAZs(config E2EChecksConfig) []liquid.AvailabilityZone { + if len(config.AZs) > 0 { + azs := make([]liquid.AvailabilityZone, len(config.AZs)) + for i, az := range config.AZs { + azs[i] = liquid.AvailabilityZone(az) + } + return azs + } + if rt := config.RoundTripCheck; rt != nil && rt.AZ != "" { + return []liquid.AvailabilityZone{liquid.AvailabilityZone(rt.AZ)} + } + return []liquid.AvailabilityZone{""} +} + // CheckCommitmentsInfoEndpoint verifies that GET /commitments/v1/info returns 200 with a valid ServiceInfo. func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { baseURL := e2eBaseURL(config) @@ -79,34 +117,28 @@ func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { ) } -// CheckCommitmentsRoundTrip iterates all HandlesCommitments resources from /info and for each one: +// CheckCommitmentsRoundTrip iterates all HandlesCommitments resources from /info and for each (AZ, resource) pair: // 1. Creates a confirmed test commitment (amount=2, expires in 5 minutes) -// 2. If accepted: calls the usage API to verify it returns 200, then deletes the commitment +// 2. If accepted: calls the usage API to verify it returns a valid response, then deletes the commitment // 3. If rejected: logs the reason and continues — capacity rejection is not an error // // Panics on infrastructure failures (non-200 from the API, deletion failure after acceptance). 
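+// For example, with AZs=["az-a","az-b"] and two HandlesCommitments resources, four round
+// trips run: (az-a, r1), (az-a, r2), (az-b, r1), (az-b, r2).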
func CheckCommitmentsRoundTrip(ctx context.Context, config E2EChecksConfig) { baseURL := e2eBaseURL(config) - az := liquid.AvailabilityZone("") - projectID := liquid.ProjectUUID(defaultE2EProjectUUID) - if rt := config.RoundTripCheck; rt != nil { - if rt.AZ != "" { - az = liquid.AvailabilityZone(rt.AZ) - } - if rt.TestProjectID != "" { - projectID = liquid.ProjectUUID(rt.TestProjectID) - } - } + projectID := e2eProjectID(config) + azs := e2eAZs(config) serviceInfo := e2eFetchServiceInfo(ctx, baseURL) checked := 0 - for resourceName, resInfo := range serviceInfo.Resources { - if !resInfo.HandlesCommitments { - continue + for _, az := range azs { + for resourceName, resInfo := range serviceInfo.Resources { + if !resInfo.HandlesCommitments { + continue + } + e2eRoundTripResource(ctx, baseURL, serviceInfo.Version, az, projectID, resourceName, config.NoCleanup) + checked++ } - e2eRoundTripResource(ctx, baseURL, serviceInfo.Version, az, projectID, resourceName) - checked++ } if checked == 0 { @@ -114,7 +146,7 @@ func CheckCommitmentsRoundTrip(ctx context.Context, config E2EChecksConfig) { } } -// e2eRoundTripResource runs the create→usageCheck→delete cycle for one resource. +// e2eRoundTripResource runs the create→usageCheck→delete cycle for one (AZ, resource) pair. func e2eRoundTripResource( ctx context.Context, baseURL string, @@ -122,6 +154,7 @@ func e2eRoundTripResource( az liquid.AvailabilityZone, projectID liquid.ProjectUUID, resourceName liquid.ResourceName, + noCleanup bool, ) { testUUID := liquid.CommitmentUUID(fmt.Sprintf("e2e-%d", time.Now().UnixMilli())) @@ -167,6 +200,11 @@ func e2eRoundTripResource( // Register cleanup immediately so it runs even if the usage check panics. defer func() { + if noCleanup { + slog.Info("round-trip check: NoCleanup=true, keeping commitment for inspection", + "resource", resourceName, "uuid", testUUID) + return + } deleteReq := liquid.CommitmentChangeRequest{ InfoVersion: infoVersion, AZ: az, @@ -194,13 +232,227 @@ func e2eRoundTripResource( slog.Info("round-trip check: commitment deleted", "resource", resourceName, "uuid", testUUID) }() - // Smoke-check the usage API: verifies the usage calculation pipeline works for this project. - e2eCheckUsageAPI(ctx, baseURL, az, projectID) + report := e2eFetchUsageReport(ctx, baseURL, az, projectID) + e2eLogUsageReport(report, az, projectID) +} + +// CheckCommitmentsMultiFlavorGroupBatch exercises the pending→batch-confirm flow for each (AZ, resource) pair. +// For each pair it: +// 1. Creates a pending commitment (UUID:A, amount=1, ExpiresAt=10min) — non-confirming, always accepted +// 2. Sends an atomic batch: UUID:A pending→confirmed amount=3, UUID:B new confirmed amount=1 +// 3. If the batch is rejected (no capacity): cleans up the pending UUID:A and continues +// 4. If the batch is accepted: parses and logs the usage report; deletes unless NoCleanup=true +// +// This exercises the all-or-nothing semantics of change-commitments: if capacity for the full +// batch (4 units) is unavailable, neither UUID:A nor UUID:B is confirmed. 
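+// A rejected batch therefore leaves TotalConfirmed at 0: UUID:A stays pending (and is
+// cleaned up in step 3) and UUID:B is never created.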
+func CheckCommitmentsMultiFlavorGroupBatch(ctx context.Context, config E2EChecksConfig) { + baseURL := e2eBaseURL(config) + projectID := e2eProjectID(config) + azs := e2eAZs(config) + + serviceInfo := e2eFetchServiceInfo(ctx, baseURL) + + checked := 0 + for _, az := range azs { + for resourceName, resInfo := range serviceInfo.Resources { + if !resInfo.HandlesCommitments { + continue + } + e2eBatchFlavorGroupResource(ctx, baseURL, serviceInfo.Version, az, projectID, resourceName, config.NoCleanup) + checked++ + } + } + + if checked == 0 { + slog.Warn("batch check: no HandlesCommitments resources found in /info — nothing checked") + } } -// e2eCheckUsageAPI calls POST /commitments/v1/projects/:id/report-usage and verifies 200. -// The usage report for a project with no VMs will show zero usage — we only verify the endpoint works. -func e2eCheckUsageAPI(ctx context.Context, baseURL string, az liquid.AvailabilityZone, projectID liquid.ProjectUUID) { +// e2eBatchFlavorGroupResource runs the pending→batch-confirm cycle for one (AZ, resource) pair. +func e2eBatchFlavorGroupResource( + ctx context.Context, + baseURL string, + infoVersion int64, + az liquid.AvailabilityZone, + projectID liquid.ProjectUUID, + resourceName liquid.ResourceName, + noCleanup bool, +) { + + now := time.Now() + expiresAt := now.Add(10 * time.Minute) + uuidA := liquid.CommitmentUUID(fmt.Sprintf("e2e-batch-a-%d", now.UnixMilli())) + uuidB := liquid.CommitmentUUID(fmt.Sprintf("e2e-batch-b-%d", now.UnixMilli())) + + const ( + pendingAmountA = uint64(1) + confirmedAmountA = uint64(3) + confirmedAmountB = uint64(1) + ) + + // Request 1: create UUID:A as pending. + // Pending creation is non-confirming (totals unchanged) — always accepted, no rejection possible. + req1 := liquid.CommitmentChangeRequest{ + InfoVersion: infoVersion, + AZ: az, + ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{ + projectID: { + ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{ + resourceName: { + Commitments: []liquid.Commitment{{ + UUID: uuidA, + Amount: pendingAmountA, + NewStatus: Some(liquid.CommitmentStatusPending), + ExpiresAt: expiresAt, + }}, + }, + }, + }, + }, + } + + slog.Info("batch check: creating pending commitment", + "resource", resourceName, "uuid", uuidA, "project", projectID, "az", az) + if reason := e2eSendChangeCommitments(ctx, baseURL, req1); reason != "" { + panic(fmt.Sprintf("batch check: unexpected rejection for pending creation of %s: %s", resourceName, reason)) + } + slog.Info("batch check: pending commitment accepted", "resource", resourceName, "uuid", uuidA) + + // cleanup is updated as the state advances; the deferred call always runs the latest version. 
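+	// Go evaluates the deferred closure's body at return time, so `cleanup` is re-read then:
+	// each reassignment below swaps which teardown runs, and setting it to nil (NoCleanup)
+	// disables teardown entirely.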
+ var cleanup func() + defer func() { + if cleanup != nil { + cleanup() + } + }() + cleanup = func() { + req := liquid.CommitmentChangeRequest{ + InfoVersion: infoVersion, + AZ: az, + ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{ + projectID: { + ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{ + resourceName: { + Commitments: []liquid.Commitment{{ + UUID: uuidA, + Amount: pendingAmountA, + OldStatus: Some(liquid.CommitmentStatusPending), + NewStatus: None[liquid.CommitmentStatus](), + ExpiresAt: expiresAt, + }}, + }, + }, + }, + }, + } + slog.Info("batch check: deleting pending commitment", "resource", resourceName, "uuid", uuidA) + if reason := e2eSendChangeCommitments(ctx, baseURL, req); reason != "" { + panic(fmt.Sprintf("batch check: delete of pending commitment %s rejected: %s", uuidA, reason)) + } + slog.Info("batch check: pending commitment deleted", "resource", resourceName, "uuid", uuidA) + } + + // Request 2: atomic bundle — UUID:A pending→confirmed (grown to amount=3), UUID:B new confirmed amount=1. + // RequiresConfirmation=true because TotalConfirmed changes from 0 to 4. + // If capacity for all 4 units is unavailable, the whole bundle is rejected together. + req2 := liquid.CommitmentChangeRequest{ + InfoVersion: infoVersion, + AZ: az, + ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{ + projectID: { + ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{ + resourceName: { + TotalConfirmedBefore: 0, + TotalConfirmedAfter: confirmedAmountA + confirmedAmountB, + Commitments: []liquid.Commitment{ + { + UUID: uuidA, + Amount: confirmedAmountA, + OldStatus: Some(liquid.CommitmentStatusPending), + NewStatus: Some(liquid.CommitmentStatusConfirmed), + ExpiresAt: expiresAt, + }, + { + UUID: uuidB, + Amount: confirmedAmountB, + NewStatus: Some(liquid.CommitmentStatusConfirmed), + ExpiresAt: expiresAt, + }, + }, + }, + }, + }, + }, + } + + slog.Info("batch check: sending batch confirmation", + "resource", resourceName, "uuidA", uuidA, "uuidB", uuidB, + "totalConfirmed", confirmedAmountA+confirmedAmountB, + "project", projectID, "az", az) + + if reason := e2eSendChangeCommitments(ctx, baseURL, req2); reason != "" { + if !strings.Contains(reason, "no hosts found") { + panic(fmt.Sprintf("batch check: unexpected rejection for batch of %s: %s", resourceName, reason)) + } + slog.Info("batch check: batch rejected — no capacity for full amount, cleanup will remove pending", + "resource", resourceName, "reason", reason) + return + } + slog.Info("batch check: batch confirmed", "resource", resourceName, "uuidA", uuidA, "uuidB", uuidB) + + report := e2eFetchUsageReport(ctx, baseURL, az, projectID) + e2eLogUsageReport(report, az, projectID) + + if noCleanup { + slog.Info("batch check: NoCleanup=true, keeping commitments for inspection", + "resource", resourceName, "uuidA", uuidA, "uuidB", uuidB, "project", projectID) + cleanup = nil + return + } + + // Advance cleanup: delete both confirmed commitments. 
+ cleanup = func() { + req := liquid.CommitmentChangeRequest{ + InfoVersion: infoVersion, + AZ: az, + ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{ + projectID: { + ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{ + resourceName: { + TotalConfirmedBefore: confirmedAmountA + confirmedAmountB, + TotalConfirmedAfter: 0, + Commitments: []liquid.Commitment{ + { + UUID: uuidA, + Amount: confirmedAmountA, + OldStatus: Some(liquid.CommitmentStatusConfirmed), + NewStatus: None[liquid.CommitmentStatus](), + ExpiresAt: expiresAt, + }, + { + UUID: uuidB, + Amount: confirmedAmountB, + OldStatus: Some(liquid.CommitmentStatusConfirmed), + NewStatus: None[liquid.CommitmentStatus](), + ExpiresAt: expiresAt, + }, + }, + }, + }, + }, + }, + } + slog.Info("batch check: deleting confirmed commitments", "resource", resourceName, "uuidA", uuidA, "uuidB", uuidB) + if reason := e2eSendChangeCommitments(ctx, baseURL, req); reason != "" { + panic(fmt.Sprintf("batch check: cleanup of confirmed commitments %s/%s rejected: %s", uuidA, uuidB, reason)) + } + slog.Info("batch check: confirmed commitments deleted", "resource", resourceName, "uuidA", uuidA, "uuidB", uuidB) + } +} + +// e2eFetchUsageReport calls POST /commitments/v1/projects/:id/report-usage, decodes the response, +// and returns it. Panics on HTTP errors or decode failures. +func e2eFetchUsageReport(ctx context.Context, baseURL string, az liquid.AvailabilityZone, projectID liquid.ProjectUUID) liquid.ServiceUsageReport { usageReq := liquid.ServiceUsageRequest{AllAZs: []liquid.AvailabilityZone{az}} body := must.Return(json.Marshal(usageReq)) url := fmt.Sprintf("%s/commitments/v1/projects/%s/report-usage", baseURL, projectID) @@ -214,7 +466,46 @@ func e2eCheckUsageAPI(ctx context.Context, baseURL string, az liquid.Availabilit bodyBytes := must.Return(io.ReadAll(resp.Body)) panic(fmt.Sprintf("usage API returned %d: %s", resp.StatusCode, bodyBytes)) } - slog.Info("round-trip check: usage API returned 200", "project", projectID) + var report liquid.ServiceUsageReport + if err := json.NewDecoder(resp.Body).Decode(&report); err != nil { + panic(fmt.Sprintf("failed to decode ServiceUsageReport: %v", err)) + } + return report +} + +// e2eLogUsageReport logs the usage summary from a ServiceUsageReport. +// For each resource it logs the total usage and, for subresources with attributes, +// counts committed (commitment_id present) vs PAYG (commitment_id absent) instances. +func e2eLogUsageReport(report liquid.ServiceUsageReport, az liquid.AvailabilityZone, projectID liquid.ProjectUUID) { + for resourceName, resReport := range report.Resources { + if resReport == nil { + continue + } + azReport := resReport.PerAZ[az] + if azReport == nil { + continue + } + committed := 0 + payg := 0 + for _, sub := range azReport.Subresources { + if len(sub.Attributes) == 0 { + continue + } + var attrs map[string]any + if err := json.Unmarshal(sub.Attributes, &attrs); err == nil && attrs["commitment_id"] != nil { + committed++ + } else { + payg++ + } + } + slog.Info("usage report", + "project", projectID, "az", az, "resource", resourceName, + "usage", azReport.Usage, + "subresources", len(azReport.Subresources), + "committed", committed, + "payg", payg, + ) + } } // e2eSendChangeCommitments sends a change-commitments request. 
@@ -273,5 +564,6 @@ func RunCommitmentsE2EChecks(ctx context.Context, config E2EChecksConfig) { slog.Info("running commitments e2e checks") CheckCommitmentsInfoEndpoint(ctx, config) CheckCommitmentsRoundTrip(ctx, config) + CheckCommitmentsMultiFlavorGroupBatch(ctx, config) slog.Info("all commitments e2e checks passed") } diff --git a/internal/scheduling/reservations/commitments/field_index.go b/internal/scheduling/reservations/commitments/field_index.go index 40760655d..5bcbefdc3 100644 --- a/internal/scheduling/reservations/commitments/field_index.go +++ b/internal/scheduling/reservations/commitments/field_index.go @@ -6,6 +6,7 @@ package commitments import ( "context" "errors" + "sync" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/pkg/multicluster" @@ -14,50 +15,89 @@ import ( ) const idxCommittedResourceByUUID = "spec.commitmentUUID" +const idxCommittedResourceByProjectID = "spec.projectID" const idxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID" -// IndexFields registers field indexes required by the CommittedResource controller. -func IndexFields(ctx context.Context, mcl *multicluster.Client) error { - log := logf.FromContext(ctx) - log.Info("Setting up field indexes for the CommittedResource controller") - if err := mcl.IndexField(ctx, - &v1alpha1.CommittedResource{}, - &v1alpha1.CommittedResourceList{}, - idxCommittedResourceByUUID, - func(obj client.Object) []string { - cr, ok := obj.(*v1alpha1.CommittedResource) - if !ok { - log.Error(errors.New("unexpected type"), "expected CommittedResource", "object", obj) - return nil - } - if cr.Spec.CommitmentUUID == "" { - return nil - } - return []string{cr.Spec.CommitmentUUID} - }, - ); err != nil { - log.Error(err, "failed to set up index for commitmentUUID") - return err - } - if err := mcl.IndexField(ctx, - &v1alpha1.Reservation{}, - &v1alpha1.ReservationList{}, - idxReservationByCommitmentUUID, - func(obj client.Object) []string { - res, ok := obj.(*v1alpha1.Reservation) - if !ok { - log.Error(errors.New("unexpected type"), "expected Reservation", "object", obj) - return nil - } - if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { - return nil - } - return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} - }, - ); err != nil { - log.Error(err, "failed to set up index for reservation commitmentUUID") - return err - } - log.Info("Successfully set up field indexes") - return nil +// once guards ensure each field index is registered exactly once. +// Both CommittedResourceController and UsageReconciler call indexCommittedResourceByUUID; +// the underlying cache returns "indexer conflict" on double registration. +var ( + onceIndexCRByUUID sync.Once + onceIndexCRByProjectID sync.Once + onceIndexReservationByUUID sync.Once +) + +// indexCommittedResourceByUUID registers the index used by UsageReconciler to look up +// CommittedResources by their CommitmentUUID. 
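+// It is also registered by CommittedResourceController.SetupWithManager, whose Reservation
+// watch handler resolves Reservation→CR through this index; the once-guard above makes the
+// double registration safe.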
+func indexCommittedResourceByUUID(ctx context.Context, mcl *multicluster.Client) (err error) { + onceIndexCRByUUID.Do(func() { + log := logf.FromContext(ctx) + err = mcl.IndexField(ctx, + &v1alpha1.CommittedResource{}, + &v1alpha1.CommittedResourceList{}, + idxCommittedResourceByUUID, + func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok { + log.Error(errors.New("unexpected type"), "expected CommittedResource", "object", obj) + return nil + } + if cr.Spec.CommitmentUUID == "" { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }, + ) + }) + return err +} + +// indexCommittedResourceByProjectID registers the index used to look up CommittedResources +// by their project ID, avoiding full-cluster scans when filtering per project. +func indexCommittedResourceByProjectID(ctx context.Context, mcl *multicluster.Client) (err error) { + onceIndexCRByProjectID.Do(func() { + log := logf.FromContext(ctx) + err = mcl.IndexField(ctx, + &v1alpha1.CommittedResource{}, + &v1alpha1.CommittedResourceList{}, + idxCommittedResourceByProjectID, + func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok { + log.Error(errors.New("unexpected type"), "expected CommittedResource", "object", obj) + return nil + } + if cr.Spec.ProjectID == "" { + return nil + } + return []string{cr.Spec.ProjectID} + }, + ) + }) + return err +} + +// indexReservationByCommitmentUUID registers the index used by CommittedResourceController to +// look up child Reservations by their CommitmentUUID. +func indexReservationByCommitmentUUID(ctx context.Context, mcl *multicluster.Client) (err error) { + onceIndexReservationByUUID.Do(func() { + log := logf.FromContext(ctx) + err = mcl.IndexField(ctx, + &v1alpha1.Reservation{}, + &v1alpha1.ReservationList{}, + idxReservationByCommitmentUUID, + func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok { + log.Error(errors.New("unexpected type"), "expected Reservation", "object", obj) + return nil + } + if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }, + ) + }) + return err } diff --git a/internal/scheduling/reservations/commitments/integration_test.go b/internal/scheduling/reservations/commitments/integration_test.go index e89e2adb1..857cf2734 100644 --- a/internal/scheduling/reservations/commitments/integration_test.go +++ b/internal/scheduling/reservations/commitments/integration_test.go @@ -454,7 +454,7 @@ func (e *intgEnv) reconcileChildReservations(t *testing.T, crName string) { // condition or the 5 s deadline is reached. // // One pass: -// 1. CR controller (adds finalizer / creates Reservation CRDs / handles inactive states) +// 1. CR controller (creates Reservation CRDs / handles inactive states) // 2. Reservation controller ×2 per slot (first call sets TargetHost, second sets Ready=True) // 3. 
CR controller again (picks up placement outcomes: Accepted or Rejected) func intgDriveToTerminal(t *testing.T, env *intgEnv, crNames []string) { @@ -528,9 +528,6 @@ func intgDriveToTerminal(t *testing.T, env *intgEnv, crNames []string) { } func intgIsTerminalCR(cr v1alpha1.CommittedResource) bool { - if !cr.DeletionTimestamp.IsZero() { - return false // needs one more reconcile to remove its finalizer - } cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) if cond == nil { return false @@ -696,7 +693,7 @@ func TestCRLifecycle(t *testing.T) { t.Fatalf("create CR: %v", err) } - // Reconcile as planned: finalizer added, no Reservations. + // Reconcile as planned: no Reservations created. env.reconcileCR(t, cr.Name) env.reconcileCR(t, cr.Name) if got := env.listChildReservations(t, cr.Name); len(got) != 0 { @@ -738,8 +735,8 @@ func TestCRLifecycle(t *testing.T) { } // Bring to confirmed+Ready=True. - env.reconcileCR(t, cr.Name) // adds finalizer env.reconcileCR(t, cr.Name) // creates Reservations + env.reconcileCR(t, cr.Name) // picks up Reservation outcomes env.reconcileChildReservations(t, cr.Name) // places slots → Ready=True if got := env.listChildReservations(t, cr.Name); len(got) != 1 { @@ -810,58 +807,6 @@ func TestCRLifecycle(t *testing.T) { } }) - t.Run("deletion: finalizer removed, child Reservations cleaned up", func(t *testing.T) { - env := newDefaultIntgEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - // Pre-create a child Reservation to verify it gets cleaned up on deletion. - // newTestCommittedResource pre-populates the finalizer, so Delete() immediately sets DeletionTimestamp. 
- child := &v1alpha1.Reservation{ - ObjectMeta: metav1.ObjectMeta{ - Name: "my-cr-0", - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }, - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - CommitmentUUID: "test-uuid-1234", - }, - }, - } - if err := env.k8sClient.Create(context.Background(), child); err != nil { - t.Fatalf("create child reservation: %v", err) - } - - crState := env.getCR(t, cr.Name) - if err := env.k8sClient.Delete(context.Background(), &crState); err != nil { - t.Fatalf("delete CR: %v", err) - } - env.reconcileCR(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Errorf("post-deletion: expected 0 reservations, got %d", len(got)) - } - var final v1alpha1.CommittedResource - err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final) - if client.IgnoreNotFound(err) != nil { - t.Fatalf("unexpected error after deletion: %v", err) - } - if err == nil { - for _, f := range final.Finalizers { - if f == crFinalizer { - t.Errorf("finalizer not removed after deletion reconcile") - } - } - } - }) - t.Run("confirmed→superseded: child Reservations deleted, CR marked inactive", func(t *testing.T) { env := newDefaultIntgEnv(t) defer env.close() @@ -1031,14 +976,12 @@ func TestCRLifecycle(t *testing.T) { if crState.Status.AcceptedAt == nil { t.Errorf("expected AcceptedAt to be set on acceptance") } - if crState.Status.AcceptedAmount == nil { - t.Errorf("expected AcceptedAmount to be set on acceptance") - } else if crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { - t.Errorf("AcceptedAmount: want 4Gi, got %s", crState.Status.AcceptedAmount.String()) + if crState.Status.AcceptedSpec == nil || crState.Status.AcceptedSpec.Amount.Cmp(resource.MustParse("4Gi")) != 0 { + t.Errorf("AcceptedSpec.Amount: want 4Gi, got %v", crState.Status.AcceptedSpec) } }) - t.Run("resize failure: rolls back to AcceptedAmount, prior slot preserved", func(t *testing.T) { + t.Run("resize failure: rolls back to AcceptedSpec, prior slot preserved", func(t *testing.T) { // Scheduler: accepts the first placement call (initial 4 GiB slot), rejects all subsequent. objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} env := newIntgEnv(t, objects, intgAcceptFirstScheduler(1)) @@ -1058,8 +1001,8 @@ func TestCRLifecycle(t *testing.T) { if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { t.Fatalf("phase 1: expected CR to be Ready=True after initial placement") } - if crState.Status.AcceptedAmount == nil || crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { - t.Fatalf("phase 1: AcceptedAmount must be 4Gi, got %v", crState.Status.AcceptedAmount) + if crState.Status.AcceptedSpec == nil || crState.Status.AcceptedSpec.Amount.Cmp(resource.MustParse("4Gi")) != 0 { + t.Fatalf("phase 1: AcceptedSpec.Amount must be 4Gi, got %v", crState.Status.AcceptedSpec) } // Phase 2: resize to 8 GiB (needs 2 slots). Scheduler has no more accepts. 
@@ -1096,7 +1039,7 @@ func TestCRLifecycle(t *testing.T) { t.Fatalf("list reservations: %v", err) } if len(finalList.Items) != 1 { - t.Errorf("resize rollback: want 1 slot (AcceptedAmount), got %d", len(finalList.Items)) + t.Errorf("resize rollback: want 1 slot (AcceptedSpec), got %d", len(finalList.Items)) } intgAssertCRCondition(t, env.k8sClient, []string{cr.Name}, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) }) diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 6d70bcd20..3e0d7dbc6 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -109,19 +109,24 @@ func (m *ReservationManager) ApplyCommitmentState( nextSlotIndex := GetNextSlotIndex(existing) - // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) + // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group, project, or AZ). + // AZ is included because the reservation is pinned to a specific host; changing AZ requires + // re-placement on a host in the new AZ — patching the spec field in place is not sufficient. // They will be recreated with correct metadata in subsequent phases. var validReservations []v1alpha1.Reservation for _, res := range existing { if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || - res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { - log.Info("Found a reservation with wrong flavor group or project, delete and recreate afterward", + res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID || + res.Spec.AvailabilityZone != desiredState.AvailabilityZone { + log.Info("Found a reservation with wrong flavor group, project, or AZ, delete and recreate afterward", "commitmentUUID", desiredState.CommitmentUUID, "name", res.Name, "expectedFlavorGroup", desiredState.FlavorGroupName, "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, "expectedProjectID", desiredState.ProjectID, - "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) + "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID, + "expectedAZ", desiredState.AvailabilityZone, + "actualAZ", res.Spec.AvailabilityZone) result.Repaired++ result.RemovedReservations = append(result.RemovedReservations, res) memValue := res.Spec.Resources[hv1.ResourceMemory] @@ -227,9 +232,11 @@ func (m *ReservationManager) syncReservationMetadata( state *CommitmentState, ) (*v1alpha1.Reservation, error) { - // if any of CommitmentUUID, AZ, StarTime, EndTime differ from desired state, need to patch + // if any of CommitmentUUID, DomainID, StartTime, EndTime, ParentGeneration differ from desired state, need to patch. + // AvailabilityZone is intentionally excluded: an AZ mismatch is handled in Phase 3 (delete + recreate) + // because the reservation is pinned to a host and cannot simply be patched to a different AZ. 
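+	// (Patching the AZ in place would leave a stale TargetHost from the old AZ; only the
+	// Phase 3 delete+recreate clears the host and forces a fresh scheduler placement.)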
if (state.CommitmentUUID != "" && reservation.Spec.CommittedResourceReservation.CommitmentUUID != state.CommitmentUUID) || - (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) || + (state.DomainID != "" && reservation.Spec.CommittedResourceReservation.DomainID != state.DomainID) || (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) || (state.ParentGeneration != 0 && reservation.Spec.CommittedResourceReservation.ParentGeneration != state.ParentGeneration) { @@ -243,13 +250,12 @@ func (m *ReservationManager) syncReservationMetadata( if state.CommitmentUUID != "" { reservation.Spec.CommittedResourceReservation.CommitmentUUID = state.CommitmentUUID } + if state.DomainID != "" { + reservation.Spec.CommittedResourceReservation.DomainID = state.DomainID + } if state.ParentGeneration != 0 { reservation.Spec.CommittedResourceReservation.ParentGeneration = state.ParentGeneration } - - if state.AvailabilityZone != "" { - reservation.Spec.AvailabilityZone = state.AvailabilityZone - } if state.StartTime != nil { reservation.Spec.StartTime = &metav1.Time{Time: *state.StartTime} } diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index bb2fdaf52..e2ce2f26c 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -45,6 +45,20 @@ func newTestCRSlot(name string, memGiB int64, targetHost, resourceGroup string, } } +// withAZ returns a copy of the reservation with the given availability zone set. +func withAZ(res v1alpha1.Reservation, az string) v1alpha1.Reservation { + res.Spec.AvailabilityZone = az + return res +} + +// withDomainID returns a copy of the reservation with the given domain ID set. +func withDomainID(res v1alpha1.Reservation, domainID string) v1alpha1.Reservation { + spec := *res.Spec.CommittedResourceReservation + spec.DomainID = domainID + res.Spec.CommittedResourceReservation = &spec + return res +} + // testFlavorGroups returns the default flavor groups map used across tests. func testFlavorGroups() map[string]compute.FlavorGroupFeature { return map[string]compute.FlavorGroupFeature{"test-group": testFlavorGroup()} @@ -59,6 +73,8 @@ func TestApplyCommitmentState(t *testing.T) { name string existingSlots []v1alpha1.Reservation desiredMemoryGiB int64 + desiredAZ string + desiredDomainID string flavorGroupOverride map[string]compute.FlavorGroupFeature // nil = testFlavorGroups() wantError bool wantRemovedCount int // exact count; -1 = at least one @@ -210,6 +226,97 @@ func TestApplyCommitmentState(t *testing.T) { } }, }, + // ---------------------------------------------------------------- + // AZ change must delete+recreate (re-placement required) + // ---------------------------------------------------------------- + { + // Bug: before the fix, AZ change was handled by syncReservationMetadata which + // patched the spec in place, leaving the reservation pinned to a host in the wrong AZ. 
+ name: "AZ change on placed reservation triggers delete and recreate", + existingSlots: []v1alpha1.Reservation{ + withAZ(newTestCRSlot("commitment-abc123-0", 8, "host-1", "test-group", nil), "az-old"), + }, + desiredMemoryGiB: 8, + desiredAZ: "az-new", + wantRemovedCount: 1, + validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) { + if got := removed[0].Spec.AvailabilityZone; got != "az-old" { + t.Errorf("expected removed slot to have AZ az-old, got %q", got) + } + }, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 1 { + t.Fatalf("expected 1 remaining slot, got %d", len(remaining)) + } + r := remaining[0] + if got := r.Spec.AvailabilityZone; got != "az-new" { + t.Errorf("expected recreated slot to have AZ az-new, got %q", got) + } + if r.Spec.TargetHost != "" { + t.Errorf("expected recreated slot to have no TargetHost (pending re-placement), got %q", r.Spec.TargetHost) + } + }, + }, + { + name: "AZ change on unplaced reservation also triggers delete and recreate", + existingSlots: []v1alpha1.Reservation{ + withAZ(newTestCRSlot("commitment-abc123-0", 8, "", "test-group", nil), "az-old"), + }, + desiredMemoryGiB: 8, + desiredAZ: "az-new", + wantRemovedCount: 1, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 1 { + t.Fatalf("expected 1 remaining slot, got %d", len(remaining)) + } + if got := remaining[0].Spec.AvailabilityZone; got != "az-new" { + t.Errorf("expected recreated slot to have AZ az-new, got %q", got) + } + }, + }, + { + name: "matching AZ does not trigger delete", + existingSlots: []v1alpha1.Reservation{ + withAZ(newTestCRSlot("commitment-abc123-0", 8, "host-1", "test-group", nil), "az-1"), + }, + desiredMemoryGiB: 8, + desiredAZ: "az-1", + wantRemovedCount: 0, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 1 { + t.Fatalf("expected 1 remaining, got %d", len(remaining)) + } + if remaining[0].Spec.TargetHost != "host-1" { + t.Errorf("expected host-1 to be preserved, got %q", remaining[0].Spec.TargetHost) + } + }, + }, + // ---------------------------------------------------------------- + // DomainID change must be synced in place (no re-placement) + // ---------------------------------------------------------------- + { + // Bug: before the fix, DomainID was never synced — existing reservations silently + // kept stale domain metadata if Limes updated project information. 
+ name: "DomainID change is synced in place without re-placement", + existingSlots: []v1alpha1.Reservation{ + withDomainID(newTestCRSlot("commitment-abc123-0", 8, "host-1", "test-group", nil), "domain-old"), + }, + desiredMemoryGiB: 8, + desiredDomainID: "domain-new", + wantRemovedCount: 0, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 1 { + t.Fatalf("expected 1 remaining, got %d", len(remaining)) + } + r := remaining[0] + if got := r.Spec.CommittedResourceReservation.DomainID; got != "domain-new" { + t.Errorf("expected DomainID domain-new, got %q", got) + } + if r.Spec.TargetHost != "host-1" { + t.Errorf("expected host-1 to be preserved (no re-placement), got %q", r.Spec.TargetHost) + } + }, + }, } scheme := newCRTestScheme(t) @@ -232,6 +339,8 @@ func TestApplyCommitmentState(t *testing.T) { ProjectID: "project-1", FlavorGroupName: "test-group", TotalMemoryBytes: tt.desiredMemoryGiB * 1024 * 1024 * 1024, + AvailabilityZone: tt.desiredAZ, + DomainID: tt.desiredDomainID, } applyResult, err := manager.ApplyCommitmentState( diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 8d3a43adf..7c96823d0 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -5,6 +5,7 @@ package commitments import ( "context" + "errors" "fmt" "time" @@ -22,6 +23,10 @@ import ( var ( // CreatorValue identifies reservations created by this syncer. CreatorValue = "commitments-syncer" + + // errAZChanged is a sentinel returned from CreateOrUpdate mutateFns when the existing CR's + // AZ differs from the desired state. The caller logs an error and skips the CR. + errAZChanged = errors.New("availability zone changed") ) type SyncerConfig struct { @@ -30,7 +35,7 @@ type SyncerConfig struct { // Secret ref to SSO credentials stored in a k8s secret, if applicable. SSOSecretRef *corev1.SecretReference `json:"ssoSecretRef"` // SyncInterval defines how often the syncer reconciles Limes commitments to Reservation CRDs. - SyncInterval time.Duration `json:"committedResourceSyncInterval"` + SyncInterval metav1.Duration `json:"committedResourceSyncInterval"` } type Syncer struct { @@ -53,7 +58,7 @@ func NewSyncer(k8sClient client.Client, monitor *SyncerMonitor) *Syncer { } func (s *Syncer) Init(ctx context.Context, config SyncerConfig) error { - s.syncInterval = config.SyncInterval + s.syncInterval = config.SyncInterval.Duration if err := s.CommitmentsClient.Init(ctx, s.Client, config); err != nil { return err } @@ -291,8 +296,8 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { // Count CommittedResource CRDs present locally but absent from Limes (do not delete — Limes // responses may be transient and deleting active CRDs would drop Reservation slots). - // Also GC CRDs whose EndTime has passed: the commitment is over, the controller's finalizer - // will clean up child Reservations on deletion. + // Also GC CRDs whose EndTime has passed: the commitment is over, child Reservations will be + // cleaned up by the syncer's orphan GC on the next sync cycle. 
var existingCRs v1alpha1.CommittedResourceList if err := s.List(ctx, &existingCRs); err != nil { logger.Error(err, "failed to list existing committed resource CRDs") @@ -392,7 +397,8 @@ func (s *Syncer) applyCommittedResourceSpec(cr *v1alpha1.CommittedResource, stat cr.Spec.ProjectID = state.ProjectID cr.Spec.DomainID = state.DomainID cr.Spec.State = state.State - cr.Spec.AllowRejection = false + // AllowRejection is not set here: the API path (applyCRSpec) sets it explicitly, + // and callers that go through CreateOrUpdate preserve the existing value. if state.StartTime != nil { t := metav1.NewTime(*state.StartTime) @@ -413,9 +419,25 @@ func (s *Syncer) upsertCommittedResource(ctx context.Context, logger logr.Logger cr.Name = "commitment-" + state.CommitmentUUID op, err := controllerutil.CreateOrUpdate(ctx, s.Client, cr, func() error { + if cr.Spec.AvailabilityZone != "" && cr.Spec.AvailabilityZone != state.AvailabilityZone { + return errAZChanged + } + // AllowRejection is an API execution flag, not a Limes commitment property. + // Preserve the existing value so a syncer write never clobbers an in-flight + // change-commitments request. For new CRDs the zero value (false) is correct. + allowRejection := cr.Spec.AllowRejection s.applyCommittedResourceSpec(cr, state) + cr.Spec.AllowRejection = allowRejection return nil }) + if errors.Is(err, errAZChanged) { + logger.Error(nil, "availability zone mismatch on existing commitment — skipping sync", + "commitmentUUID", state.CommitmentUUID, + "currentAZ", cr.Spec.AvailabilityZone, + "requestedAZ", state.AvailabilityZone, + ) + return controllerutil.OperationResultNone, nil + } if err != nil { return op, err } @@ -438,6 +460,14 @@ func (s *Syncer) updateCommittedResourceIfExists(ctx context.Context, logger log } return controllerutil.OperationResultNone, err } + if cr.Spec.AvailabilityZone != "" && cr.Spec.AvailabilityZone != state.AvailabilityZone { + logger.Error(nil, "availability zone mismatch on existing commitment — skipping sync", + "commitmentUUID", state.CommitmentUUID, + "currentAZ", cr.Spec.AvailabilityZone, + "requestedAZ", state.AvailabilityZone, + ) + return controllerutil.OperationResultNone, nil + } s.applyCommittedResourceSpec(cr, state) if err := s.Update(ctx, cr); err != nil { return controllerutil.OperationResultNone, err diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index d634fc2a0..a9333fdaa 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ b/internal/scheduling/reservations/commitments/usage.go @@ -30,7 +30,7 @@ type UsageDBClient interface { ListProjectVMs(ctx context.Context, projectID string) ([]VMRow, error) } -// VMRow is the result of a joined server+flavor query from Postgres. +// VMRow is the result of a joined server+flavor+image query from Postgres. type VMRow struct { ID string Name string @@ -43,6 +43,7 @@ type VMRow struct { FlavorVCPUs uint64 FlavorDisk uint64 FlavorExtras string // JSON string of flavor extra_specs + OSType string // pre-computed from Glance image properties; "unknown" when not found } // CommitmentStateWithUsage extends CommitmentState with usage tracking for billing calculations. 
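The AZ-mismatch guard in upsertCommittedResource above relies on a controllerutil detail: when the mutate function passed to CreateOrUpdate returns an error, controller-runtime aborts before any API write and returns that error unchanged, so a package-level sentinel survives an errors.Is check at the call site. A minimal sketch of the pattern, with a hypothetical errImmutable sentinel and widget object standing in for errAZChanged and the CommittedResource:

	var errImmutable = errors.New("immutable field changed")

	op, err := controllerutil.CreateOrUpdate(ctx, k8sClient, widget, func() error {
		if widget.Spec.Zone != "" && widget.Spec.Zone != desired.Zone {
			return errImmutable // aborts CreateOrUpdate; nothing is written
		}
		widget.Spec.Replicas = desired.Replicas // normal sync path
		return nil
	})
	if errors.Is(err, errImmutable) {
		log.Error(nil, "zone changed on existing widget, skipping")
		return controllerutil.OperationResultNone, nil
	}
	return op, err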
@@ -51,8 +52,10 @@ type CommitmentStateWithUsage struct { CommitmentState // RemainingMemoryBytes is the uncommitted capacity left for VM assignment RemainingMemoryBytes int64 - // AssignedVMs tracks which VMs have been assigned to this commitment - AssignedVMs []string + // AssignedInstances tracks which VM instances have been assigned to this commitment + AssignedInstances []string + // UsedVCPUs is the total vCPU count of assigned VM instances + UsedVCPUs int64 } // NewCommitmentStateWithUsage creates a CommitmentStateWithUsage from a CommitmentState. @@ -60,16 +63,17 @@ func NewCommitmentStateWithUsage(state *CommitmentState) *CommitmentStateWithUsa return &CommitmentStateWithUsage{ CommitmentState: *state, RemainingMemoryBytes: state.TotalMemoryBytes, - AssignedVMs: []string{}, + AssignedInstances: []string{}, } } // AssignVM attempts to assign a VM to this commitment if there's enough capacity. // Returns true if the VM was assigned, false if not enough capacity. -func (c *CommitmentStateWithUsage) AssignVM(vmUUID string, vmMemoryBytes int64) bool { +func (c *CommitmentStateWithUsage) AssignVM(vmUUID string, vmMemoryBytes, vCPUs int64) bool { if c.RemainingMemoryBytes >= vmMemoryBytes { c.RemainingMemoryBytes -= vmMemoryBytes - c.AssignedVMs = append(c.AssignedVMs, vmUUID) + c.UsedVCPUs += vCPUs + c.AssignedInstances = append(c.AssignedInstances, vmUUID) return true } return false @@ -92,6 +96,7 @@ type VMUsageInfo struct { VCPUs uint64 DiskGB uint64 VideoRAMMiB *uint64 // optional, from flavor extra_specs hw_video:ram_max_mb + OSType string // pre-computed from Glance image; "unknown" for volume-booted or unmapped images AZ string Hypervisor string CreatedAt time.Time @@ -113,96 +118,111 @@ func NewUsageCalculator(client client.Client, usageDB UsageDBClient) *UsageCalcu } // CalculateUsage computes the usage report for a specific project. +// VM-to-commitment assignment is read from CommittedResource CRD status (pre-computed by the +// UsageReconciler). If a CR has no usage status yet, its VMs appear as PAYG until the first +// reconcile completes (within one CooldownInterval). 
func (c *UsageCalculator) CalculateUsage( ctx context.Context, log logr.Logger, projectID string, allAZs []liquid.AvailabilityZone, ) (liquid.ServiceUsageReport, error) { - // Step 1: Get flavor groups from knowledge + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { return liquid.ServiceUsageReport{}, fmt.Errorf("failed to get flavor groups: %w", err) } - // Get info version from Knowledge CRD (used by Limes to detect metadata changes) var infoVersion int64 = -1 if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { infoVersion = knowledgeCRD.Status.LastContentChange.Unix() } - // Step 2: Build commitment capacity map from K8s Reservation CRDs - commitmentsByAZFlavorGroup, err := c.buildCommitmentCapacityMap(ctx, log, projectID) + vmAssignments, err := c.BuildVMAssignmentsFromStatus(ctx, projectID) if err != nil { - return liquid.ServiceUsageReport{}, fmt.Errorf("failed to build commitment capacity map: %w", err) + return liquid.ServiceUsageReport{}, fmt.Errorf("failed to read VM assignments from CRD status: %w", err) } - // Step 3: Get and sort VMs for the project - vms, err := c.getProjectVMs(ctx, log, projectID, flavorGroups, allAZs) + vms, err := getProjectVMs(ctx, c.usageDB, log, projectID, flavorGroups, allAZs) if err != nil { return liquid.ServiceUsageReport{}, fmt.Errorf("failed to get project VMs: %w", err) } - sortVMsForUsageCalculation(vms) - - // Step 4: Assign VMs to commitments - vmAssignments, assignedToCommitments := c.assignVMsToCommitments(vms, commitmentsByAZFlavorGroup) - // Step 5: Build the response report := c.buildUsageResponse(vms, vmAssignments, flavorGroups, allAZs, infoVersion) + assignedToCommitments := 0 + for _, vm := range vms { + if vmAssignments[vm.UUID] != "" { + assignedToCommitments++ + } + } log.Info("completed usage report", "projectID", projectID, "vmCount", len(vms), "assignedToCommitments", assignedToCommitments, "payg", len(vms)-assignedToCommitments, - "commitments", countCommitmentStates(commitmentsByAZFlavorGroup), "resources", len(report.Resources)) return report, nil } +// BuildVMAssignmentsFromStatus reads pre-computed VM-to-commitment assignments from +// CommittedResource CRD status. Returns a map of vmUUID → commitmentUUID (empty string = PAYG). +// This is the read path that replaces the inline assignment algorithm in the usage API. +func (c *UsageCalculator) BuildVMAssignmentsFromStatus(ctx context.Context, projectID string) (map[string]string, error) { + var crList v1alpha1.CommittedResourceList + if err := c.client.List(ctx, &crList, client.MatchingFields{idxCommittedResourceByProjectID: projectID}); err != nil { + return nil, fmt.Errorf("failed to list CommittedResources: %w", err) + } + assignments := make(map[string]string) + for _, cr := range crList.Items { + for _, vmUUID := range cr.Status.AssignedInstances { + assignments[vmUUID] = cr.Spec.CommitmentUUID + } + } + return assignments, nil +} + // azFlavorGroupKey creates a deterministic key for az:flavorGroup lookups. func azFlavorGroupKey(az, flavorGroup string) string { return az + ":" + flavorGroup } -// buildCommitmentCapacityMap retrieves all CR reservations for a project and builds -// a map of az:flavorGroup -> list of CommitmentStateWithUsage, sorted for deterministic assignment. 
-func (c *UsageCalculator) buildCommitmentCapacityMap( +// buildCommitmentCapacityMap builds a map of az:flavorGroup -> list of CommitmentStateWithUsage +// from CommittedResource CRD status (AcceptedSpec). Using the accepted spec snapshot gives the +// billing-perspective capacity — what was confirmed — rather than the potentially-mutated current spec. +func buildCommitmentCapacityMap( ctx context.Context, + k8sClient client.Client, log logr.Logger, projectID string, ) (map[string][]*CommitmentStateWithUsage, error) { - // List all committed resource reservations - var allReservations v1alpha1.ReservationList - if err := c.client.List(ctx, &allReservations, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }); err != nil { - return nil, fmt.Errorf("failed to list reservations: %w", err) + + var allCRs v1alpha1.CommittedResourceList + if err := k8sClient.List(ctx, &allCRs, client.MatchingFields{idxCommittedResourceByProjectID: projectID}); err != nil { + return nil, fmt.Errorf("failed to list CommittedResources: %w", err) } - // Group reservations by commitment UUID, filtering by project - reservationsByCommitment := make(map[string][]v1alpha1.Reservation) - for _, res := range allReservations.Items { - if res.Spec.CommittedResourceReservation == nil { + now := time.Now() + result := make(map[string][]*CommitmentStateWithUsage) + for _, cr := range allCRs.Items { + if cr.Status.AcceptedSpec == nil { + log.V(1).Info("skipping CR with no accepted spec", "cr", cr.Name) continue } - if res.Spec.CommittedResourceReservation.ProjectID != projectID { + // Use AcceptedSpec.State so sibling CRs whose spec is mid-transition (e.g. syncer just + // wrote expired before the CR controller accepted it) don't lose capacity prematurely. + if cr.Status.AcceptedSpec.State != v1alpha1.CommitmentStatusConfirmed && cr.Status.AcceptedSpec.State != v1alpha1.CommitmentStatusGuaranteed { continue } - commitmentUUID := res.Spec.CommittedResourceReservation.CommitmentUUID - reservationsByCommitment[commitmentUUID] = append(reservationsByCommitment[commitmentUUID], res) - } - // Build CommitmentState for each commitment and group by az:flavorGroup - // Only include commitments that are currently active (started and not expired) - now := time.Now() - result := make(map[string][]*CommitmentStateWithUsage) - for _, reservations := range reservationsByCommitment { - state, err := FromReservations(reservations) + // Build state from the accepted spec snapshot so capacity always reflects + // what was confirmed, not the potentially-mutated current spec. + tempCR := v1alpha1.CommittedResource{Spec: *cr.Status.AcceptedSpec} + state, err := FromCommittedResource(tempCR) if err != nil { - log.Error(err, "failed to build commitment state from reservations") + log.Error(err, "skipping CR with invalid accepted spec", "cr", cr.Name) continue } @@ -236,19 +256,20 @@ func (c *UsageCalculator) buildCommitmentCapacityMap( } // getProjectVMs retrieves all VMs for a project from Postgres and enriches them with flavor group info. 
-func (c *UsageCalculator) getProjectVMs( +func getProjectVMs( ctx context.Context, + usageDB UsageDBClient, log logr.Logger, projectID string, flavorGroups map[string]compute.FlavorGroupFeature, allAZs []liquid.AvailabilityZone, ) ([]VMUsageInfo, error) { - if c.usageDB == nil { + if usageDB == nil { return nil, errors.New("usage DB client not configured") } - rows, err := c.usageDB.ListProjectVMs(ctx, projectID) + rows, err := usageDB.ListProjectVMs(ctx, projectID) if err != nil { return nil, fmt.Errorf("failed to list VMs from Postgres: %w", err) } @@ -313,6 +334,7 @@ func (c *UsageCalculator) getProjectVMs( VCPUs: row.FlavorVCPUs, DiskGB: row.FlavorDisk, VideoRAMMiB: videoRAMMiB, + OSType: row.OSType, AZ: string(normalizedAZ), Hypervisor: row.Hypervisor, CreatedAt: createdAt, @@ -373,38 +395,22 @@ func sortCommitmentsForAssignment(commitments []*CommitmentStateWithUsage) { } // assignVMsToCommitments assigns VMs to commitments based on az:flavorGroup matching. -// Returns a map of vmUUID -> commitmentUUID (empty string for PAYG VMs) and count of assigned VMs. -func (c *UsageCalculator) assignVMsToCommitments( +// Mutates each CommitmentStateWithUsage in place: AssignedInstances and RemainingMemoryBytes are updated. +// VMs that don't fit any commitment are left unassigned (PAYG). +func assignVMsToCommitments( vms []VMUsageInfo, commitmentsByAZFlavorGroup map[string][]*CommitmentStateWithUsage, -) (vmAssignments map[string]string, assignedCount int) { - - vmAssignments = make(map[string]string, len(vms)) +) { for _, vm := range vms { key := azFlavorGroupKey(vm.AZ, vm.FlavorGroup) - commitments := commitmentsByAZFlavorGroup[key] - - vmMemoryBytes := int64(vm.MemoryMB) * 1024 * 1024 //nolint:gosec // VM memory from Nova, realistically bounded - assigned := false - - // Try to assign to first commitment with remaining capacity - for _, commitment := range commitments { - if commitment.AssignVM(vm.UUID, vmMemoryBytes) { - vmAssignments[vm.UUID] = commitment.CommitmentUUID - assigned = true - assignedCount++ + for _, commitment := range commitmentsByAZFlavorGroup[key] { + vmMemoryBytes := int64(vm.MemoryMB) * 1024 * 1024 //nolint:gosec // VM memory from Nova, realistically bounded + if commitment.AssignVM(vm.UUID, vmMemoryBytes, int64(vm.VCPUs)) { //nolint:gosec // VCPUs from Nova, realistically bounded break } } - - if !assigned { - // PAYG - no commitment assignment - vmAssignments[vm.UUID] = "" - } } - - return vmAssignments, assignedCount } // azUsageData aggregates usage data for a specific flavor group and AZ. @@ -457,6 +463,7 @@ func (c *UsageCalculator) buildUsageResponse( subresource, err := liquid.SubresourceBuilder[map[string]any]{ ID: vm.UUID, + Name: vm.Name, Attributes: attributes, }.Finalize() if err != nil { @@ -471,22 +478,33 @@ func (c *UsageCalculator) buildUsageResponse( } // Build ResourceUsageReport for all flavor groups (not just those with fixed ratio) - for flavorGroupName := range flavorGroups { + for flavorGroupName, groupData := range flavorGroups { // All flavor groups are included in usage reporting. // === 1. RAM Resource === ramResourceName := liquid.ResourceName(ResourceNameRAM(flavorGroupName)) ramPerAZ := make(map[liquid.AvailabilityZone]*liquid.AZResourceUsageReport) + // For AZSeparatedTopology resources (fixed-ratio groups), per-AZ Quota must be non-null. + // Use -1 ("infinite quota") as default until actual quota is read from ProjectQuota CRD. 
+ ramHasAZQuota := groupData.HasFixedRamCoreRatio() for _, az := range allAZs { - ramPerAZ[az] = &liquid.AZResourceUsageReport{ + report := &liquid.AZResourceUsageReport{ Usage: 0, Subresources: []liquid.Subresource{}, } + if ramHasAZQuota { + report.Quota = Some(int64(-1)) // infinite — will be overridden by ProjectQuota CRD + } + ramPerAZ[az] = report } if azData, exists := usageByFlavorGroupAZ[flavorGroupName]; exists { for az, data := range azData { if _, known := ramPerAZ[az]; !known { - ramPerAZ[az] = &liquid.AZResourceUsageReport{} + report := &liquid.AZResourceUsageReport{} + if ramHasAZQuota { + report.Quota = Some(int64(-1)) + } + ramPerAZ[az] = report } ramPerAZ[az].Usage = data.ramUsage ramPerAZ[az].PhysicalUsage = Some(data.ramUsage) // No overcommit for RAM @@ -564,8 +582,9 @@ func buildVMAttributes(vm VMUsageInfo, commitmentID string) map[string]any { } result := map[string]any{ - "status": vm.Status, - "flavor": flavor, + "status": vm.Status, + "flavor": flavor, + "os_type": vm.OSType, } // Add commitment_id - nil for PAYG, string for committed @@ -614,7 +633,7 @@ func (c *dbUsageClient) getReader(ctx context.Context) (*external.PostgresReader return reader, nil } -// vmQueryRow is the scan target for the server+flavor JOIN query. +// vmQueryRow is the scan target for the server+flavor+image JOIN query. type vmQueryRow struct { ID string `db:"id"` Name string `db:"name"` @@ -627,6 +646,7 @@ type vmQueryRow struct { FlavorVCPUs uint64 `db:"flavor_vcpus"` FlavorDisk uint64 `db:"flavor_disk"` FlavorExtras string `db:"flavor_extras"` + OSType string `db:"os_type"` } // ListProjectVMs returns all VMs for a project joined with their flavor data from Postgres. @@ -645,9 +665,11 @@ func (c *dbUsageClient) ListProjectVMs(ctx context.Context, projectID string) ([ COALESCE(f.ram, 0) AS flavor_ram, COALESCE(f.vcpus, 0) AS flavor_vcpus, COALESCE(f.disk, 0) AS flavor_disk, - COALESCE(f.extra_specs, '') AS flavor_extras + COALESCE(f.extra_specs, '') AS flavor_extras, + COALESCE(NULLIF(i.os_type, ''), 'unknown') AS os_type FROM ` + nova.Server{}.TableName() + ` s LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON f.name = s.flavor_name + LEFT JOIN ` + nova.Image{}.TableName() + ` i ON i.id = s.image_ref WHERE s.tenant_id = $1` var rows []vmQueryRow diff --git a/internal/scheduling/reservations/commitments/usage_internals_test.go b/internal/scheduling/reservations/commitments/usage_internals_test.go index d9f6c474f..233db19fc 100644 --- a/internal/scheduling/reservations/commitments/usage_internals_test.go +++ b/internal/scheduling/reservations/commitments/usage_internals_test.go @@ -177,6 +177,7 @@ func TestBuildVMAttributes(t *testing.T) { MemoryMB: 4096, VCPUs: 16, DiskGB: 100, + OSType: "windows8Server64Guest", } t.Run("with commitment", func(t *testing.T) { @@ -186,7 +187,11 @@ func TestBuildVMAttributes(t *testing.T) { t.Errorf("status = %v, expected ACTIVE", attrs["status"]) } - for _, absent := range []string{"metadata", "tags", "os_type"} { + if attrs["os_type"] != "windows8Server64Guest" { + t.Errorf("os_type = %v, expected windows8Server64Guest", attrs["os_type"]) + } + + for _, absent := range []string{"metadata", "tags"} { if _, present := attrs[absent]; present { t.Errorf("%s must not appear in output (not available from Postgres cache)", absent) } @@ -404,21 +409,38 @@ func TestUsageCalculator_AssignVMsToCommitments(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - calc := &UsageCalculator{} - assignments, count := 
calc.assignVMsToCommitments(tt.vms, tt.commitments) + assignVMsToCommitments(tt.vms, tt.commitments) + + // Derive assignment map from mutated commitment states. + assignments := make(map[string]string) + totalAssigned := 0 + for _, states := range tt.commitments { + for _, state := range states { + for _, vmUUID := range state.AssignedInstances { + assignments[vmUUID] = state.CommitmentUUID + totalAssigned++ + } + } + } - if count != tt.expectedCount { - t.Errorf("assigned count = %d, expected %d", count, tt.expectedCount) + if totalAssigned != tt.expectedCount { + t.Errorf("assigned count = %d, expected %d", totalAssigned, tt.expectedCount) } for vmUUID, expectedCommitment := range tt.expectedAssignments { actual, ok := assignments[vmUUID] - if !ok { - t.Errorf("VM %s not in assignments", vmUUID) - continue - } - if actual != expectedCommitment { - t.Errorf("VM %s: commitment = %q, expected %q", vmUUID, actual, expectedCommitment) + if expectedCommitment == "" { + if ok { + t.Errorf("VM %s should be PAYG but was assigned to %q", vmUUID, actual) + } + } else { + if !ok { + t.Errorf("VM %s not in assignments", vmUUID) + continue + } + if actual != expectedCommitment { + t.Errorf("VM %s: commitment = %q, expected %q", vmUUID, actual, expectedCommitment) + } } } }) diff --git a/internal/scheduling/reservations/commitments/usage_reconciler.go b/internal/scheduling/reservations/commitments/usage_reconciler.go new file mode 100644 index 000000000..4d09439b9 --- /dev/null +++ b/internal/scheduling/reservations/commitments/usage_reconciler.go @@ -0,0 +1,351 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + "time" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/go-logr/logr" + "github.com/sapcc/go-api-declarations/liquid" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" +) + +// UsageReconciler reconciles CommittedResource.Status usage fields (AssignedInstances, UsedResources, +// LastUsageReconcileAt) by running the deterministic VM-to-CR assignment periodically and on +// relevant change events. +type UsageReconciler struct { + client.Client + Conf UsageReconcilerConfig + UsageDB UsageDBClient + Monitor UsageReconcilerMonitor +} + +func (r *UsageReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + start := time.Now() + + var cr v1alpha1.CommittedResource + if err := r.Get(ctx, req.NamespacedName, &cr); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + log := ctrl.LoggerFrom(ctx).WithValues("component", "usage-reconciler", "committedResource", req.Name) + + // Only active commitments have assigned VMs. Clear stale usage status if present. 
+ if cr.Spec.State != v1alpha1.CommitmentStatusConfirmed && cr.Spec.State != v1alpha1.CommitmentStatusGuaranteed { + log.Info("skipping: commitment state is not active", "state", cr.Spec.State) + if len(cr.Status.AssignedInstances) > 0 || len(cr.Status.UsedResources) > 0 { + old := cr.DeepCopy() + cr.Status.AssignedInstances = nil + cr.Status.UsedResources = nil + cr.Status.LastUsageReconcileAt = nil + cr.Status.UsageObservedGeneration = nil + if err := r.Status().Patch(ctx, &cr, client.MergeFrom(old)); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + } + return ctrl.Result{}, nil + } + + // Skip expired or not-yet-started commitments, mirroring the time guards in buildCommitmentCapacityMap. + // The state field may not reflect expiry immediately (syncer updates on its own schedule), + // so we check EndTime directly to prevent stale assignments from persisting past expiry. + if cr.Spec.EndTime != nil && cr.Spec.EndTime.Time.Before(start) { + log.Info("skipping: commitment has expired", "endTime", cr.Spec.EndTime) + if len(cr.Status.AssignedInstances) > 0 || len(cr.Status.UsedResources) > 0 { + old := cr.DeepCopy() + cr.Status.AssignedInstances = nil + cr.Status.UsedResources = nil + cr.Status.LastUsageReconcileAt = nil + cr.Status.UsageObservedGeneration = nil + if err := r.Status().Patch(ctx, &cr, client.MergeFrom(old)); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + } + return ctrl.Result{}, nil + } + if cr.Spec.StartTime != nil && cr.Spec.StartTime.After(start) { + log.Info("skipping: commitment has not started yet", "startTime", cr.Spec.StartTime) + return ctrl.Result{}, nil + } + + cooldown := r.Conf.CooldownInterval.Duration + + // Gate: wait until the CR controller has accepted the current generation. + // The CR controller writes the Ready condition (with ObservedGeneration) only after + // updating the AcceptedSpec. Running before that would read stale capacity. + // We don't requeue here — the acceptedGenerationPredicate watch fires when the + // condition is written, triggering a fresh reconcile at that point. + readyCond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if readyCond == nil || readyCond.ObservedGeneration != cr.Generation || readyCond.Status != metav1.ConditionTrue { + log.Info("skipping: Ready condition not yet accepted for current generation", + "generation", cr.Generation, + "readyCondFound", readyCond != nil, + ) + return ctrl.Result{}, nil + } + + // Bypass cooldown when the spec generation has advanced since the last usage reconcile. + // This ensures spec changes (e.g. shrink) are reflected immediately rather than waiting + // for the next cooldown interval — follows the Kubernetes observedGeneration pattern. 
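+	// Worked example of the gate below (hypothetical values, 5m cooldown): generation=3 with
+	// UsageObservedGeneration=2 bypasses the cooldown and reconciles immediately; generation=3
+	// with UsageObservedGeneration=3 and a last reconcile 1m ago requeues after the remaining 4m.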
+ generationAdvanced := cr.Status.UsageObservedGeneration == nil || + *cr.Status.UsageObservedGeneration != cr.Generation + if !generationAdvanced && cr.Status.LastUsageReconcileAt != nil { + if elapsed := time.Since(cr.Status.LastUsageReconcileAt.Time); elapsed < cooldown { + return ctrl.Result{RequeueAfter: cooldown - elapsed}, nil + } + } + + log = log.WithValues("projectID", cr.Spec.ProjectID) + trigger := "periodic" + if generationAdvanced { + trigger = "generation-change" + } + logger := log + logger.Info("usage reconcile starting", "trigger", trigger, "generation", cr.Generation) + + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + r.Monitor.reconcileDuration.WithLabelValues("error").Observe(time.Since(start).Seconds()) + return ctrl.Result{}, err + } + + commitmentsByAZFG, err := buildCommitmentCapacityMap(ctx, r.Client, logger, cr.Spec.ProjectID) + if err != nil { + r.Monitor.reconcileDuration.WithLabelValues("error").Observe(time.Since(start).Seconds()) + return ctrl.Result{}, err + } + if len(commitmentsByAZFG) == 0 { + logger.Info("no active commitments found for project, retrying after cooldown") + return ctrl.Result{RequeueAfter: cooldown}, nil + } + + // Derive the known AZs from the commitment map so that NormalizeAZ maps VM AZ strings + // to the same values used as commitment keys. VMs in unrecognised AZs get "unknown" and + // are treated as PAYG, which is the correct fallback. + allAZs := make([]liquid.AvailabilityZone, 0, len(commitmentsByAZFG)) + seenAZs := make(map[liquid.AvailabilityZone]struct{}) + for _, states := range commitmentsByAZFG { + for _, state := range states { + az := liquid.AvailabilityZone(state.AvailabilityZone) + if _, ok := seenAZs[az]; !ok { + seenAZs[az] = struct{}{} + allAZs = append(allAZs, az) + } + } + } + + vms, err := getProjectVMs(ctx, r.UsageDB, logger, cr.Spec.ProjectID, flavorGroups, allAZs) + if err != nil { + r.Monitor.reconcileDuration.WithLabelValues("error").Observe(time.Since(start).Seconds()) + return ctrl.Result{}, err + } + sortVMsForUsageCalculation(vms) + assignVMsToCommitments(vms, commitmentsByAZFG) + + now := metav1.Now() + written := 0 + totalAssigned := 0 + var writeErr error + for _, group := range commitmentsByAZFG { + for _, state := range group { + if err := r.writeUsageStatus(ctx, state, now); err != nil { + logger.Error(err, "failed to write usage status", "commitmentUUID", state.CommitmentUUID) + writeErr = err + } else { + written++ + totalAssigned += len(state.AssignedInstances) + } + } + } + if writeErr != nil { + return ctrl.Result{}, writeErr + } + + r.Monitor.reconcileDuration.WithLabelValues("success").Observe(time.Since(start).Seconds()) + // Observe status age once per reconcile, not once per commitment, to avoid biasing the + // histogram toward projects with many commitments. + if written > 0 && cr.Status.LastUsageReconcileAt != nil { + r.Monitor.statusAge.Observe(now.Time.Sub(cr.Status.LastUsageReconcileAt.Time).Seconds()) + } + r.Monitor.assignedInstances.WithLabelValues(cr.Spec.ProjectID).Set(float64(totalAssigned)) + + logger.Info("usage reconcile complete", + "commitments", written, + "vms", len(vms), + "assignedInstances", totalAssigned, + ) + + // Successful reconcile schedules the next run after the cooldown — acts as the periodic fallback. 
+ return ctrl.Result{RequeueAfter: cooldown}, nil +} + +// writeUsageStatus patches AssignedInstances, UsedResources, and LastUsageReconcileAt on the CommittedResource +// identified by state.CommitmentUUID. +func (r *UsageReconciler) writeUsageStatus(ctx context.Context, state *CommitmentStateWithUsage, now metav1.Time) error { + var crList v1alpha1.CommittedResourceList + if err := r.List(ctx, &crList, client.MatchingFields{idxCommittedResourceByUUID: state.CommitmentUUID}); err != nil { + return err + } + if len(crList.Items) == 0 { + return nil + } + target := &crList.Items[0] + old := target.DeepCopy() + + usedBytes := state.TotalMemoryBytes - state.RemainingMemoryBytes + usedQty := resource.NewQuantity(usedBytes, resource.BinarySI) + usedCores := resource.NewQuantity(state.UsedVCPUs, resource.DecimalSI) + + target.Status.AssignedInstances = state.AssignedInstances + target.Status.UsedResources = map[string]resource.Quantity{ + "memory": *usedQty, + "cpu": *usedCores, + } + target.Status.LastUsageReconcileAt = &now + target.Status.UsageObservedGeneration = &target.Generation + + return r.Status().Patch(ctx, target, client.MergeFrom(old)) +} + +// hypervisorToCommittedResources maps a Hypervisor change to the CommittedResources of affected projects. +// When a hypervisor's VM list changes, all CommittedResources for projects that have reservations +// on that host need their usage re-evaluated. +func (r *UsageReconciler) hypervisorToCommittedResources(ctx context.Context, obj client.Object) []reconcile.Request { + hvName := obj.GetName() + log := ctrl.LoggerFrom(ctx) + + var reservationList v1alpha1.ReservationList + if err := r.List(ctx, &reservationList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + log.Error(err, "failed to list reservations for hypervisor event", "hypervisor", hvName) + return nil + } + + projectIDs := make(map[string]struct{}) + for _, res := range reservationList.Items { + if res.Status.Host == hvName && res.Spec.CommittedResourceReservation != nil { + projectIDs[res.Spec.CommittedResourceReservation.ProjectID] = struct{}{} + } + } + if len(projectIDs) == 0 { + return nil + } + + var requests []reconcile.Request + for projectID := range projectIDs { + var crList v1alpha1.CommittedResourceList + if err := r.List(ctx, &crList, client.MatchingFields{idxCommittedResourceByProjectID: projectID}); err != nil { + log.Error(err, "failed to list CommittedResources for hypervisor event", "hypervisor", hvName, "projectID", projectID) + return nil + } + for _, cr := range crList.Items { + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{Name: cr.Name}, + }) + } + } + return requests +} + +// SetupWithManager registers the usage reconciler with the controller manager. 
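+// It first registers the field indexes that the reconciler's List calls depend on. As a rough
+// single-cluster sketch of what such an index registration looks like (the real
+// indexCommittedResourceByUUID helper wraps the multicluster client and lives elsewhere in
+// this package):
+//
+//	mgr.GetFieldIndexer().IndexField(ctx, &v1alpha1.CommittedResource{}, idxCommittedResourceByUUID,
+//		func(o client.Object) []string {
+//			return []string{o.(*v1alpha1.CommittedResource).Spec.CommitmentUUID}
+//		})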
+func (r *UsageReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error {
+	log := ctrl.Log.WithName("committed-resource-usage")
+	log.Info("starting usage reconciler", "cooldownInterval", r.Conf.CooldownInterval.Duration)
+
+	if err := indexCommittedResourceByUUID(context.Background(), mcl); err != nil {
+		return fmt.Errorf("failed to set up committed resource field index: %w", err)
+	}
+	if err := indexCommittedResourceByProjectID(context.Background(), mcl); err != nil {
+		return fmt.Errorf("failed to set up committed resource project index: %w", err)
+	}
+
+	bldr := multicluster.BuildController(mcl, mgr)
+
+	// Watch CommittedResource status updates where the CR controller has just accepted the
+	// current generation. Fires when the Ready condition's ObservedGeneration advances to match
+	// metadata.generation. We intentionally do NOT watch spec changes (GenerationChangedPredicate):
+	// capacity is read from AcceptedSpec in status, which is only valid after the CR controller
+	// has finished — so triggering on spec changes would always hit the readiness gate and do nothing.
+	var err error
+	bldr, err = bldr.WatchesMulticluster(
+		&v1alpha1.CommittedResource{},
+		&handler.EnqueueRequestForObject{},
+		acceptedGenerationPredicate{log: log},
+	)
+	if err != nil {
+		return err
+	}
+
+	// Watch Hypervisor CRDs: when VM instances on a host change, re-evaluate usage for
+	// projects that have reservations on that host.
+	bldr, err = bldr.WatchesMulticluster(
+		&hv1.Hypervisor{},
+		handler.EnqueueRequestsFromMapFunc(r.hypervisorToCommittedResources),
+	)
+	if err != nil {
+		return err
+	}
+
+	// MaxConcurrentReconciles=1: the per-project assignment is globally consistent only when
+	// run serially — concurrent runs for the same project could produce conflicting writes.
+	return bldr.Named("committed-resource-usage").
+		WithOptions(controller.Options{
+			MaxConcurrentReconciles: 1,
+		}).
+		Complete(r)
+}
+
+// acceptedGenerationPredicate fires on status-only updates where the CR controller has
+// accepted the current spec generation (Ready=True, ObservedGeneration==metadata.generation)
+// but the usage reconciler hasn't yet processed it (UsageObservedGeneration lags behind).
+// Checking usage lag instead of Ready advancement makes it resilient to the race between
+// the predicate firing and the reconciler reading from cache: if the first reconcile misses
+// the window, the next status-only update re-fires the predicate.
+type acceptedGenerationPredicate struct {
+	predicate.Funcs
+	log logr.Logger
+}
+
+func (p acceptedGenerationPredicate) Update(e event.UpdateEvent) bool {
+	oldCR, ok1 := e.ObjectOld.(*v1alpha1.CommittedResource)
+	newCR, ok2 := e.ObjectNew.(*v1alpha1.CommittedResource)
+	if !ok1 || !ok2 {
+		return false
+	}
+	// Only react to status-only updates; spec-generation changes are intentionally
+	// not watched by this controller (see the SetupWithManager comment above).
+	if oldCR.Generation != newCR.Generation {
+		return false
+	}
+	newCond := meta.FindStatusCondition(newCR.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	if newCond == nil || newCond.Status != metav1.ConditionTrue || newCond.ObservedGeneration != newCR.Generation {
+		return false
+	}
+	// Don't fire if usage is already up to date for the accepted generation.
+	if newCR.Status.UsageObservedGeneration != nil && *newCR.Status.UsageObservedGeneration >= newCond.ObservedGeneration {
+		return false
+	}
+	p.log.Info("predicate fired: Ready accepted, usage not yet up to date",
+		"name", newCR.Name, "generation", newCR.Generation,
+		"usageObservedGeneration", newCR.Status.UsageObservedGeneration)
+	return true
+}
diff --git a/internal/scheduling/reservations/commitments/usage_reconciler_monitor.go b/internal/scheduling/reservations/commitments/usage_reconciler_monitor.go
new file mode 100644
index 000000000..fe64ac823
--- /dev/null
+++ b/internal/scheduling/reservations/commitments/usage_reconciler_monitor.go
@@ -0,0 +1,55 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// UsageReconcilerMonitor provides metrics for the usage reconciler.
+type UsageReconcilerMonitor struct {
+	reconcileDuration *prometheus.HistogramVec
+	statusAge         prometheus.Histogram
+	assignedInstances *prometheus.GaugeVec
+}
+
+// NewUsageReconcilerMonitor creates a new monitor with Prometheus metrics.
+func NewUsageReconcilerMonitor() UsageReconcilerMonitor {
+	m := UsageReconcilerMonitor{
+		reconcileDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Name:    "cortex_cr_usage_reconcile_duration_seconds",
+			Help:    "Duration of committed resource usage reconcile runs in seconds.",
+			Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
+		}, []string{"result"}),
+		statusAge: prometheus.NewHistogram(prometheus.HistogramOpts{
+			Name:    "cortex_cr_usage_status_age_seconds",
+			Help:    "Age of CommittedResource usage status at reconcile time, in seconds. Distribution across all active commitments shows freshness spread.",
+			Buckets: []float64{30, 60, 120, 300, 600, 900, 1800},
+		}),
+		assignedInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			// Gauge, so no _total suffix (that suffix is reserved for counters).
+			Name: "cortex_cr_usage_assigned_vms",
+			Help: "Number of VMs currently assigned to committed resources for a project.",
+		}, []string{"project_id"}),
+	}
+
+	// Pre-initialize result labels so metrics appear before the first reconcile.
+	m.reconcileDuration.WithLabelValues("success")
+	m.reconcileDuration.WithLabelValues("error")
+
+	return m
+}
+
+// Describe implements prometheus.Collector.
+func (m UsageReconcilerMonitor) Describe(ch chan<- *prometheus.Desc) {
+	m.reconcileDuration.Describe(ch)
+	m.statusAge.Describe(ch)
+	m.assignedInstances.Describe(ch)
+}
+
+// Collect implements prometheus.Collector.
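+// Because the monitor implements both Describe and Collect, it satisfies
+// prometheus.Collector and can be registered directly; a wiring sketch (the
+// actual registration happens outside this file):
+//
+//	m := NewUsageReconcilerMonitor()
+//	prometheus.MustRegister(m)
+//	r := &UsageReconciler{Monitor: m}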
+func (m UsageReconcilerMonitor) Collect(ch chan<- prometheus.Metric) { + m.reconcileDuration.Collect(ch) + m.statusAge.Collect(ch) + m.assignedInstances.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/usage_reconciler_test.go b/internal/scheduling/reservations/commitments/usage_reconciler_test.go new file mode 100644 index 000000000..ee07b0632 --- /dev/null +++ b/internal/scheduling/reservations/commitments/usage_reconciler_test.go @@ -0,0 +1,485 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "testing" + "time" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/go-logr/logr" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func i64ptr(i int64) *int64 { return &i } + +// newUsageReconciler builds a minimal UsageReconciler for unit tests. +func newUsageReconciler(k8sClient client.Client, cooldown time.Duration) *UsageReconciler { + return &UsageReconciler{ + Client: k8sClient, + Conf: UsageReconcilerConfig{CooldownInterval: metav1.Duration{Duration: cooldown}}, + Monitor: NewUsageReconcilerMonitor(), + } +} + +// ============================================================================ +// acceptedGenerationPredicate +// ============================================================================ + +func TestAcceptedGenerationPredicate_Update(t *testing.T) { + readyCond := metav1.Condition{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: v1alpha1.CommittedResourceReasonAccepted, + } + + tests := []struct { + name string + old *v1alpha1.CommittedResource + new *v1alpha1.CommittedResource + want bool + }{ + { + name: "generation changed: spec update ignored by this predicate", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 1}}, + new: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 2}}, + want: false, + }, + { + name: "no Ready condition", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 1}}, + new: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 1}}, + want: false, + }, + { + name: "Ready=False", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 1}}, + new: &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: v1alpha1.CommittedResourceReasonReserving, + }}, + }, + }, + want: false, + }, + { + name: "Ready=True but ObservedGeneration lags metadata.generation", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 2}}, + new: &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + ObservedGeneration: 1, // lags behind generation=2 + Reason: v1alpha1.CommittedResourceReasonAccepted, + }}, + }, + }, + want: false, + }, + { + name: "usage already current: UsageObsGen equals Ready.ObservedGeneration", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 
1}}, + new: &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{readyCond}, + UsageObservedGeneration: i64ptr(1), + }, + }, + want: false, + }, + { + name: "UsageObsGen nil: fires (first usage reconcile needed after acceptance)", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 1}}, + new: &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{readyCond}, + UsageObservedGeneration: nil, + }, + }, + want: true, + }, + { + name: "UsageObsGen lags: fires (retrigger after cache-race miss)", + old: &v1alpha1.CommittedResource{ObjectMeta: metav1.ObjectMeta{Generation: 1}}, + new: &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{readyCond}, + UsageObservedGeneration: i64ptr(0), // lags behind Ready.ObservedGeneration=1 + }, + }, + want: true, + }, + } + + p := acceptedGenerationPredicate{log: logr.Discard()} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ev := event.UpdateEvent{ObjectOld: tt.old, ObjectNew: tt.new} + if got := p.Update(ev); got != tt.want { + t.Errorf("Update() = %v, want %v", got, tt.want) + } + }) + } +} + +// ============================================================================ +// UsageReconciler.Reconcile gate logic +// ============================================================================ + +func TestUsageReconciler_Reconcile_Gates(t *testing.T) { + scheme := newCRTestScheme(t) + const cooldown = 5 * time.Minute + + t.Run("CR not found: returns nil without requeue", func(t *testing.T) { + r := newUsageReconciler(newCRTestClient(scheme), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq("nonexistent")) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("RequeueAfter = %v, want 0", result.RequeueAfter) + } + }) + + t.Run("non-active state without stale data: no patch, no requeue", func(t *testing.T) { + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusPlanned) + r := newUsageReconciler(newCRTestClient(scheme, cr), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("RequeueAfter = %v, want 0", result.RequeueAfter) + } + }) + + t.Run("non-active state with stale data: clears AssignedInstances and timestamps", func(t *testing.T) { + now := metav1.Now() + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusExpired) + cr.Status.AssignedInstances = []string{"vm-1", "vm-2"} + cr.Status.LastUsageReconcileAt = &now + + k8sClient := newCRTestClient(scheme, cr) + r := newUsageReconciler(k8sClient, cooldown) + _, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var updated v1alpha1.CommittedResource + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &updated); err != nil { + t.Fatalf("get CR: %v", err) + } + if len(updated.Status.AssignedInstances) != 0 { + t.Errorf("AssignedInstances = %v, want nil", updated.Status.AssignedInstances) + } + if updated.Status.LastUsageReconcileAt != nil { + t.Errorf("LastUsageReconcileAt = %v, want nil", 
updated.Status.LastUsageReconcileAt) + } + }) + + t.Run("expired EndTime with stale data: clears AssignedInstances and timestamps", func(t *testing.T) { + now := metav1.Now() + pastTime := metav1.NewTime(time.Now().Add(-1 * time.Minute)) + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Spec.EndTime = &pastTime + cr.Status.AssignedInstances = []string{"vm-1"} + cr.Status.LastUsageReconcileAt = &now + + k8sClient := newCRTestClient(scheme, cr) + r := newUsageReconciler(k8sClient, cooldown) + _, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var updated v1alpha1.CommittedResource + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &updated); err != nil { + t.Fatalf("get CR: %v", err) + } + if len(updated.Status.AssignedInstances) != 0 { + t.Errorf("AssignedInstances = %v, want nil", updated.Status.AssignedInstances) + } + if updated.Status.LastUsageReconcileAt != nil { + t.Errorf("LastUsageReconcileAt = %v, want nil", updated.Status.LastUsageReconcileAt) + } + }) + + t.Run("future StartTime: skips without clearing or requeuing", func(t *testing.T) { + futureTime := metav1.NewTime(time.Now().Add(10 * time.Minute)) + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusGuaranteed) + cr.Spec.StartTime = &futureTime + r := newUsageReconciler(newCRTestClient(scheme, cr), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("RequeueAfter = %v, want 0", result.RequeueAfter) + } + }) + + t.Run("Ready condition absent: skips without requeue", func(t *testing.T) { + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + r := newUsageReconciler(newCRTestClient(scheme, cr), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("RequeueAfter = %v, want 0 (no requeue until predicate fires)", result.RequeueAfter) + } + }) + + t.Run("Ready=True but ObsGen behind metadata.generation: skips without requeue", func(t *testing.T) { + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Generation = 2 + cr.Status.Conditions = []metav1.Condition{{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + ObservedGeneration: 1, // lags behind generation=2 + Reason: v1alpha1.CommittedResourceReasonAccepted, + }} + r := newUsageReconciler(newCRTestClient(scheme, cr), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("RequeueAfter = %v, want 0", result.RequeueAfter) + } + }) + + t.Run("cooldown active: recent reconcile returns RequeueAfter near cooldown boundary", func(t *testing.T) { + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Generation = 1 + now := metav1.Now() + cr.Status.Conditions = []metav1.Condition{{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: v1alpha1.CommittedResourceReasonAccepted, + }} + cr.Status.LastUsageReconcileAt = &now + cr.Status.UsageObservedGeneration = i64ptr(1) // up to date → generationAdvanced=false + + r := 
newUsageReconciler(newCRTestClient(scheme, cr), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter <= 0 || result.RequeueAfter > cooldown { + t.Errorf("RequeueAfter = %v, want (0, %v]", result.RequeueAfter, cooldown) + } + }) + + t.Run("generation advanced bypasses cooldown: runs past cooldown to buildCommitmentCapacityMap", func(t *testing.T) { + // UsageObsGen=nil means generationAdvanced=true, bypassing cooldown. + // The CR has no AcceptedSpec so buildCommitmentCapacityMap returns + // empty, triggering the "no active commitments" early-exit with RequeueAfter=cooldown. + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Generation = 1 + cr.Status.Conditions = []metav1.Condition{{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: v1alpha1.CommittedResourceReasonAccepted, + }} + // UsageObservedGeneration intentionally left nil + + r := newUsageReconciler(newCRTestClient(scheme, cr, newTestFlavorKnowledge()), cooldown) + result, err := r.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != cooldown { + t.Errorf("RequeueAfter = %v, want %v", result.RequeueAfter, cooldown) + } + }) +} + +// ============================================================================ +// hypervisorToCommittedResources mapper +// ============================================================================ + +func TestUsageReconciler_HypervisorToCommittedResources(t *testing.T) { + scheme := newCRTestScheme(t) + const cooldown = 5 * time.Minute + + t.Run("no reservations: empty result", func(t *testing.T) { + r := newUsageReconciler(newCRTestClient(scheme), cooldown) + hv := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} + if reqs := r.hypervisorToCommittedResources(context.Background(), hv); len(reqs) != 0 { + t.Errorf("got %d requests, want 0", len(reqs)) + } + }) + + t.Run("reservation on different host: no matching project, empty result", func(t *testing.T) { + res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "res-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-a", + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-2"}, // different host + } + r := newUsageReconciler(newCRTestClient(scheme, res), cooldown) + hv := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} + if reqs := r.hypervisorToCommittedResources(context.Background(), hv); len(reqs) != 0 { + t.Errorf("got %d requests, want 0", len(reqs)) + } + }) + + t.Run("reservation on host with matching CR: returns request for that CR", func(t *testing.T) { + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "res-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: cr.Spec.ProjectID, + 
CommitmentUUID: cr.Spec.CommitmentUUID, + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-1"}, + } + + r := newUsageReconciler(newCRTestClient(scheme, cr, res), cooldown) + hv := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} + reqs := r.hypervisorToCommittedResources(context.Background(), hv) + if len(reqs) != 1 { + t.Fatalf("got %d requests, want 1", len(reqs)) + } + if reqs[0].Name != cr.Name { + t.Errorf("request name = %q, want %q", reqs[0].Name, cr.Name) + } + }) + + t.Run("two CRs share a host: both enqueued", func(t *testing.T) { + cr1 := newTestCommittedResource("cr-1", v1alpha1.CommitmentStatusConfirmed) + cr1.Spec.CommitmentUUID = "uuid-cr1" + + cr2 := newTestCommittedResource("cr-2", v1alpha1.CommitmentStatusConfirmed) + cr2.Spec.CommitmentUUID = "uuid-cr2" + // Same project so a single reservation entry covers both CRs. + + res := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "res-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: cr1.Spec.ProjectID, + CommitmentUUID: cr1.Spec.CommitmentUUID, + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-1"}, + } + + r := newUsageReconciler(newCRTestClient(scheme, cr1, cr2, res), cooldown) + hv := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} + reqs := r.hypervisorToCommittedResources(context.Background(), hv) + if len(reqs) != 2 { + t.Errorf("got %d requests, want 2", len(reqs)) + } + }) +} + +// ============================================================================ +// writeUsageStatus +// ============================================================================ + +func TestUsageReconciler_WriteUsageStatus(t *testing.T) { + scheme := newCRTestScheme(t) + + t.Run("UUID not in index: returns nil without error", func(t *testing.T) { + r := newUsageReconciler(newCRTestClient(scheme), time.Minute) + state := &CommitmentStateWithUsage{ + CommitmentState: CommitmentState{ + CommitmentUUID: "no-such-uuid", + TotalMemoryBytes: 4 * 1024 * 1024 * 1024, + }, + RemainingMemoryBytes: 2 * 1024 * 1024 * 1024, + } + if err := r.writeUsageStatus(context.Background(), state, metav1.Now()); err != nil { + t.Errorf("unexpected error: %v", err) + } + }) + + t.Run("UUID found: patches AssignedInstances, UsedResources, and generation", func(t *testing.T) { + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Generation = 1 + k8sClient := newCRTestClient(scheme, cr) + r := newUsageReconciler(k8sClient, time.Minute) + + now := metav1.Now() + state := &CommitmentStateWithUsage{ + CommitmentState: CommitmentState{ + CommitmentUUID: cr.Spec.CommitmentUUID, + TotalMemoryBytes: 4 * 1024 * 1024 * 1024, + }, + RemainingMemoryBytes: 2 * 1024 * 1024 * 1024, // 2 GiB used + UsedVCPUs: 4, + AssignedInstances: []string{"vm-1", "vm-2"}, + } + + if err := r.writeUsageStatus(context.Background(), state, now); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var updated v1alpha1.CommittedResource + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &updated); err != nil { + t.Fatalf("get CR: %v", err) + } + + if len(updated.Status.AssignedInstances) != 2 { + t.Errorf("AssignedInstances len = %d, want 2", len(updated.Status.AssignedInstances)) + } + if 
updated.Status.LastUsageReconcileAt == nil {
+			t.Errorf("LastUsageReconcileAt not set")
+		}
+		if updated.Status.UsageObservedGeneration == nil || *updated.Status.UsageObservedGeneration != 1 {
+			t.Errorf("UsageObservedGeneration = %v, want 1", updated.Status.UsageObservedGeneration)
+		}
+		if _, ok := updated.Status.UsedResources["memory"]; !ok {
+			t.Errorf("UsedResources[memory] not set")
+		}
+		if _, ok := updated.Status.UsedResources["cpu"]; !ok {
+			t.Errorf("UsedResources[cpu] not set")
+		}
+	})
+}
diff --git a/internal/scheduling/reservations/failover/integration_test.go b/internal/scheduling/reservations/failover/integration_test.go
index 66d5733bb..df1354be6 100644
--- a/internal/scheduling/reservations/failover/integration_test.go
+++ b/internal/scheduling/reservations/failover/integration_test.go
@@ -1068,6 +1068,21 @@ func (s *MockVMSource) GetVM(_ context.Context, vmUUID string) (*VM, error) {
 	return nil, nil
 }

+// IsServerActive returns true if the server is found in the mock VMs.
+func (s *MockVMSource) IsServerActive(_ context.Context, vmUUID string) (bool, error) {
+	for i := range s.VMs {
+		if s.VMs[i].UUID == vmUUID {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
+// GetDeletedVMInfo returns nil, nil (no deleted VMs in mock).
+func (s *MockVMSource) GetDeletedVMInfo(_ context.Context, _ string) (*DeletedVMInfo, error) {
+	return nil, nil
+}
+
 // newIntegrationTestEnv creates a complete test environment with HTTP server and VMSource.
 func newIntegrationTestEnv(t *testing.T, vms []VM, hypervisors []*hv1.Hypervisor, reservations []*v1alpha1.Reservation) *IntegrationTestEnv {
 	t.Helper()
diff --git a/internal/scheduling/reservations/failover/vm_source.go b/internal/scheduling/reservations/failover/vm_source.go
index 4d5c3f210..5a9603b79 100644
--- a/internal/scheduling/reservations/failover/vm_source.go
+++ b/internal/scheduling/reservations/failover/vm_source.go
@@ -26,6 +26,9 @@ type VM struct {
 	// AvailabilityZone is the availability zone where the VM is located.
 	// This is used to ensure failover reservations are created in the same AZ.
 	AvailabilityZone string
+	// CreatedAt is the ISO 8601 timestamp when the VM was created in Nova.
+	// Used by the quota controller to distinguish new VMs from migrations.
+	CreatedAt string
 	// Resources contains the VM's resource allocations (e.g., "memory", "vcpus").
 	Resources map[string]resource.Quantity
 	// FlavorExtraSpecs contains the flavor's extra specifications (e.g., traits, capabilities).
@@ -46,6 +49,22 @@ type VMSource interface {
 	// GetVM returns a specific VM by UUID.
 	// Returns nil, nil if the VM is not found (not an error, just doesn't exist).
 	GetVM(ctx context.Context, vmUUID string) (*VM, error)
+	// IsServerActive returns true if the server exists in the servers table and is not DELETED
+	// (still consuming resources somewhere). Returns false if the server is not found or already
+	// DELETED. Used by the quota controller to determine if a removed HV instance was deleted vs migrated.
+	IsServerActive(ctx context.Context, vmUUID string) (bool, error)
+	// GetDeletedVMInfo returns metadata about a deleted VM (from deleted_servers table),
+	// including resolved flavor resources. Returns nil, nil if not found.
+	// Used by quota controller for incremental usage decrements.
+	GetDeletedVMInfo(ctx context.Context, vmUUID string) (*DeletedVMInfo, error)
+}
+
+// DeletedVMInfo contains the metadata needed to compute resource decrements for a deleted VM.
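+// A hypothetical decrement on the quota controller side (sketch only; the controller
+// itself is outside this hunk) converts the flavor units into resource quantities:
+//
+//	info, err := src.GetDeletedVMInfo(ctx, vmUUID)
+//	if err == nil && info != nil {
+//		usedRAM.Sub(*resource.NewQuantity(int64(info.RAMMiB)<<20, resource.BinarySI)) // MiB to bytes
+//		usedCPU.Sub(*resource.NewQuantity(int64(info.VCPUs), resource.DecimalSI))
+//	}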
+type DeletedVMInfo struct { + ProjectID string + AvailabilityZone string + FlavorName string + RAMMiB uint64 + VCPUs uint64 } // DBVMSource implements VMSource by reading directly from the database. @@ -122,6 +141,7 @@ func (s *DBVMSource) ListVMs(ctx context.Context) ([]VM, error) { ProjectID: server.TenantID, CurrentHypervisor: server.OSEXTSRVATTRHost, AvailabilityZone: server.OSEXTAvailabilityZone, + CreatedAt: server.Created, Resources: resources, FlavorExtraSpecs: extraSpecs, }) @@ -208,6 +228,7 @@ func (s *DBVMSource) GetVM(ctx context.Context, vmUUID string) (*VM, error) { ProjectID: server.TenantID, CurrentHypervisor: server.OSEXTSRVATTRHost, AvailabilityZone: server.OSEXTAvailabilityZone, + CreatedAt: server.Created, Resources: resources, FlavorExtraSpecs: extraSpecs, }, nil @@ -308,6 +329,7 @@ func buildVMsFromHypervisors(hypervisorList *hv1.HypervisorList, postgresVMs []V ProjectID: pgVM.ProjectID, CurrentHypervisor: hv.Name, // Use hypervisor CRD location, not postgres AvailabilityZone: pgVM.AvailabilityZone, + CreatedAt: pgVM.CreatedAt, Resources: pgVM.Resources, FlavorExtraSpecs: pgVM.FlavorExtraSpecs, } @@ -397,6 +419,50 @@ func filterVMsOnKnownHypervisors(vms []VM, hypervisorList *hv1.HypervisorList) [ return result } +// IsServerActive returns true if the server exists in the servers table and is not DELETED. +// VMs in any other status (ACTIVE, SHUTOFF, MIGRATING, ERROR, etc.) still consume resources +// and should NOT be decremented from quota usage. +// Used by the quota controller to distinguish deleted VMs from migrated/existing ones. +func (s *DBVMSource) IsServerActive(ctx context.Context, vmUUID string) (bool, error) { + server, err := s.NovaReader.GetServerByID(ctx, vmUUID) + if err != nil { + return false, fmt.Errorf("failed to check server existence: %w", err) + } + if server == nil { + return false, nil + } + return server.Status != "DELETED", nil +} + +// GetDeletedVMInfo returns metadata about a deleted VM from the deleted_servers table, +// including resolved flavor resources. Returns nil, nil if the VM is not found in deleted_servers. +func (s *DBVMSource) GetDeletedVMInfo(ctx context.Context, vmUUID string) (*DeletedVMInfo, error) { + deletedServer, err := s.NovaReader.GetDeletedServerByID(ctx, vmUUID) + if err != nil { + return nil, fmt.Errorf("failed to get deleted server: %w", err) + } + if deletedServer == nil { + return nil, nil + } + + // Resolve the flavor to get RAM/VCPUs + flavor, err := s.NovaReader.GetFlavorByName(ctx, deletedServer.FlavorName) + if err != nil { + return nil, fmt.Errorf("failed to get flavor for deleted server: %w", err) + } + if flavor == nil { + return nil, fmt.Errorf("flavor %q not found for deleted server %s", deletedServer.FlavorName, vmUUID) + } + + return &DeletedVMInfo{ + ProjectID: deletedServer.TenantID, + AvailabilityZone: deletedServer.OSEXTAvailabilityZone, + FlavorName: deletedServer.FlavorName, + RAMMiB: flavor.RAM, + VCPUs: flavor.VCPUs, + }, nil +} + // warnUnknownVMsOnHypervisors logs a warning for VMs that are on hypervisors but not in the ListVMs (i.e. nova) result. // This can indicate a data sync issue between the hypervisor operator and the VM datasource. 
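+// One plausible cause: a freshly created VM is already reported on the hypervisor +// CRD while the postgres servers table has not yet been refreshed by the +// datasource poller.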
func warnUnknownVMsOnHypervisors(hypervisors *hv1.HypervisorList, vms []VM) { diff --git a/internal/scheduling/reservations/failover/vm_source_test.go b/internal/scheduling/reservations/failover/vm_source_test.go index 0b30af0e5..a710c5658 100644 --- a/internal/scheduling/reservations/failover/vm_source_test.go +++ b/internal/scheduling/reservations/failover/vm_source_test.go @@ -399,3 +399,7 @@ func (m *mockNovaReader) GetFlavorByName(ctx context.Context, flavorName string) } return nil, nil } + +func (m *mockNovaReader) GetDeletedServerByID(_ context.Context, _ string) (*nova.DeletedServer, error) { + return nil, nil +} diff --git a/internal/scheduling/reservations/quota/config.go b/internal/scheduling/reservations/quota/config.go new file mode 100644 index 000000000..b7314f595 --- /dev/null +++ b/internal/scheduling/reservations/quota/config.go @@ -0,0 +1,44 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// QuotaControllerConfig defines the configuration for the quota controller. +type QuotaControllerConfig struct { + // FullReconcileInterval is the periodic full reconcile interval. + // Full reconcile re-reads all VMs from Postgres and recomputes all usage. Default: 5m. + FullReconcileInterval metav1.Duration `json:"fullReconcileInterval"` + + // CRStateFilter defines which CommittedResource states to include + // when summing cr_actual_usage. Default: ["confirmed", "guaranteed"] + CRStateFilter []v1alpha1.CommitmentStatus `json:"crStateFilter"` +} + +// ApplyDefaults fills in any unset values with defaults. +func (c *QuotaControllerConfig) ApplyDefaults() { + defaults := DefaultQuotaControllerConfig() + if c.FullReconcileInterval.Duration == 0 { + c.FullReconcileInterval = defaults.FullReconcileInterval + } + if len(c.CRStateFilter) == 0 { + c.CRStateFilter = defaults.CRStateFilter + } +} + +// DefaultQuotaControllerConfig returns a default configuration. +func DefaultQuotaControllerConfig() QuotaControllerConfig { + return QuotaControllerConfig{ + FullReconcileInterval: metav1.Duration{Duration: 5 * time.Minute}, + CRStateFilter: []v1alpha1.CommitmentStatus{ + v1alpha1.CommitmentStatusConfirmed, + v1alpha1.CommitmentStatusGuaranteed, + }, + } +} diff --git a/internal/scheduling/reservations/quota/context.go b/internal/scheduling/reservations/quota/context.go new file mode 100644 index 000000000..8352a1934 --- /dev/null +++ b/internal/scheduling/reservations/quota/context.go @@ -0,0 +1,27 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + "github.com/google/uuid" +) + +// WithNewGlobalRequestID creates a new context with a quota-prefixed global request ID. +func WithNewGlobalRequestID(ctx context.Context) context.Context { + return reservations.WithGlobalRequestID(ctx, "quota-"+uuid.New().String()) +} + +// LoggerFromContext returns a logger with greq and req values from the context. +// This creates a child logger with the request tracking values pre-attached, +// so you don't need to repeat them in every log call. 
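+// Usage sketch (hypothetical call site; projectID is illustrative): +// +//	logger := LoggerFromContext(ctx) +//	logger.Info("reconciling", "project", projectID) +// +// Every line logged through this logger carries the greq and req key/value pairs.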
+func LoggerFromContext(ctx context.Context) logr.Logger { + return log.WithValues( + "greq", reservations.GlobalRequestIDFromContext(ctx), + "req", reservations.RequestIDFromContext(ctx), + ) +} diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go new file mode 100644 index 000000000..f00d040dc --- /dev/null +++ b/internal/scheduling/reservations/quota/controller.go @@ -0,0 +1,1047 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "fmt" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" +) + +var log = ctrl.Log.WithName("quota-controller").WithValues("module", "quota") + +// QuotaController manages quota usage tracking for projects. +// It provides three reconciliation modes: +// - Periodic full reconcile: recomputes all TotalUsage from Postgres +// - Incremental HV diff: delta-updates TotalUsage on HV instance changes +// - PaygUsage-only recompute: triggered by CR or ProjectQuota spec changes +type QuotaController struct { + client.Client + VMSource failover.VMSource + Config QuotaControllerConfig + Metrics *QuotaMetrics +} + +// NewQuotaController creates a new QuotaController. +func NewQuotaController( + c client.Client, + vmSource failover.VMSource, + config QuotaControllerConfig, + metrics *QuotaMetrics, +) *QuotaController { + + return &QuotaController{ + Client: c, + VMSource: vmSource, + Config: config, + Metrics: metrics, + } +} + +// ============================================================================ +// Periodic Full Reconciliation +// ============================================================================ + +// ReconcilePeriodic performs a full reconcile of all project quota usage. +// It reads all VMs from Postgres, computes TotalUsage per project/AZ/resource, +// then derives PaygUsage = TotalUsage - CRUsage for each existing ProjectQuota CRD. 
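+// Worked example (illustrative numbers): if a project's VMs sum to +// TotalUsage[hw_version_hana_v2_ram][az-1] = 10 units and its confirmed/guaranteed +// CRs account for CRUsage = 3 units, the derived PaygUsage is 10 - 3 = 7 units; +// PaygUsage is clamped at 0 whenever CRUsage exceeds TotalUsage.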
+func (c *QuotaController) ReconcilePeriodic(ctx context.Context) error { + ctx = WithNewGlobalRequestID(ctx) + startTime := time.Now() + logger := LoggerFromContext(ctx).WithValues("mode", "full-reconcile") + logger.Info("starting full quota reconcile") + + // Fetch flavor groups from Knowledge CRD + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build flavorName → flavorGroup lookup + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Fetch all VMs using VMSource (reads from Postgres via DBVMSource) + vms, err := c.VMSource.ListVMs(ctx) + if err != nil { + logger.Error(err, "failed to list VMs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list VMs: %w", err) + } + + // Compute totalUsage per project/AZ/resource + totalUsageByProject := c.computeTotalUsage(vms, flavorToGroup, flavorGroups) + + // List all existing ProjectQuota CRDs + var pqList v1alpha1.ProjectQuotaList + if err := c.List(ctx, &pqList); err != nil { + logger.Error(err, "failed to list ProjectQuota CRDs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list ProjectQuota CRDs: %w", err) + } + + // List all CommittedResource CRDs and pre-group by project ID + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CommittedResource CRDs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list CommittedResource CRDs: %w", err) + } + crsByProject := groupCRsByProject(crList.Items) + + // For each ProjectQuota CRD, write TotalUsage + PaygUsage + var updated, skipped int + for i := range pqList.Items { + pq := &pqList.Items[i] + projectID := pq.Spec.ProjectID + + // Get totalUsage for this project (may be empty if project has no VMs) + projectTotalUsage := totalUsageByProject[projectID] + + // Compute CRUsage for this project (using pre-grouped CRs) + crUsage := c.computeCRUsage(crsByProject[projectID], flavorGroups) + + // Derive PaygUsage + paygUsage := derivePaygUsage(projectTotalUsage, crUsage) + + // Write status with conflict retry (full reconcile sets LastFullReconcileAt) + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, projectTotalUsage, paygUsage, true); err != nil { + logger.Error(err, "failed to update ProjectQuota status", "project", projectID) + skipped++ + continue + } + + // Record metrics + c.recordUsageMetrics(projectID, projectTotalUsage, paygUsage, crUsage) + updated++ + } + + duration := time.Since(startTime) + c.Metrics.RecordReconcileDuration(duration.Seconds()) + c.Metrics.RecordReconcileResult(true) + logger.Info("full quota reconcile completed", + "duration", duration.Round(time.Millisecond), + "totalVMs", len(vms), + "projectQuotas", len(pqList.Items), + "updated", updated, + "skipped", skipped) + + return nil +} + +// ============================================================================ +// Watch-based Reconciliation (PaygUsage-only recompute) +// ============================================================================ + +// Reconcile handles watch-based reconciliation for a single ProjectQuota. +// Triggered by: CR Status.UsedAmount changes or ProjectQuota spec changes. 
+// +// Behavior depends on what changed: +// - Spec change (Generation > ObservedGeneration): recomputes TotalUsage from Postgres + PaygUsage +// - CR UsedAmount change (Generation == ObservedGeneration): reads persisted TotalUsage, recomputes PaygUsage only +func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("projectQuota", req.Name, "mode", "payg-recompute") + logger.V(1).Info("reconciling ProjectQuota") + + // Fetch the ProjectQuota + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, req.NamespacedName, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + logger.V(1).Info("ProjectQuota not found, likely deleted") + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + projectID := pq.Spec.ProjectID + ctx = reservations.WithRequestID(ctx, projectID) + + // Determine if this is a spec change (new CRD or quota update) vs. a CR UsedAmount change + specChanged := pq.Generation > pq.Status.ObservedGeneration + + var totalUsage map[string]v1alpha1.ResourceQuotaUsage + if specChanged { + // Spec changed (new CRD or quota update) — recompute TotalUsage from Postgres + logger.Info("spec changed, recomputing TotalUsage from Postgres", + "generation", pq.Generation, "observedGeneration", pq.Status.ObservedGeneration) + var err error + totalUsage, err = c.computeTotalUsageForProject(ctx, projectID) + if err != nil { + logger.Error(err, "failed to compute TotalUsage for project") + return ctrl.Result{}, err + } + } else { + // CR UsedAmount changed — read persisted TotalUsage, only recompute PaygUsage + totalUsage = pq.Status.TotalUsage + if totalUsage == nil { + // Safety fallback: TotalUsage should always be set after first spec reconcile + logger.Info("no TotalUsage persisted, computing as fallback") + var err error + totalUsage, err = c.computeTotalUsageForProject(ctx, projectID) + if err != nil { + logger.Error(err, "failed to compute TotalUsage for project") + return ctrl.Result{}, err + } + } + } + + // Fetch flavor groups for CRUsage computation + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups") + return ctrl.Result{}, err + } + + // List CRs for this project (from local cache) + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CommittedResource CRDs") + return ctrl.Result{}, err + } + crsByProject := groupCRsByProject(crList.Items) + + // Compute CRUsage + crUsage := c.computeCRUsage(crsByProject[projectID], flavorGroups) + + // Derive PaygUsage + paygUsage := derivePaygUsage(totalUsage, crUsage) + + // Write updated status with conflict retry + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage, specChanged); err != nil { + logger.Error(err, "failed to update ProjectQuota status") + return ctrl.Result{}, err + } + + // Record metrics + c.recordUsageMetrics(projectID, totalUsage, paygUsage, crUsage) + + logger.V(1).Info("reconcile completed", "project", projectID, "specChanged", specChanged) + return ctrl.Result{}, nil +} + +// computeTotalUsageForProject computes TotalUsage for a single project by reading +// all VMs from Postgres and filtering to the target project. Used as bootstrap when +// a ProjectQuota is first created and has no persisted TotalUsage yet. 
+func (c *QuotaController) computeTotalUsageForProject(ctx context.Context, projectID string) (map[string]v1alpha1.ResourceQuotaUsage, error) { + // Fetch flavor groups from Knowledge CRD + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + return nil, fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build flavorName → flavorGroup lookup + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Fetch all VMs and compute usage (only the target project's data will be used) + vms, err := c.VMSource.ListVMs(ctx) + if err != nil { + return nil, fmt.Errorf("failed to list VMs: %w", err) + } + + // Compute totalUsage for all projects and return just this one + totalUsageByProject := c.computeTotalUsage(vms, flavorToGroup, flavorGroups) + return totalUsageByProject[projectID], nil +} + +// ============================================================================ +// Incremental Update (HV Instance Diff) +// ============================================================================ + +// usageDelta tracks resource deltas for a single project during incremental reconciliation. +type usageDelta struct { + // increments[resourceName][az] = amount to add + increments map[string]map[string]int64 + // decrements[resourceName][az] = amount to subtract + decrements map[string]map[string]int64 +} + +func newUsageDelta() *usageDelta { + return &usageDelta{ + increments: make(map[string]map[string]int64), + decrements: make(map[string]map[string]int64), + } +} + +func (d *usageDelta) addIncrement(resourceName, az string, amount int64) { + if d.increments[resourceName] == nil { + d.increments[resourceName] = make(map[string]int64) + } + d.increments[resourceName][az] += amount +} + +func (d *usageDelta) addDecrement(resourceName, az string, amount int64) { + if d.decrements[resourceName] == nil { + d.decrements[resourceName] = make(map[string]int64) + } + d.decrements[resourceName][az] += amount +} + +// ReconcileHVDiff handles incremental updates when HV instance lists change. +// It diffs old vs new instances to delta-update TotalUsage for affected projects. +// Deltas are batched per project and applied in a single status update per project +// to avoid race conditions from multiple updates. 
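+// Example diff (hypothetical instance IDs): old active instances {a, b} and new +// active instances {b, c} yield added = {c} and removed = {a}; b is untouched, and +// inactive instances are ignored on both sides of the diff.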
+func (c *QuotaController) ReconcileHVDiff(ctx context.Context, oldHV, newHV *hv1.Hypervisor) error { + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("hypervisor", newHV.Name, "mode", "incremental") + + // Diff old vs new instances + oldInstances := make(map[string]bool) + for _, inst := range oldHV.Status.Instances { + if inst.Active { + oldInstances[inst.ID] = true + } + } + newInstances := make(map[string]bool) + for _, inst := range newHV.Status.Instances { + if inst.Active { + newInstances[inst.ID] = true + } + } + + // Find added and removed UUIDs + var added, removed []string + for id := range newInstances { + if !oldInstances[id] { + added = append(added, id) + } + } + for id := range oldInstances { + if !newInstances[id] { + removed = append(removed, id) + } + } + + if len(added) == 0 && len(removed) == 0 { + return nil + } + + logger.V(1).Info("HV instance diff detected", "added", len(added), "removed", len(removed)) + + // Get flavor groups for mapping + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups for incremental update") + return err + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Accumulate deltas per project (batched to avoid per-VM persist race) + projectDeltas := make(map[string]*usageDelta) + + // Process added instances + for _, vmUUID := range added { + c.accumulateAddedVM(ctx, vmUUID, flavorToGroup, flavorGroups, projectDeltas) + } + + // Process removed instances + for _, vmUUID := range removed { + c.accumulateRemovedVM(ctx, vmUUID, flavorToGroup, flavorGroups, projectDeltas) + } + + // Apply batched deltas and recompute PaygUsage for affected projects + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CRs for PaygUsage recompute") + return err + } + crsByProject := groupCRsByProject(crList.Items) + + for projectID, delta := range projectDeltas { + if err := c.applyDeltaAndUpdateStatus(ctx, projectID, delta, crsByProject[projectID], flavorGroups); err != nil { + logger.Error(err, "failed to apply delta for project", "project", projectID) + // Continue with other projects + } + } + + return nil +} + +// accumulateAddedVM looks up a VM and accumulates its resource contribution as a delta. +// It checks whether the VM is truly new (created after last full reconcile) vs a migration +// (already counted in TotalUsage). Only new VMs get incremented. +func (c *QuotaController) accumulateAddedVM( + ctx context.Context, + vmUUID string, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, + projectDeltas map[string]*usageDelta, +) { + + logger := LoggerFromContext(ctx).WithValues("vmUUID", vmUUID) + + vm, err := c.VMSource.GetVM(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to get VM for increment") + return + } + if vm == nil { + return // VM not found in DB, skip + } + + // Check if this VM was already counted in the last full reconcile. + // If the VM was created BEFORE the last full reconcile, it's a migration + // (already in TotalUsage) and we should NOT increment again. 
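+ // Timeline sketch (illustrative times): with LastFullReconcileAt = 10:00, a VM + // created at 10:05 is new and gets incremented here, while a VM created at + // 09:55 was already counted by the 10:00 full reconcile and must be skipped.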
+ if !c.isVMNewSinceLastReconcile(ctx, vm) { + logger.V(1).Info("VM already counted (created before last reconcile), skipping increment", + "vmCreatedAt", vm.CreatedAt, "project", vm.ProjectID) + return + } + + groupName, ok := flavorToGroup[vm.FlavorName] + if !ok { + return // Flavor not in any group + } + fg, ok := flavorGroups[groupName] + if !ok { + return + } + + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range + if unitSizeMiB == 0 { + return + } + + ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + + delta := projectDeltas[vm.ProjectID] + if delta == nil { + delta = newUsageDelta() + projectDeltas[vm.ProjectID] = delta + } + + delta.addIncrement(commitments.ResourceNameRAM(groupName), vm.AvailabilityZone, ramUnits) + delta.addIncrement(commitments.ResourceNameCores(groupName), vm.AvailabilityZone, coresAmount) +} + +// isVMNewSinceLastReconcile checks if a VM was created after the last full reconcile. +// Returns true if the VM is new and should be incrementally added to TotalUsage. +// Returns false if the VM already existed at the last full reconcile (migration, not new). +// +// NOTE: There is a known timing gap -- the postgres servers table is only refreshed every +// N minutes by the datasource poller. A VM that was created shortly BEFORE the last reconcile +// might not have been visible in postgres yet (sync delay), so the full reconcile may have +// missed it. In that case we would also skip the increment here (CreatedAt <= LastReconcileAt) +// and the VM would only be counted on the NEXT full reconcile cycle. This is acceptable for +// now and will be resolved when we move to a CRD-based VM source with real-time events. +func (c *QuotaController) isVMNewSinceLastReconcile(ctx context.Context, vm *failover.VM) bool { + if vm.CreatedAt == "" { + // No creation time available -- be conservative, skip increment. + // The next full reconcile will pick it up. + return false + } + + // Look up the ProjectQuota for this VM's project + crdName := "quota-" + vm.ProjectID + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + // If we can't find the ProjectQuota, skip (full reconcile will handle it) + return false + } + + if pq.Status.LastFullReconcileAt == nil { + // No full reconcile has run yet -- skip incremental updates + return false + } + + // Parse the VM's creation time and compare with last FULL reconcile + vmCreatedAt, err := time.Parse("2006-01-02T15:04:05Z", vm.CreatedAt) + if err != nil { + // Try alternative format with timezone offset + vmCreatedAt, err = time.Parse(time.RFC3339, vm.CreatedAt) + if err != nil { + // Cannot parse -- be conservative, skip + return false + } + } + + return vmCreatedAt.After(pq.Status.LastFullReconcileAt.Time) +} + +// accumulateRemovedVM looks up a deleted VM and accumulates its resource contribution as a decrement. 
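+// Decision sketch: if the server is still active in the servers table, the removal +// was a migration and nothing is decremented; if it appears in deleted_servers, its +// resolved flavor resources are decremented; if it is in neither table, the removal +// is skipped and the next full reconcile corrects the usage.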
+func (c *QuotaController) accumulateRemovedVM( + ctx context.Context, + vmUUID string, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, + projectDeltas map[string]*usageDelta, +) { + + logger := LoggerFromContext(ctx).WithValues("vmUUID", vmUUID) + + // Check if the VM still exists in the servers table (migrated away = still running) + active, err := c.VMSource.IsServerActive(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to check server for decrement") + return + } + if active { + // VM still exists (either ACTIVE on another HV, or in non-ACTIVE state). + // Don't decrement — the full reconcile handles these correctly. + return + } + + // Not found in servers table — check deleted_servers + info, err := c.VMSource.GetDeletedVMInfo(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to get deleted VM info for decrement") + return + } + if info == nil { + // Not found anywhere — cannot determine what to decrement + logger.V(1).Info("removed VM not found in servers or deleted_servers") + return + } + + groupName, ok := flavorToGroup[info.FlavorName] + if !ok { + return // Flavor not in any group + } + fg, ok := flavorGroups[groupName] + if !ok { + return + } + + // Compute commitment units from the resolved flavor resources + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range + if unitSizeMiB == 0 { + return + } + + ramUnits := int64(info.RAMMiB) / unitSizeMiB //nolint:gosec // safe + coresAmount := int64(info.VCPUs) //nolint:gosec // safe + + delta := projectDeltas[info.ProjectID] + if delta == nil { + delta = newUsageDelta() + projectDeltas[info.ProjectID] = delta + } + + delta.addDecrement(commitments.ResourceNameRAM(groupName), info.AvailabilityZone, ramUnits) + delta.addDecrement(commitments.ResourceNameCores(groupName), info.AvailabilityZone, coresAmount) +} + +// applyDeltaAndUpdateStatus fetches the ProjectQuota, applies the batched delta to TotalUsage, +// recomputes PaygUsage, and persists with conflict retry. 
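+// The ProjectQuota is re-fetched inside each retry attempt so the delta is applied +// to the latest TotalUsage; re-using a stale copy after a conflict could silently +// drop concurrent updates from the full or watch-based reconcilers.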
+func (c *QuotaController) applyDeltaAndUpdateStatus( + ctx context.Context, + projectID string, + delta *usageDelta, + projectCRs []v1alpha1.CommittedResource, + flavorGroups map[string]compute.FlavorGroupFeature, +) error { + + crdName := "quota-" + projectID + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Re-fetch fresh copy on each retry + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + return nil // PQ deleted, nothing to do + } + return err + } + + if pq.Status.TotalUsage == nil { + pq.Status.TotalUsage = make(map[string]v1alpha1.ResourceQuotaUsage) + } + + // Apply increments + for resourceName, azAmounts := range delta.increments { + for az, amount := range azAmounts { + incrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + } + } + + // Apply decrements + for resourceName, azAmounts := range delta.decrements { + for az, amount := range azAmounts { + decrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + } + } + + // Recompute PaygUsage + crUsage := c.computeCRUsage(projectCRs, flavorGroups) + paygUsage := derivePaygUsage(pq.Status.TotalUsage, crUsage) + + pq.Status.PaygUsage = paygUsage + now := metav1.Now() + pq.Status.LastReconcileAt = &now + + if err := c.Status().Update(ctx, &pq); err != nil { + return err + } + + c.recordUsageMetrics(projectID, pq.Status.TotalUsage, paygUsage, crUsage) + return nil + }) +} + +// ============================================================================ +// Manager Setup +// ============================================================================ + +// SetupWithManager sets up the watch-based reconciler for PaygUsage recomputes. +func (c *QuotaController) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("quota-controller"). + // Watch ProjectQuota for spec/generation changes only (ignore status-only updates + // to avoid infinite reconcile loops since Reconcile() updates status). + For(&v1alpha1.ProjectQuota{}, builder.WithPredicates(projectQuotaGenerationChangePredicate())). + // Watch CommittedResource for status changes (UsedAmount updates) + Watches( + &v1alpha1.CommittedResource{}, + handler.EnqueueRequestsFromMapFunc(c.mapCRToProjectQuota), + builder.WithPredicates(crUsedAmountChangePredicate()), + ). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 1, + }). + Complete(c) +} + +// SetupHVWatcher sets up a separate controller to watch HV CRD changes +// for incremental TotalUsage updates. +func (c *QuotaController) SetupHVWatcher(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("quota-hv-watcher"). + WatchesRawSource(source.Kind( + mgr.GetCache(), + &hv1.Hypervisor{}, + &hvInstanceDiffHandler{controller: c}, + hvInstanceChangePredicate(), + )). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 1, + }). + Complete(reconcile.Func(func(_ context.Context, _ ctrl.Request) (ctrl.Result, error) { + // The actual work is done in the event handler + return ctrl.Result{}, nil + })) +} + +// Start implements manager.Runnable for the periodic reconciliation loop. +// It does not block manager startup — the first reconcile fires after a short +// initial delay to allow cache sync. 
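+// Schedule sketch with the default config: the first full reconcile fires at t=5s +// after startup, and each subsequent run starts FullReconcileInterval (default 5m) +// after the previous run finishes, because the timer is only reset once +// ReconcilePeriodic returns.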
+func (c *QuotaController) Start(ctx context.Context) error { + log.Info("starting quota controller (periodic)", + "fullReconcileInterval", c.Config.FullReconcileInterval.Duration, + "crStateFilter", c.Config.CRStateFilter) + + // Use a short initial delay to allow cache sync before first reconcile + initialDelay := 5 * time.Second + timer := time.NewTimer(initialDelay) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + log.Info("stopping quota controller") + return nil + case <-timer.C: + if err := c.ReconcilePeriodic(ctx); err != nil { + log.Error(err, "periodic full reconcile failed") + } + timer.Reset(c.Config.FullReconcileInterval.Duration) + } + } +} + +// ============================================================================ +// Internal Helpers +// ============================================================================ + +// computeTotalUsage aggregates VM resources by project/AZ/resource. +// +// The RAM calculation converts server RAM into LIQUID commitment units: +// - Each flavor group has a "smallest flavor" defining the unit size (e.g., 32768 MiB) +// - A VM's RAM usage in units = VM_RAM_MiB / unit_size_MiB +// - Example: a 64 GiB VM in a group with 32 GiB smallest flavor = 2 units +// +// This matches the unit system used by LIQUID for commitment tracking. +// The per-AZ breakdown allows Limes to enforce AZ-level quota limits. +func (c *QuotaController) computeTotalUsage( + vms []failover.VM, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, +) map[string]map[string]v1alpha1.ResourceQuotaUsage { + // result[projectID][resourceName] = ResourceQuotaUsage{PerAZ: {az: amount}} + result := make(map[string]map[string]v1alpha1.ResourceQuotaUsage) + + for _, vm := range vms { + groupName, ok := flavorToGroup[vm.FlavorName] + if !ok { + continue // Flavor not in any tracked group + } + fg, ok := flavorGroups[groupName] + if !ok { + continue + } + if fg.SmallestFlavor.MemoryMB == 0 { + continue // Invalid group config + } + + ramResourceName := commitments.ResourceNameRAM(groupName) + coresResourceName := commitments.ResourceNameCores(groupName) + + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // safe + ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + + if _, ok := result[vm.ProjectID]; !ok { + result[vm.ProjectID] = make(map[string]v1alpha1.ResourceQuotaUsage) + } + + // Accumulate RAM usage for this project + AZ + ramUsage := result[vm.ProjectID][ramResourceName] + if ramUsage.PerAZ == nil { + ramUsage.PerAZ = make(map[string]int64) + } + ramUsage.PerAZ[vm.AvailabilityZone] += ramUnits + result[vm.ProjectID][ramResourceName] = ramUsage + + // Accumulate cores usage for this project + AZ + coresUsage := result[vm.ProjectID][coresResourceName] + if coresUsage.PerAZ == nil { + coresUsage.PerAZ = make(map[string]int64) + } + coresUsage.PerAZ[vm.AvailabilityZone] += coresAmount + result[vm.ProjectID][coresResourceName] = coresUsage + } + + return result +} + +// groupCRsByProject groups CommittedResources by project ID for efficient lookup. +func groupCRsByProject(crs []v1alpha1.CommittedResource) map[string][]v1alpha1.CommittedResource { + result := make(map[string][]v1alpha1.CommittedResource) + for i := range crs { + projectID := crs[i].Spec.ProjectID + result[projectID] = append(result[projectID], crs[i]) + } + return result +} + +// computeCRUsage computes the committed resource usage from a pre-filtered slice of CRs for one project. 
+// It reads UsedResources from each CR's status and converts to commitment units (multiples for RAM, raw for cores). +func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavorGroups map[string]compute.FlavorGroupFeature) map[string]v1alpha1.ResourceQuotaUsage { + result := make(map[string]v1alpha1.ResourceQuotaUsage) + + for i := range crs { + cr := &crs[i] + + // Prefer AcceptedSpec (last successful reconcile snapshot) over Spec + // to avoid mis-bucketing during spec transitions. + spec := &cr.Spec + if cr.Status.AcceptedSpec != nil { + spec = cr.Status.AcceptedSpec + } + + // Filter: only matching states + if !c.isCRStateIncluded(spec.State) { + continue + } + + // Get used amount from UsedResources map + if len(cr.Status.UsedResources) == 0 { + continue + } + + // Map ResourceType to resource name and extract used amount + var resourceName string + var usedAmount int64 + switch spec.ResourceType { + case v1alpha1.CommittedResourceTypeMemory: + resourceName = commitments.ResourceNameRAM(spec.FlavorGroupName) + memQty, ok := cr.Status.UsedResources["memory"] + if !ok { + continue + } + // Convert bytes to commitment units (multiples of smallest flavor) + usedBytes := memQty.Value() + fg, ok := flavorGroups[spec.FlavorGroupName] + if !ok || fg.SmallestFlavor.MemoryMB == 0 { + continue + } + unitSizeBytes := int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // safe + usedAmount = usedBytes / unitSizeBytes + case v1alpha1.CommittedResourceTypeCores: + resourceName = commitments.ResourceNameCores(spec.FlavorGroupName) + cpuQty, ok := cr.Status.UsedResources["cpu"] + if !ok { + continue + } + usedAmount = cpuQty.Value() + default: + continue + } + + if usedAmount <= 0 { + continue + } + + // Accumulate per AZ + usage := result[resourceName] + if usage.PerAZ == nil { + usage.PerAZ = make(map[string]int64) + } + usage.PerAZ[spec.AvailabilityZone] += usedAmount + result[resourceName] = usage + } + + return result +} + +// isCRStateIncluded checks if a commitment state is in the configured filter. +func (c *QuotaController) isCRStateIncluded(state v1alpha1.CommitmentStatus) bool { + for _, s := range c.Config.CRStateFilter { + if s == state { + return true + } + } + return false +} + +// derivePaygUsage computes PaygUsage = TotalUsage - CRUsage (clamped >= 0). +func derivePaygUsage( + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + crUsage map[string]v1alpha1.ResourceQuotaUsage, +) map[string]v1alpha1.ResourceQuotaUsage { + + result := make(map[string]v1alpha1.ResourceQuotaUsage) + + for resourceName, total := range totalUsage { + payg := v1alpha1.ResourceQuotaUsage{ + PerAZ: make(map[string]int64), + } + for az, totalAmount := range total.PerAZ { + crAmount := int64(0) + if cr, ok := crUsage[resourceName]; ok { + if azAmount, ok := cr.PerAZ[az]; ok { + crAmount = azAmount + } + } + paygAmount := totalAmount - crAmount + if paygAmount < 0 { + paygAmount = 0 // Clamp >= 0 + } + payg.PerAZ[az] = paygAmount + } + result[resourceName] = payg + } + + return result +} + +// updateProjectQuotaStatusWithRetry writes TotalUsage, PaygUsage, ObservedGeneration, +// and LastReconcileAt with retry-on-conflict to handle concurrent updates. +// If fullReconcile is true, it additionally sets LastFullReconcileAt. 
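+// LastFullReconcileAt is the timestamp isVMNewSinceLastReconcile compares VM +// creation times against; it advances only when TotalUsage was freshly recomputed +// from Postgres (periodic runs and spec-changed watch reconciles), never on +// PaygUsage-only updates.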
+func (c *QuotaController) updateProjectQuotaStatusWithRetry( + ctx context.Context, + pqName string, + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + paygUsage map[string]v1alpha1.ResourceQuotaUsage, + fullReconcile bool, +) error { + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Re-fetch fresh copy on each retry + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: pqName}, &pq); err != nil { + return err + } + + pq.Status.TotalUsage = totalUsage + pq.Status.PaygUsage = paygUsage + pq.Status.ObservedGeneration = pq.Generation + now := metav1.Now() + pq.Status.LastReconcileAt = &now + if fullReconcile { + pq.Status.LastFullReconcileAt = &now + } + return c.Status().Update(ctx, &pq) + }) +} + +// vmResourceUnits computes RAM commitment units and cores from a VM's resources. +// RAM is converted from bytes (resource.Quantity) to MiB, then divided by unitSizeMiB +// (the smallest flavor's memory in MiB for the flavor group) to get commitment units. +func vmResourceUnits(resources map[string]resource.Quantity, unitSizeMiB int64) (ramUnits, cores int64) { + memQty := resources["memory"] + serverRAMMiB := memQty.Value() / (1024 * 1024) // bytes to MiB + ramUnits = serverRAMMiB / unitSizeMiB // commitment units + vcpuQty := resources["vcpus"] + cores = vcpuQty.Value() + return ramUnits, cores +} + +// buildFlavorToGroupMap builds a flavorName → flavorGroupName lookup from flavor groups. +func buildFlavorToGroupMap(flavorGroups map[string]compute.FlavorGroupFeature) map[string]string { + result := make(map[string]string) + for groupName, group := range flavorGroups { + for _, flavor := range group.Flavors { + result[flavor.Name] = groupName + } + } + return result +} + +// incrementUsage increments a usage value in the map. +func incrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { + u := usage[resourceName] + if u.PerAZ == nil { + u.PerAZ = make(map[string]int64) + } + u.PerAZ[az] += amount + usage[resourceName] = u +} + +// decrementUsage decrements a usage value in the map (clamp >= 0). +func decrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { + u := usage[resourceName] + if u.PerAZ == nil { + return + } + u.PerAZ[az] -= amount + if u.PerAZ[az] < 0 { + u.PerAZ[az] = 0 + } + usage[resourceName] = u +} + +// recordUsageMetrics emits Prometheus metrics for all resources in a project. +func (c *QuotaController) recordUsageMetrics( + projectID string, + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + paygUsage map[string]v1alpha1.ResourceQuotaUsage, + crUsage map[string]v1alpha1.ResourceQuotaUsage, +) { + + for resourceName, total := range totalUsage { + for az, totalAmount := range total.PerAZ { + paygAmount := int64(0) + if payg, ok := paygUsage[resourceName]; ok { + paygAmount = payg.PerAZ[az] + } + crAmount := int64(0) + if cr, ok := crUsage[resourceName]; ok { + crAmount = cr.PerAZ[az] + } + c.Metrics.RecordUsage(projectID, az, resourceName, totalAmount, paygAmount, crAmount) + } + } +} + +// ============================================================================ +// Predicates & Event Handlers +// ============================================================================ + +// mapCRToProjectQuota maps a CommittedResource change to the affected ProjectQuota reconcile request. 
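+// Example: a CommittedResource with Spec.ProjectID "project-a" enqueues a +// reconcile request for the ProjectQuota named "quota-project-a".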
+func (c *QuotaController) mapCRToProjectQuota(_ context.Context, obj client.Object) []reconcile.Request { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok { + return nil + } + // Map to the ProjectQuota for this project + crdName := "quota-" + cr.Spec.ProjectID + return []reconcile.Request{ + {NamespacedName: client.ObjectKey{Name: crdName}}, + } +} + +// crUsedAmountChangePredicate triggers only when Status.UsedResources changes on a CommittedResource. +func crUsedAmountChangePredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(_ event.CreateEvent) bool { return false }, + UpdateFunc: func(e event.UpdateEvent) bool { + oldCR, ok1 := e.ObjectOld.(*v1alpha1.CommittedResource) + newCR, ok2 := e.ObjectNew.(*v1alpha1.CommittedResource) + if !ok1 || !ok2 { + return false + } + // Trigger if UsedResources changed + if len(oldCR.Status.UsedResources) != len(newCR.Status.UsedResources) { + return true + } + for key, oldQty := range oldCR.Status.UsedResources { + newQty, ok := newCR.Status.UsedResources[key] + if !ok || oldQty.Cmp(newQty) != 0 { + return true + } + } + return false + }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return false }, + } +} + +// projectQuotaGenerationChangePredicate triggers only when the ProjectQuota's generation changes +// (i.e., spec was modified). This prevents infinite reconcile loops from status-only updates. +func projectQuotaGenerationChangePredicate() predicate.Predicate { + return predicate.GenerationChangedPredicate{} +} + +// hvInstanceChangePredicate always returns true for updates. +// ReconcileHVDiff performs its own set-diff and exits early if there are no +// actual additions/removals. This ensures instance swaps (same count, different IDs) +// are not missed. +func hvInstanceChangePredicate() predicate.TypedPredicate[*hv1.Hypervisor] { + return predicate.TypedFuncs[*hv1.Hypervisor]{ + CreateFunc: func(_ event.TypedCreateEvent[*hv1.Hypervisor]) bool { return true }, + UpdateFunc: func(_ event.TypedUpdateEvent[*hv1.Hypervisor]) bool { + return true + }, + DeleteFunc: func(_ event.TypedDeleteEvent[*hv1.Hypervisor]) bool { return true }, + GenericFunc: func(_ event.TypedGenericEvent[*hv1.Hypervisor]) bool { return false }, + } +} + +// hvInstanceDiffHandler handles HV instance diff events by calling ReconcileHVDiff. 
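+// It never enqueues reconcile requests; the diff work runs synchronously inside +// Update, and the paired reconcile.Func registered in SetupHVWatcher is a no-op.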
+type hvInstanceDiffHandler struct { + controller *QuotaController +} + +func (h *hvInstanceDiffHandler) Create(_ context.Context, _ event.TypedCreateEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + // On create, no diff needed (full reconcile will catch up) +} + +func (h *hvInstanceDiffHandler) Update(ctx context.Context, e event.TypedUpdateEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + if err := h.controller.ReconcileHVDiff(ctx, e.ObjectOld, e.ObjectNew); err != nil { + log.Error(err, "failed to process HV instance diff", "hypervisor", e.ObjectNew.Name) + } +} + +func (h *hvInstanceDiffHandler) Delete(_ context.Context, _ event.TypedDeleteEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + // On delete, full reconcile will correct +} + +func (h *hvInstanceDiffHandler) Generic(_ context.Context, _ event.TypedGenericEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) { + // No-op +} diff --git a/internal/scheduling/reservations/quota/controller_test.go b/internal/scheduling/reservations/quota/controller_test.go new file mode 100644 index 000000000..d503b363f --- /dev/null +++ b/internal/scheduling/reservations/quota/controller_test.go @@ -0,0 +1,604 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestComputeTotalUsage(t *testing.T) { + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small", MemoryMB: 32768}, + {Name: "m1.hana_v2.large", MemoryMB: 65536}, + }, + }, + "general": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 4096}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small", MemoryMB: 4096}, + }, + }, + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + vms := []failover.VM{ + { + UUID: "vm-1", + FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB in bytes + "vcpus": resource.MustParse("8"), + }, + }, + { + UUID: "vm-2", + FlavorName: "m1.hana_v2.large", + ProjectID: "project-a", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("68719476736"), // 65536 MiB in bytes + "vcpus": resource.MustParse("16"), + }, + }, + { + UUID: "vm-3", + FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", + AvailabilityZone: "az-2", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + { + UUID: "vm-4", + FlavorName: "m1.general.small", + ProjectID: "project-b", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB in bytes + "vcpus": 
resource.MustParse("2"), + }, + }, + { + UUID: "vm-5", + FlavorName: "unknown-flavor", + ProjectID: "project-c", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, + } + + result := ctrl.computeTotalUsage(vms, flavorToGroup, flavorGroups) + + // project-a: hana_v2 in az-1: 32768+65536 = 98304 MiB / 32768 = 3 units RAM, 8+16=24 cores + // project-a: hana_v2 in az-2: 32768 MiB / 32768 = 1 unit RAM, 8 cores + projectA := result["project-a"] + if projectA == nil { + t.Fatal("expected project-a in results") + } + + ramUsage := projectA["hw_version_hana_v2_ram"] + if ramUsage.PerAZ["az-1"] != 3 { + t.Errorf("expected project-a az-1 hana_v2_ram = 3, got %d", ramUsage.PerAZ["az-1"]) + } + if ramUsage.PerAZ["az-2"] != 1 { + t.Errorf("expected project-a az-2 hana_v2_ram = 1, got %d", ramUsage.PerAZ["az-2"]) + } + + coresUsage := projectA["hw_version_hana_v2_cores"] + if coresUsage.PerAZ["az-1"] != 24 { + t.Errorf("expected project-a az-1 hana_v2_cores = 24, got %d", coresUsage.PerAZ["az-1"]) + } + if coresUsage.PerAZ["az-2"] != 8 { + t.Errorf("expected project-a az-2 hana_v2_cores = 8, got %d", coresUsage.PerAZ["az-2"]) + } + + // project-b: general in az-1: 4096/4096=1 unit RAM, 2 cores + projectB := result["project-b"] + if projectB == nil { + t.Fatal("expected project-b in results") + } + if projectB["hw_version_general_ram"].PerAZ["az-1"] != 1 { + t.Errorf("expected project-b az-1 general_ram = 1, got %d", projectB["hw_version_general_ram"].PerAZ["az-1"]) + } + if projectB["hw_version_general_cores"].PerAZ["az-1"] != 2 { + t.Errorf("expected project-b az-1 general_cores = 2, got %d", projectB["hw_version_general_cores"].PerAZ["az-1"]) + } + + // project-c: unknown flavor → not in results + if _, exists := result["project-c"]; exists { + t.Error("expected project-c to NOT be in results (unknown flavor)") + } +} + +func TestComputeCRUsage(t *testing.T) { + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + // Flavor groups with SmallestFlavor.MemoryMB = 1 for simple unit conversion in tests + // (1 multiple = 1 MiB = 1048576 bytes) + testFlavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 1}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 1}}, + }, + } + + allCRs := []v1alpha1.CommittedResource{ + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusConfirmed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Mi")}, + }, + }, + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusGuaranteed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("3Mi")}, + }, + }, + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeCores, + State: v1alpha1.CommitmentStatusConfirmed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedResources: map[string]resource.Quantity{"cpu": 
resource.MustParse("2")}, + }, + }, + // Different project — should be excluded by groupCRsByProject + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-b", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusConfirmed, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Mi")}, + }, + }, + // Pending state — should be excluded by state filter + { + Spec: v1alpha1.CommittedResourceSpec{ + ProjectID: "project-a", + FlavorGroupName: "hana_v2", + AvailabilityZone: "az-2", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + State: v1alpha1.CommitmentStatusPending, + }, + Status: v1alpha1.CommittedResourceStatus{ + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("2Mi")}, + }, + }, + } + + // Pre-group and pass only project-a's CRs + crsByProject := groupCRsByProject(allCRs) + result := ctrl.computeCRUsage(crsByProject["project-a"], testFlavorGroups) + + // Should include confirmed + guaranteed for project-a only + ramUsage := result["hw_version_hana_v2_ram"] + if ramUsage.PerAZ["az-1"] != 8 { // 5 + 3 + t.Errorf("expected cr ram usage az-1 = 8, got %d", ramUsage.PerAZ["az-1"]) + } + + coresUsage := result["hw_version_hana_v2_cores"] + if coresUsage.PerAZ["az-1"] != 2 { + t.Errorf("expected cr cores usage az-1 = 2, got %d", coresUsage.PerAZ["az-1"]) + } + + // az-2 should NOT be included (pending state) + if ramUsage.PerAZ["az-2"] != 0 { + t.Errorf("expected cr ram usage az-2 = 0 (pending excluded), got %d", ramUsage.PerAZ["az-2"]) + } +} + +func TestDerivePaygUsage(t *testing.T) { + tests := []struct { + name string + totalUsage map[string]v1alpha1.ResourceQuotaUsage + crUsage map[string]v1alpha1.ResourceQuotaUsage + expected map[string]map[string]int64 // resourceName -> az -> amount + }{ + { + name: "basic subtraction", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10, "az-2": 5}}, + }, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3}}, + }, + expected: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 7, "az-2": 5}, + }, + }, + { + name: "clamp to zero", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2}}, + }, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10}}, + }, + expected: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 0}, + }, + }, + { + name: "no CR usage", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}}, + }, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{}, + expected: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 5}, + }, + }, + { + name: "empty total usage", + totalUsage: map[string]v1alpha1.ResourceQuotaUsage{}, + crUsage: map[string]v1alpha1.ResourceQuotaUsage{ + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}}, + }, + expected: map[string]map[string]int64{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := derivePaygUsage(tt.totalUsage, tt.crUsage) + + for resourceName, expectedAZ := range tt.expected { + resUsage, ok := result[resourceName] + if !ok { + t.Errorf("expected resource %s in result", resourceName) + continue + } + 
for az, expectedAmount := range expectedAZ { + if resUsage.PerAZ[az] != expectedAmount { + t.Errorf("resource=%s az=%s: expected %d, got %d", + resourceName, az, expectedAmount, resUsage.PerAZ[az]) + } + } + } + + // Check no extra resources in result + for resourceName := range result { + if _, ok := tt.expected[resourceName]; !ok { + t.Errorf("unexpected resource %s in result", resourceName) + } + } + }) + } +} + +func TestBuildFlavorToGroupMap(t *testing.T) { + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small"}, + {Name: "m1.hana_v2.large"}, + }, + }, + "general": { + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small"}, + }, + }, + } + + result := buildFlavorToGroupMap(flavorGroups) + + if result["m1.hana_v2.small"] != "hana_v2" { + t.Errorf("expected hana_v2 for m1.hana_v2.small, got %s", result["m1.hana_v2.small"]) + } + if result["m1.hana_v2.large"] != "hana_v2" { + t.Errorf("expected hana_v2 for m1.hana_v2.large, got %s", result["m1.hana_v2.large"]) + } + if result["m1.general.small"] != "general" { + t.Errorf("expected general for m1.general.small, got %s", result["m1.general.small"]) + } + if _, exists := result["unknown"]; exists { + t.Error("expected unknown flavor not to be in map") + } +} + +func TestIncrementDecrementUsage(t *testing.T) { + usage := make(map[string]v1alpha1.ResourceQuotaUsage) + + // Increment from empty + incrementUsage(usage, "res1", "az-1", 5) + if usage["res1"].PerAZ["az-1"] != 5 { + t.Errorf("expected 5 after increment, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Increment again + incrementUsage(usage, "res1", "az-1", 3) + if usage["res1"].PerAZ["az-1"] != 8 { + t.Errorf("expected 8 after second increment, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Decrement + decrementUsage(usage, "res1", "az-1", 2) + if usage["res1"].PerAZ["az-1"] != 6 { + t.Errorf("expected 6 after decrement, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Decrement below zero → clamp to 0 + decrementUsage(usage, "res1", "az-1", 100) + if usage["res1"].PerAZ["az-1"] != 0 { + t.Errorf("expected 0 after over-decrement, got %d", usage["res1"].PerAZ["az-1"]) + } + + // Decrement non-existent resource (no-op) + decrementUsage(usage, "res2", "az-1", 5) + // Should not panic, and res2 should not exist + if _, exists := usage["res2"]; exists { + if usage["res2"].PerAZ != nil { + t.Error("expected res2 to not have PerAZ after decrement on non-existent") + } + } +} + +func TestIsCRStateIncluded(t *testing.T) { + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + if !ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusConfirmed) { + t.Error("expected confirmed to be included") + } + if !ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusGuaranteed) { + t.Error("expected guaranteed to be included") + } + if ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusPending) { + t.Error("expected pending to NOT be included") + } +} + +func TestGroupCRsByProject(t *testing.T) { + crs := []v1alpha1.CommittedResource{ + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p1"}}, + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p2"}}, + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p1"}}, + {Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p3"}}, + } + + grouped := groupCRsByProject(crs) + + if len(grouped["p1"]) != 2 { + t.Errorf("expected 2 CRs for p1, got %d", len(grouped["p1"])) + } + if len(grouped["p2"]) != 1 { + t.Errorf("expected 1 CR for p2, got %d", len(grouped["p2"])) + } + if 
len(grouped["p3"]) != 1 { + t.Errorf("expected 1 CR for p3, got %d", len(grouped["p3"])) + } + if len(grouped["nonexistent"]) != 0 { + t.Error("expected 0 CRs for nonexistent project") + } +} + +func TestUsageDelta(t *testing.T) { + delta := newUsageDelta() + + delta.addIncrement("res1", "az-1", 5) + delta.addIncrement("res1", "az-1", 3) + delta.addIncrement("res1", "az-2", 2) + delta.addDecrement("res1", "az-1", 1) + + if delta.increments["res1"]["az-1"] != 8 { + t.Errorf("expected increment res1/az-1 = 8, got %d", delta.increments["res1"]["az-1"]) + } + if delta.increments["res1"]["az-2"] != 2 { + t.Errorf("expected increment res1/az-2 = 2, got %d", delta.increments["res1"]["az-2"]) + } + if delta.decrements["res1"]["az-1"] != 1 { + t.Errorf("expected decrement res1/az-1 = 1, got %d", delta.decrements["res1"]["az-1"]) + } +} + +func TestReconcile_NilTotalUsage(t *testing.T) { + // Reconcile skips early and returns no error when TotalUsage is nil. This test + // exercises the helpers behind that early-return branch (computeCRUsage and + // derivePaygUsage) with nil inputs, expecting empty results rather than panics. + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + // computeCRUsage on nil slice should return empty map (no panic) + result := ctrl.computeCRUsage(nil, nil) + if len(result) != 0 { + t.Errorf("expected empty result for nil CRs, got %d entries", len(result)) + } + + // derivePaygUsage on nil totalUsage should return empty map + payg := derivePaygUsage(nil, result) + if len(payg) != 0 { + t.Errorf("expected empty payg for nil totalUsage, got %d entries", len(payg)) + } +} + +func TestAccumulateAddedVM_UnknownFlavor(t *testing.T) { + // Verifies that accumulateAddedVM gracefully handles a VM with an unknown flavor + ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} + + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 32768}}, + }, + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + projectDeltas := make(map[string]*usageDelta) + + // Use a mock VMSource that returns a VM with unknown flavor + ctrl.VMSource = &mockVMSource{ + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + return &failover.VM{ + UUID: vmUUID, + FlavorName: "unknown-flavor", + ProjectID: "project-a", + AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, nil + }, + } + + ctrl.accumulateAddedVM(context.Background(), "vm-1", flavorToGroup, flavorGroups, projectDeltas) + + // Should not have added any delta (unknown flavor) + if len(projectDeltas) != 0 { + t.Errorf("expected no deltas for unknown flavor, got %d projects", len(projectDeltas)) + } +} + +func TestAccumulateAddedVM_KnownFlavor(t *testing.T) { + // Set up a fake client with a ProjectQuota that has LastReconcileAt in the past. + // The VM's CreatedAt must be AFTER LastReconcileAt for it to be considered new.
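+ // Illustrative timeline: LastReconcileAt = 2026-01-01, the VM's CreatedAt is
+ // 2026-01-02, so accumulateAddedVM should treat it as newly created, not migrated.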
+ lastReconcile := metav1.NewTime(time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)) + vmCreatedAt := "2026-01-02T00:00:00Z" // After lastReconcile + + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + pq := &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-project-a"}, + Spec: v1alpha1.ProjectQuotaSpec{ProjectID: "project-a"}, + Status: v1alpha1.ProjectQuotaStatus{ + LastReconcileAt: &lastReconcile, + LastFullReconcileAt: &lastReconcile, + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(pq). + WithStatusSubresource(&v1alpha1.ProjectQuota{}). + Build() + + qc := &QuotaController{ + Client: k8sClient, + Config: DefaultQuotaControllerConfig(), + } + + flavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 32768}}, + }, + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + projectDeltas := make(map[string]*usageDelta) + + qc.VMSource = &mockVMSource{ + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + return &failover.VM{ + UUID: vmUUID, + FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", + AvailabilityZone: "az-1", + CreatedAt: vmCreatedAt, + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB + "vcpus": resource.MustParse("8"), + }, + }, nil + }, + } + + qc.accumulateAddedVM(context.Background(), "vm-1", flavorToGroup, flavorGroups, projectDeltas) + + delta, ok := projectDeltas["project-a"] + if !ok { + t.Fatal("expected delta for project-a") + } + + // 32768 MiB / 32768 = 1 unit RAM + if delta.increments["hw_version_hana_v2_ram"]["az-1"] != 1 { + t.Errorf("expected ram increment = 1, got %d", delta.increments["hw_version_hana_v2_ram"]["az-1"]) + } + if delta.increments["hw_version_hana_v2_cores"]["az-1"] != 8 { + t.Errorf("expected cores increment = 8, got %d", delta.increments["hw_version_hana_v2_cores"]["az-1"]) + } +} + +// mockVMSource is a test helper for VMSource. 
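+// Each method delegates to the matching function field when set and returns
+// zero values otherwise, so tests only need to stub the calls they use.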
+type mockVMSource struct { + listVMs func(ctx context.Context) ([]failover.VM, error) + getVM func(ctx context.Context, vmUUID string) (*failover.VM, error) + isServerActive func(ctx context.Context, vmUUID string) (bool, error) + getDeletedVM func(ctx context.Context, vmUUID string) (*failover.DeletedVMInfo, error) +} + +func (m *mockVMSource) ListVMs(ctx context.Context) ([]failover.VM, error) { + if m.listVMs != nil { + return m.listVMs(ctx) + } + return nil, nil +} + +func (m *mockVMSource) GetVM(ctx context.Context, vmUUID string) (*failover.VM, error) { + if m.getVM != nil { + return m.getVM(ctx, vmUUID) + } + return nil, nil +} + +func (m *mockVMSource) ListVMsOnHypervisors(_ context.Context, _ *hv1.HypervisorList, _ bool) ([]failover.VM, error) { + return nil, nil +} + +func (m *mockVMSource) IsServerActive(ctx context.Context, vmUUID string) (bool, error) { + if m.isServerActive != nil { + return m.isServerActive(ctx, vmUUID) + } + return false, nil +} + +func (m *mockVMSource) GetDeletedVMInfo(ctx context.Context, vmUUID string) (*failover.DeletedVMInfo, error) { + if m.getDeletedVM != nil { + return m.getDeletedVM(ctx, vmUUID) + } + return nil, nil +} diff --git a/internal/scheduling/reservations/quota/integration_test.go b/internal/scheduling/reservations/quota/integration_test.go new file mode 100644 index 000000000..740341d9a --- /dev/null +++ b/internal/scheduling/reservations/quota/integration_test.go @@ -0,0 +1,1253 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// ============================================================================ +// Integration Tests +// ============================================================================ + +func TestIntegration(t *testing.T) { + lastReconcileTime := metav1.NewTime(time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)) + + tests := []IntegrationTestCase{ + { + Name: "full reconcile - basic usage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // project-a: hana_v2 az-1: (32768+65536)/32768 = 3 RAM units, 8+16=24 cores + // project-a: hana_v2 az-2: 32768/32768 = 1 RAM unit, 8 cores + // project-a: general az-1: 4096/4096 = 1 RAM unit, 2 cores + // project-b: general az-1: 4096/4096 = 1 RAM unit, 2 cores + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + 
}, + }, + // No CRs -> PaygUsage == TotalUsage + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "full reconcile - with CRs reduces PaygUsage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // 2 units of hana_v2 RAM committed in az-1 for project-a + makeCR("cr-1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(2)), + // 10 cores committed in az-1 for project-a + makeCR("cr-2", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeCores, v1alpha1.CommitmentStatusConfirmed, int64Ptr(10)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + // PaygUsage = TotalUsage - CRUsage + // hana_v2 RAM: 3-2=1 in az-1, 1-0=1 in az-2 + // hana_v2 Cores: 24-10=14 in az-1, 8-0=8 in az-2 + // general: no CRs so PaygUsage == TotalUsage + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 1, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 14, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental add - new VM after last reconcile", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile to establish baseline + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a NEW VM (created after last reconcile) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new"), // new instance + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", // far future, always AFTER last reconcile + Resources: 
map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + // vm-new is created AFTER last reconcile, so it gets incremented + // +1 RAM unit (32768/32768), +8 cores + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental add - migration skipped (VM created before last reconcile)", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds vm-1 (which was created BEFORE last reconcile = migration) + { + Type: "hv_diff", + OldHV: makeHV("hv-2", []hv1.Instance{}), + NewHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-1"), // migrated here, created before reconcile + }), + // Should NOT increment -- vm-1 CreatedAt is 2025-12-01 which is before reconcile time + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental remove - deleted VM decrements usage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + // vm-del is not in VMs (deleted), but has info in DeletedVMs + DeletedVMs: map[string]*failover.DeletedVMInfo{ + "vm-del": { + ProjectID: "project-a", + FlavorName: "m1.hana_v2.small", + AvailabilityZone: "az-1", + RAMMiB: 32768, + VCPUs: 8, + }, + }, + ActiveVMs: map[string]bool{ + "vm-del": false, // not active (truly deleted) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile (vm-del not in VMs so not counted) + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff removes vm-del (was on HV before, now gone) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-del"), // was here + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + // vm-del gone + }), + // vm-del: IsServerActive=false, deleted info 
found + // Decrement: -1 RAM unit, -8 cores in az-1 + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 16, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental remove - migrated VM not decremented", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ActiveVMs: map[string]bool{ + "vm-1": true, // still active (migrated to another HV) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // HV reports vm-1 removed (migrated away) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-2"), + // vm-1 gone from this HV + }), + // vm-1: IsServerActive=true, so NOT decremented + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "CR update triggers PaygUsage recompute", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + makeCR("cr-ram-1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(1)), + }, + Actions: []TestAction{ + // Step 1: full reconcile with initial CR (UsedAmount=1) + { + Type: "full_reconcile", + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, // 3-1=2 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: CR UsedAmount increases to 3 -> PaygUsage should drop + { + Type: "cr_update", + CRName: "cr-ram-1", + UsedAmount: 3, + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 0, "az-2": 1}}, // 3-3=0 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "unknown flavor VMs are skipped", + FlavorGroups: testFlavorGroups, + VMs: []failover.VM{ + { + UUID: "vm-unknown", FlavorName: "nonexistent-flavor", + ProjectID: "project-x", 
AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-x", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // No usage for project-x (unknown flavor skipped) + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-x": {}, + }, + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-x": {}, + }, + }, + }, + }, + { + Name: "multiple full reconciles are idempotent", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Second full reconcile - same result + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "pending CRs are excluded from PaygUsage deduction", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // Pending CR should NOT reduce PaygUsage + makeCR("cr-pending", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusPending, int64Ptr(5)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // PaygUsage == TotalUsage because pending CRs are excluded + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "full reconcile corrects incremental drift", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile establishes correct baseline + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: 
map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a short-lived "phantom" VM (created after reconcile, + // but deleted before the next full reconcile runs). The incremental path + // bumps TotalUsage by +1 RAM / +8 cores. + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-phantom"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-phantom", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", // after last reconcile + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + // TotalUsage now has phantom's contribution (drift) + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // 3+1 drift + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // 24+8 drift + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 3: full reconcile re-scans all VMs. Reset VM list to baseline + // (vm-phantom is gone). This corrects the drift back to the ground truth. + { + Type: "full_reconcile", + OverrideVMs: baseVMsPtr(), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // corrected + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // corrected + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "complex multi-project scenario with adds, removes, and reconcile corrections", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + DeletedVMs: map[string]*failover.DeletedVMInfo{ + "vm-del": { + ProjectID: "project-a", + FlavorName: "m1.hana_v2.small", + AvailabilityZone: "az-1", + RAMMiB: 32768, + VCPUs: 8, + }, + }, + ActiveVMs: map[string]bool{ + "vm-del": false, // truly deleted + "vm-1": true, // still active (for migration scenario) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile establishes baseline for both projects + // project-a hana_v2: az-1=3 RAM / 24 cores, az-2=1 RAM / 8 cores; general: az-1=1 RAM / 2 cores + // project-b general: az-1=1 RAM / 2 cores + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a genuine new VM to project-a (hana_v2 small, az-1) + // +1 RAM unit, +8 cores + { + 
Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 3: HV diff adds a phantom VM to project-b (general, az-1) + // This is a short-lived VM that will disappear -- DRIFT for project-b + { + Type: "hv_diff", + OldHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-5"), + }), + NewHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-5"), + activeInstance("vm-phantom-b"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-phantom-b", FlavorName: "m1.general.small", + ProjectID: "project-b", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB = 1 RAM unit + "vcpus": resource.MustParse("2"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 2}}, // 1+1 drift + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 4}}, // 2+2 drift + }, + }, + }, + // Step 4: HV diff removes vm-del from project-a (truly deleted) + // -1 RAM unit, -8 cores in az-1 + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + activeInstance("vm-del"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // 4-1=3 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // 32-8=24 + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 5: full reconcile with OverrideVMs that includes vm-new-a + // (vm-new-a is now "real" and appears in the VM list). 
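+ // A full reconcile recomputes usage from the complete VM list rather than
+ // applying deltas, which is what lets it correct accumulated drift.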
+ // This reconcile: + // - project-a: FIXES drift -- truth is 4 (vm-new-a in list), delta said 3 + // - project-b: FIXES drift -- truth is 1, delta said 2 (phantom gone) + { + Type: "full_reconcile", + OverrideVMs: &[]failover.VM{ + // testVMs + vm-new-a + testVMs[0], testVMs[1], testVMs[2], testVMs[3], testVMs[4], + { + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + }, + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // corrected up + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // corrected up + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, // corrected down + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, // corrected down + }, + }, + }, + // Step 6: another HV diff removes vm-1 from a HV (migration, not deletion). + // vm-1 is still active (ActiveVMs["vm-1"]=true), so NOT decremented. + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + ), + // vm-1 migrated, NOT decremented -- totals unchanged + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 7: final full reconcile confirms everything matches (no drift). + // This is the "reconcile that matches the deltas" -- nothing to fix. 
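+ // Totals for project-a match step 6 exactly and project-b is re-verified,
+ // so this step also doubles as an idempotency check for the full-reconcile path.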
+ { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + } + + _ = lastReconcileTime // referenced by test data (VM CreatedAt values) + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + env := newIntegrationTestEnv(t, tc) + + for i, action := range tc.Actions { + t.Logf(" action %d: %s", i+1, action.Type) + env.executeAction(action) + } + }) + } +} + +// ============================================================================ +// Test Data +// ============================================================================ + +var testFlavorGroups = map[string]compute.FlavorGroupFeature{ + "hana_v2": { + Name: "hana_v2", + SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 32768, VCPUs: 8}, + LargestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.large", MemoryMB: 65536, VCPUs: 16}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small", MemoryMB: 32768, VCPUs: 8}, + {Name: "m1.hana_v2.large", MemoryMB: 65536, VCPUs: 16}, + }, + }, + "general": { + Name: "general", + SmallestFlavor: compute.FlavorInGroup{Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + LargestFlavor: compute.FlavorInGroup{Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + }, + }, +} + +// Standard VM set for most tests. +// project-a has VMs in BOTH flavor groups (hana_v2 and general). +// project-b has only general VMs. 
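+// RAM units below are memory divided by the group's smallest flavor:
+// 32768 MiB for hana_v2 and 4096 MiB for general (see testFlavorGroups).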
+var testVMs = []failover.VM{ + // vm-1: hana_v2, 1 RAM unit (32768/32768), 8 cores + { + UUID: "vm-1", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB + "vcpus": resource.MustParse("8"), + }, + }, + // vm-2: hana_v2, 2 RAM units (65536/32768), 16 cores + { + UUID: "vm-2", FlavorName: "m1.hana_v2.large", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("68719476736"), // 65536 MiB + "vcpus": resource.MustParse("16"), + }, + }, + // vm-3: hana_v2, 1 RAM unit (32768/32768), 8 cores + { + UUID: "vm-3", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-2", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB + "vcpus": resource.MustParse("8"), + }, + }, + // vm-4: general, 1 RAM unit (4096/4096), 2 cores + { + UUID: "vm-4", FlavorName: "m1.general.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB + "vcpus": resource.MustParse("2"), + }, + }, + // vm-5: general, 1 RAM unit (4096/4096), 2 cores + { + UUID: "vm-5", FlavorName: "m1.general.small", + ProjectID: "project-b", AvailabilityZone: "az-1", + CreatedAt: "2025-12-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB + "vcpus": resource.MustParse("2"), + }, + }, +} + +// ============================================================================ +// Integration Test Framework +// ============================================================================ + +// TestAction defines a single step in an integration test scenario. +type TestAction struct { + // Type of action to perform. + // "full_reconcile" - run ReconcilePeriodic + // "hv_diff" - run ReconcileHVDiff with OldHV/NewHV + // "cr_update" - update a CR's UsedAmount, then run Reconcile (watch-triggered) + Type string + + // For hv_diff actions: + OldHV *hv1.Hypervisor + NewHV *hv1.Hypervisor + + // OverrideVMs, when non-nil, replaces the VMSource (ListVMs + GetVM) for + // THIS action and all subsequent actions. Use to simulate VMs appearing or + // disappearing between steps. To "undo" a temporary VM, set OverrideVMs + // again in a later action without that VM. + OverrideVMs *[]failover.VM + + // For cr_update actions: + CRName string + UsedAmount int64 + + // Optional: verify state AFTER this action completes. + // Keys are project IDs. If nil, no verification for this step. + ExpectedTotalUsage map[string]map[string]v1alpha1.ResourceQuotaUsage + ExpectedPaygUsage map[string]map[string]v1alpha1.ResourceQuotaUsage +} + +// IntegrationTestCase defines a complete integration test scenario. 
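+// The harness seeds the fake client and mock VMSource from this initial state,
+// then replays Actions in order, verifying usage after every step that sets
+// expectations.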
+type IntegrationTestCase struct { + Name string + + // Initial state seeded into the fake client and mock VMSource + VMs []failover.VM + DeletedVMs map[string]*failover.DeletedVMInfo // UUID -> deleted VM info + ActiveVMs map[string]bool // UUID -> IsServerActive response + + FlavorGroups map[string]compute.FlavorGroupFeature + ProjectQuotas []*v1alpha1.ProjectQuota + CommittedResources []*v1alpha1.CommittedResource + + // Ordered actions with per-step verification + Actions []TestAction +} + +// integrationTestEnv holds the test environment for a single test case. +type integrationTestEnv struct { + t *testing.T + client client.Client + controller *QuotaController + vmSource *mockVMSource +} + +func newIntegrationTestEnv(t *testing.T, tc IntegrationTestCase) *integrationTestEnv { + t.Helper() + + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add v1alpha1 to scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hv1 to scheme: %v", err) + } + + // Build initial objects list + var objects []client.Object + + // Create Knowledge CRD with flavor groups + knowledgeCRD := buildKnowledgeCRD(t, tc.FlavorGroups) + objects = append(objects, knowledgeCRD) + + // Add ProjectQuotas + for _, pq := range tc.ProjectQuotas { + objects = append(objects, pq) + } + + // Add CommittedResources + for _, cr := range tc.CommittedResources { + objects = append(objects, cr) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + WithStatusSubresource( + &v1alpha1.ProjectQuota{}, + &v1alpha1.CommittedResource{}, + &v1alpha1.Knowledge{}, + ). + Build() + + // Build mock VMSource + vmSrc := &mockVMSource{ + listVMs: func(_ context.Context) ([]failover.VM, error) { + return tc.VMs, nil + }, + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + for i := range tc.VMs { + if tc.VMs[i].UUID == vmUUID { + return &tc.VMs[i], nil + } + } + return nil, nil + }, + isServerActive: func(_ context.Context, vmUUID string) (bool, error) { + if tc.ActiveVMs != nil { + if active, ok := tc.ActiveVMs[vmUUID]; ok { + return active, nil + } + } + return false, nil + }, + getDeletedVM: func(_ context.Context, vmUUID string) (*failover.DeletedVMInfo, error) { + if tc.DeletedVMs != nil { + if info, ok := tc.DeletedVMs[vmUUID]; ok { + return info, nil + } + } + return nil, nil + }, + } + + controller := &QuotaController{ + Client: k8sClient, + VMSource: vmSrc, + Config: DefaultQuotaControllerConfig(), + Metrics: NewQuotaMetrics(nil), // no-op metrics + } + + return &integrationTestEnv{ + t: t, + client: k8sClient, + controller: controller, + vmSource: vmSrc, + } +} + +func (env *integrationTestEnv) verifyTotalUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { + env.t.Helper() + crdName := "quota-" + projectID + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + } + + if expected == nil && pq.Status.TotalUsage == nil { + return // both nil, ok + } + + for resourceName, expectedUsage := range expected { + actual, ok := pq.Status.TotalUsage[resourceName] + if !ok { + env.t.Errorf("project %s: expected TotalUsage resource %q not found", projectID, resourceName) + continue + } + for az, expectedAmount := range expectedUsage.PerAZ { + if actual.PerAZ[az] != expectedAmount { + env.t.Errorf("project %s: 
TotalUsage[%s][%s] = %d, want %d", + projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + } + } + } + + // Check no unexpected resources + for resourceName := range pq.Status.TotalUsage { + if _, ok := expected[resourceName]; !ok { + env.t.Errorf("project %s: unexpected TotalUsage resource %q", projectID, resourceName) + } + } +} + +func (env *integrationTestEnv) verifyPaygUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { + env.t.Helper() + crdName := "quota-" + projectID + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + } + + if expected == nil && pq.Status.PaygUsage == nil { + return + } + + for resourceName, expectedUsage := range expected { + actual, ok := pq.Status.PaygUsage[resourceName] + if !ok { + env.t.Errorf("project %s: expected PaygUsage resource %q not found", projectID, resourceName) + continue + } + for az, expectedAmount := range expectedUsage.PerAZ { + if actual.PerAZ[az] != expectedAmount { + env.t.Errorf("project %s: PaygUsage[%s][%s] = %d, want %d", + projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + } + } + } + + for resourceName := range pq.Status.PaygUsage { + if _, ok := expected[resourceName]; !ok { + env.t.Errorf("project %s: unexpected PaygUsage resource %q", projectID, resourceName) + } + } +} + +func (env *integrationTestEnv) executeAction(action TestAction) { + env.t.Helper() + ctx := context.Background() + + // Apply OverrideVMs if set (persists for all subsequent actions) + if action.OverrideVMs != nil { + vms := *action.OverrideVMs + env.vmSource.listVMs = func(_ context.Context) ([]failover.VM, error) { + return vms, nil + } + env.vmSource.getVM = func(_ context.Context, vmUUID string) (*failover.VM, error) { + for i := range vms { + if vms[i].UUID == vmUUID { + return &vms[i], nil + } + } + return nil, nil + } + } + + switch action.Type { + case "full_reconcile": + if err := env.controller.ReconcilePeriodic(ctx); err != nil { + env.t.Fatalf("ReconcilePeriodic failed: %v", err) + } + + case "hv_diff": + if err := env.controller.ReconcileHVDiff(ctx, action.OldHV, action.NewHV); err != nil { + env.t.Fatalf("ReconcileHVDiff failed: %v", err) + } + + case "cr_update": + // Fetch the CR, update UsedResources, then call Reconcile + var cr v1alpha1.CommittedResource + if err := env.client.Get(ctx, client.ObjectKey{Name: action.CRName}, &cr); err != nil { + env.t.Fatalf("failed to get CR %s: %v", action.CRName, err) + } + cr.Status.UsedResources = usedResourcesFromMultiples(cr.Spec.ResourceType, cr.Spec.FlavorGroupName, action.UsedAmount) + if err := env.client.Status().Update(ctx, &cr); err != nil { + env.t.Fatalf("failed to update CR %s status: %v", action.CRName, err) + } + + // Simulate watch trigger: call Reconcile for the affected project + pqName := "quota-" + cr.Spec.ProjectID + _, err := env.controller.Reconcile(ctx, reconcileRequest(pqName)) + if err != nil { + env.t.Fatalf("Reconcile failed after CR update: %v", err) + } + + default: + env.t.Fatalf("unknown action type: %s", action.Type) + } + + // Verify expected state after this action + if action.ExpectedTotalUsage != nil { + for projectID, expected := range action.ExpectedTotalUsage { + env.verifyTotalUsage(projectID, expected) + } + } + if action.ExpectedPaygUsage != nil { + for projectID, expected := range action.ExpectedPaygUsage { + env.verifyPaygUsage(projectID, expected) + } + } +} + +// 
============================================================================ +// Helpers +// ============================================================================ + +func buildKnowledgeCRD(t *testing.T, flavorGroups map[string]compute.FlavorGroupFeature) *v1alpha1.Knowledge { + t.Helper() + + // Convert map to slice for BoxFeatureList + var features []compute.FlavorGroupFeature + for _, fg := range flavorGroups { + features = append(features, fg) + } + + raw, err := boxFlavorGroupFeatures(features) + if err != nil { + t.Fatalf("failed to box flavor group features: %v", err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: "nova", + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: raw, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "Ready", + }, + }, + }, + } +} + +func boxFlavorGroupFeatures(features []compute.FlavorGroupFeature) (runtime.RawExtension, error) { + rawSerialized := struct { + Features []compute.FlavorGroupFeature `json:"features"` + }{ + Features: features, + } + data, err := json.Marshal(rawSerialized) + if err != nil { + return runtime.RawExtension{}, err + } + return runtime.RawExtension{Raw: data}, nil +} + +func reconcileRequest(name string) ctrl.Request { + return ctrl.Request{NamespacedName: client.ObjectKey{Name: name}} +} + +func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { //nolint:unparam + return &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-" + projectID}, + Spec: v1alpha1.ProjectQuotaSpec{ProjectID: projectID, DomainID: "domain-1"}, + Status: v1alpha1.ProjectQuotaStatus{ + LastReconcileAt: lastReconcileAt, + }, + } +} + +func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.CommittedResourceType, state v1alpha1.CommitmentStatus, usedAmount *int64) *v1alpha1.CommittedResource { //nolint:unparam + cr := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: name + "-uuid", + FlavorGroupName: flavorGroup, + ResourceType: resourceType, + AvailabilityZone: az, + ProjectID: projectID, + DomainID: "domain-1", + Amount: resource.MustParse("10"), + State: state, + }, + } + if usedAmount != nil { + cr.Status.UsedResources = usedResourcesFromMultiples(resourceType, flavorGroup, *usedAmount) + } + return cr +} + +// usedResourcesFromMultiples converts a "multiples" value (the old UsedAmount unit) to UsedResources. +// For memory: multiples * smallestFlavorMB * 1024 * 1024 = bytes. +// For cores: the value is used directly. 
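+// Example: multiples=2 for hana_v2 (smallest flavor 32768 MiB) yields
+// 2 * 32768 * 1024 * 1024 = 68719476736 bytes of memory.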
+func usedResourcesFromMultiples(resourceType v1alpha1.CommittedResourceType, flavorGroup string, multiples int64) map[string]resource.Quantity { + switch resourceType { + case v1alpha1.CommittedResourceTypeMemory: + fg, ok := testFlavorGroups[flavorGroup] + if !ok || fg.SmallestFlavor.MemoryMB == 0 { + return nil + } + bytesVal := multiples * int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // test only + return map[string]resource.Quantity{ + "memory": *resource.NewQuantity(bytesVal, resource.BinarySI), + } + case v1alpha1.CommittedResourceTypeCores: + return map[string]resource.Quantity{ + "cpu": *resource.NewQuantity(multiples, resource.DecimalSI), + } + default: + return nil + } +} + +func int64Ptr(v int64) *int64 { return &v } + +// withExtraVMs returns a pointer to testVMs + additional VMs. +// Used with OverrideVMs to add VMs to the "world" for an action. +func withExtraVMs(extra ...failover.VM) *[]failover.VM { + vms := append(append([]failover.VM{}, testVMs...), extra...) + return &vms +} + +// baseVMsPtr returns a pointer to a copy of testVMs (resets to baseline). +func baseVMsPtr() *[]failover.VM { + vms := append([]failover.VM{}, testVMs...) + return &vms +} + +func makeHV(name string, instances []hv1.Instance) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: hv1.HypervisorStatus{ + Instances: instances, + }, + } +} + +func activeInstance(id string) hv1.Instance { + return hv1.Instance{ID: id, Active: true} +} diff --git a/internal/scheduling/reservations/quota/metrics.go b/internal/scheduling/reservations/quota/metrics.go new file mode 100644 index 000000000..7263ab1fd --- /dev/null +++ b/internal/scheduling/reservations/quota/metrics.go @@ -0,0 +1,98 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// QuotaMetrics holds Prometheus metrics for the quota controller. +type QuotaMetrics struct { + totalUsageGauge *prometheus.GaugeVec + paygUsageGauge *prometheus.GaugeVec + crUsageGauge *prometheus.GaugeVec + reconcileDuration prometheus.Histogram + reconcileResultVec *prometheus.CounterVec +} + +// NewQuotaMetrics creates a new QuotaMetrics instance and registers with the given registerer. 
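+// A nil registerer skips registration entirely; tests rely on this to get
+// no-op metrics via NewQuotaMetrics(nil).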
+func NewQuotaMetrics(reg prometheus.Registerer) *QuotaMetrics { + m := &QuotaMetrics{ + totalUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_total_usage", + Help: "Total resource usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + paygUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_payg_usage", + Help: "Pay-as-you-go usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + crUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_cr_usage", + Help: "Committed resource usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + reconcileDuration: prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "cortex_quota_reconcile_duration_seconds", + Help: "Duration of quota controller full reconcile", + Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), + }, + ), + reconcileResultVec: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "cortex_quota_reconcile_total", + Help: "Total number of periodic reconcile attempts by result (success/failure)", + }, + []string{"result"}, + ), + } + + if reg != nil { + reg.MustRegister(m.totalUsageGauge) + reg.MustRegister(m.paygUsageGauge) + reg.MustRegister(m.crUsageGauge) + reg.MustRegister(m.reconcileDuration) + reg.MustRegister(m.reconcileResultVec) + } + + return m +} + +// RecordUsage records usage metrics for a project/AZ/resource. +func (m *QuotaMetrics) RecordUsage(projectID, az, resource string, totalUsage, paygUsage, crUsage int64) { + if m == nil { + return + } + m.totalUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(totalUsage)) + m.paygUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(paygUsage)) + m.crUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(crUsage)) +} + +// RecordReconcileDuration records the duration of a full reconcile. +func (m *QuotaMetrics) RecordReconcileDuration(seconds float64) { + if m == nil { + return + } + m.reconcileDuration.Observe(seconds) +} + +// RecordReconcileResult increments the success or failure counter for periodic reconciles. 
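+// Like the other recorders, it is a no-op on a nil *QuotaMetrics receiver.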
+func (m *QuotaMetrics) RecordReconcileResult(success bool) { + if m == nil { + return + } + result := "failure" + if success { + result = "success" + } + m.reconcileResultVec.WithLabelValues(result).Inc() +} diff --git a/pkg/conf/params_test.go b/pkg/conf/params_test.go index c040b0102..55319ef37 100644 --- a/pkg/conf/params_test.go +++ b/pkg/conf/params_test.go @@ -8,7 +8,6 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" ) func TestUnmarshalParams_NilParameters(t *testing.T) { @@ -47,7 +46,7 @@ func TestUnmarshalParams_StringValue(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("test-name")}, + {Key: "name", StringValue: new("test-name")}, } var result TestStruct @@ -67,7 +66,7 @@ func TestUnmarshalParams_BoolValue(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "enabled", BoolValue: testlib.Ptr(true)}, + {Key: "enabled", BoolValue: new(true)}, } var result TestStruct @@ -87,7 +86,7 @@ func TestUnmarshalParams_IntValue(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "count", IntValue: testlib.Ptr(int64(42))}, + {Key: "count", IntValue: new(int64(42))}, } var result TestStruct @@ -107,7 +106,7 @@ func TestUnmarshalParams_FloatValue(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "threshold", FloatValue: testlib.Ptr(3.14)}, + {Key: "threshold", FloatValue: new(3.14)}, } var result TestStruct @@ -127,7 +126,7 @@ func TestUnmarshalParams_StringListValue(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "tags", StringListValue: testlib.Ptr([]string{"tag1", "tag2", "tag3"})}, + {Key: "tags", StringListValue: new([]string{"tag1", "tag2", "tag3"})}, } var result TestStruct @@ -158,12 +157,12 @@ func TestUnmarshalParams_MultipleValues(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("test")}, - {Key: "count", IntValue: testlib.Ptr(int64(10))}, - {Key: "enabled", BoolValue: testlib.Ptr(true)}, - {Key: "threshold", FloatValue: testlib.Ptr(0.5)}, - {Key: "tags", StringListValue: testlib.Ptr([]string{"a", "b"})}, - {Key: "weights", FloatMapValue: testlib.Ptr(map[string]float64{"cpu": 1.0, "memory": 0.5})}, + {Key: "name", StringValue: new("test")}, + {Key: "count", IntValue: new(int64(10))}, + {Key: "enabled", BoolValue: new(true)}, + {Key: "threshold", FloatValue: new(0.5)}, + {Key: "tags", StringListValue: new([]string{"a", "b"})}, + {Key: "weights", FloatMapValue: new(map[string]float64{"cpu": 1.0, "memory": 0.5})}, } var result TestStruct @@ -204,8 +203,8 @@ func TestUnmarshalParams_DuplicateKeys(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("first")}, - {Key: "name", StringValue: testlib.Ptr("second")}, + {Key: "name", StringValue: new("first")}, + {Key: "name", StringValue: new("second")}, } var result TestStruct @@ -225,7 +224,7 @@ func TestUnmarshalParams_DuplicateValues(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("first"), IntValue: testlib.Ptr(int64(1))}, + {Key: "name", StringValue: new("first"), IntValue: new(int64(1))}, } var result TestStruct @@ -265,8 +264,8 @@ func TestUnmarshalParams_UnknownField(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("test")}, - {Key: "unknown", StringValue: testlib.Ptr("value")}, + {Key: "name", StringValue: new("test")}, + {Key: "unknown", StringValue: new("value")}, } var result TestStruct @@ -286,7 +285,7 @@ func 
TestUnmarshalParams_TypeMismatch(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "count", StringValue: testlib.Ptr("not-a-number")}, + {Key: "count", StringValue: new("not-a-number")}, } var result TestStruct @@ -307,7 +306,7 @@ func TestUnmarshalParams_OptionalFieldsMissing(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "required", StringValue: testlib.Ptr("value")}, + {Key: "required", StringValue: new("value")}, } var result TestStruct @@ -333,8 +332,8 @@ func TestUnmarshalParams_NestedStruct(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("nested-test")}, - {Key: "weight", FloatValue: testlib.Ptr(1.5)}, + {Key: "name", StringValue: new("nested-test")}, + {Key: "weight", FloatValue: new(1.5)}, } var result TestStruct @@ -357,7 +356,7 @@ func TestUnmarshalParams_EmptyStringList(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "tags", StringListValue: testlib.Ptr([]string{})}, + {Key: "tags", StringListValue: new([]string{})}, } var result TestStruct @@ -380,10 +379,10 @@ func TestUnmarshalParams_ZeroValues(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "count", IntValue: testlib.Ptr(int64(0))}, - {Key: "threshold", FloatValue: testlib.Ptr(0.0)}, - {Key: "enabled", BoolValue: testlib.Ptr(false)}, - {Key: "name", StringValue: testlib.Ptr("")}, + {Key: "count", IntValue: new(int64(0))}, + {Key: "threshold", FloatValue: new(0.0)}, + {Key: "enabled", BoolValue: new(false)}, + {Key: "name", StringValue: new("")}, } var result TestStruct @@ -413,8 +412,8 @@ func TestUnmarshalParams_NegativeValues(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "count", IntValue: testlib.Ptr(int64(-42))}, - {Key: "threshold", FloatValue: testlib.Ptr(-3.14)}, + {Key: "count", IntValue: new(int64(-42))}, + {Key: "threshold", FloatValue: new(-3.14)}, } var result TestStruct @@ -438,8 +437,8 @@ func TestUnmarshalParams_LargeValues(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "largeInt", IntValue: testlib.Ptr(int64(9223372036854775807))}, // max int64 - {Key: "largeFloat", FloatValue: testlib.Ptr(1.7976931348623157e+308)}, // approx max float64 + {Key: "largeInt", IntValue: new(int64(9223372036854775807))}, // max int64 + {Key: "largeFloat", FloatValue: new(1.7976931348623157e+308)}, // approx max float64 } var result TestStruct @@ -459,7 +458,7 @@ func TestUnmarshalParams_SpecialCharactersInString(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr(`test"with'special\chars`)}, + {Key: "name", StringValue: new(`test"with'special\chars`)}, } var result TestStruct @@ -479,7 +478,7 @@ func TestUnmarshalParams_UnicodeInString(t *testing.T) { } params := v1alpha1.Parameters{ - {Key: "name", StringValue: testlib.Ptr("日本語テスト🚀")}, + {Key: "name", StringValue: new("日本語テスト🚀")}, } var result TestStruct diff --git a/pkg/multicluster/routers_test.go b/pkg/multicluster/routers_test.go index 0110bc01f..c3e1a0c16 100644 --- a/pkg/multicluster/routers_test.go +++ b/pkg/multicluster/routers_test.go @@ -7,7 +7,6 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -34,7 +33,7 @@ func TestHypervisorResourceRouter_Match(t *testing.T) { }, { name: "matching AZ pointer", - obj: testlib.Ptr(hv1.Hypervisor{ + obj: new(hv1.Hypervisor{ ObjectMeta: metav1.ObjectMeta{ Labels: 
map[string]string{"topology.kubernetes.io/zone": "qa-de-1a"}, }, @@ -135,7 +134,7 @@ func TestHistoryResourceRouter_Match(t *testing.T) { }, { name: "matching AZ pointer", - obj: testlib.Ptr(v1alpha1.History{ + obj: new(v1alpha1.History{ Spec: v1alpha1.HistorySpec{ AvailabilityZone: &az, }, @@ -240,7 +239,7 @@ func TestReservationsResourceRouter_Match(t *testing.T) { }, { name: "matching AZ pointer", - obj: testlib.Ptr(v1alpha1.Reservation{ + obj: new(v1alpha1.Reservation{ Spec: v1alpha1.ReservationSpec{ AvailabilityZone: "qa-de-1a", }, diff --git a/pkg/testing/ptr.go b/pkg/testing/ptr.go deleted file mode 100644 index e022638d1..000000000 --- a/pkg/testing/ptr.go +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package testlib - -// Init something as a pointer. -func Ptr[T any](v T) *T { return new(v) } diff --git a/tools/visualize-committed-resources/main.go b/tools/visualize-committed-resources/main.go index afa16a372..f722f8b8d 100644 --- a/tools/visualize-committed-resources/main.go +++ b/tools/visualize-committed-resources/main.go @@ -15,7 +15,7 @@ // --filter-group=name Show only CRs for this flavor group (substring match) // --filter-state=state Show only CRs in this state (e.g. confirmed, reserving) // --active Shorthand: show only confirmed/guaranteed CRs -// --views=v1,v2,... Views to show (default: all). Available: summary, commitments, reservations, allocations +// --views=v1,v2,... Views to show (default: all). Available: summary, commitments, reservations, allocations, usage // --hide=v1,v2,... Views to hide (applied after --views) // --watch=interval Refresh interval (e.g. 2s, 5s). Clears screen between refreshes. package main @@ -72,9 +72,10 @@ const ( viewCommitments = "commitments" viewReservations = "reservations" viewAllocations = "allocations" + viewUsage = "usage" ) -var allViews = []string{viewSummary, viewCommitments, viewReservations, viewAllocations} +var allViews = []string{viewSummary, viewCommitments, viewReservations, viewAllocations, viewUsage} type viewSet map[string]bool @@ -292,7 +293,7 @@ func printSummary(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Rese ) } -func printCommitments(crs []v1alpha1.CommittedResource) { +func printCommitments(crs []v1alpha1.CommittedResource, showUsage bool) { printHeader(fmt.Sprintf("CommittedResources (%d)", len(crs))) if len(crs) == 0 { @@ -311,15 +312,21 @@ func printCommitments(crs []v1alpha1.CommittedResource) { stateColour(cr.Spec.State), cr.Spec.Amount.String(), func() string { - if cr.Status.AcceptedAmount == nil { + if cr.Status.AcceptedSpec == nil { return gray("—") } - return cr.Status.AcceptedAmount.String() + return cr.Status.AcceptedSpec.Amount.String() }(), ) - if cr.Status.UsedAmount != nil { - fmt.Printf(" used=%-12s\n", cr.Status.UsedAmount.String()) + if mem, ok := cr.Status.UsedResources["memory"]; ok { + cpu := cr.Status.UsedResources["cpu"] + usageAgeStr := gray("—") + if cr.Status.LastUsageReconcileAt != nil { + usageAgeStr = age(cr.Status.LastUsageReconcileAt) + } + fmt.Printf(" used=%-12s usedCPU=%-10s instances=%-4d usage-age=%s\n", + mem.String(), cpu.String(), len(cr.Status.AssignedInstances), usageAgeStr) } endStr := gray("no expiry") @@ -332,6 +339,13 @@ func printCommitments(crs []v1alpha1.CommittedResource) { } } fmt.Printf(" age=%-8s %s\n", age(&cr.CreationTimestamp), endStr) + + if showUsage && len(cr.Status.AssignedInstances) > 0 { + fmt.Printf(" assigned instances (%d):\n", len(cr.Status.AssignedInstances)) + for _, inst := range 
diff --git a/tools/visualize-committed-resources/main.go b/tools/visualize-committed-resources/main.go
index afa16a372..f722f8b8d 100644
--- a/tools/visualize-committed-resources/main.go
+++ b/tools/visualize-committed-resources/main.go
@@ -15,7 +15,7 @@
 //	--filter-group=name   Show only CRs for this flavor group (substring match)
 //	--filter-state=state  Show only CRs in this state (e.g. confirmed, reserving)
 //	--active              Shorthand: show only confirmed/guaranteed CRs
-//	--views=v1,v2,...     Views to show (default: all). Available: summary, commitments, reservations, allocations
+//	--views=v1,v2,...     Views to show (default: all). Available: summary, commitments, reservations, allocations, usage
 //	--hide=v1,v2,...      Views to hide (applied after --views)
 //	--watch=interval      Refresh interval (e.g. 2s, 5s). Clears screen between refreshes.
 package main
@@ -72,9 +72,10 @@ const (
 	viewCommitments  = "commitments"
 	viewReservations = "reservations"
 	viewAllocations  = "allocations"
+	viewUsage        = "usage"
 )

-var allViews = []string{viewSummary, viewCommitments, viewReservations, viewAllocations}
+var allViews = []string{viewSummary, viewCommitments, viewReservations, viewAllocations, viewUsage}

 type viewSet map[string]bool

@@ -292,7 +293,7 @@ func printSummary(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation) {
 	)
 }

-func printCommitments(crs []v1alpha1.CommittedResource) {
+func printCommitments(crs []v1alpha1.CommittedResource, showUsage bool) {
 	printHeader(fmt.Sprintf("CommittedResources (%d)", len(crs)))

 	if len(crs) == 0 {
@@ -311,15 +312,21 @@ func printCommitments(crs []v1alpha1.CommittedResource) {
 			stateColour(cr.Spec.State),
 			cr.Spec.Amount.String(),
 			func() string {
-				if cr.Status.AcceptedAmount == nil {
+				if cr.Status.AcceptedSpec == nil {
 					return gray("—")
 				}
-				return cr.Status.AcceptedAmount.String()
+				return cr.Status.AcceptedSpec.Amount.String()
 			}(),
 		)

-		if cr.Status.UsedAmount != nil {
-			fmt.Printf("         used=%-12s\n", cr.Status.UsedAmount.String())
+		if mem, ok := cr.Status.UsedResources["memory"]; ok {
+			cpu := cr.Status.UsedResources["cpu"]
+			usageAgeStr := gray("—")
+			if cr.Status.LastUsageReconcileAt != nil {
+				usageAgeStr = age(cr.Status.LastUsageReconcileAt)
+			}
+			fmt.Printf("         used=%-12s usedCPU=%-10s instances=%-4d usage-age=%s\n",
+				mem.String(), cpu.String(), len(cr.Status.AssignedInstances), usageAgeStr)
 		}

 		endStr := gray("no expiry")
@@ -332,6 +339,13 @@
 			}
 		}
 		fmt.Printf("         age=%-8s %s\n", age(&cr.CreationTimestamp), endStr)
+
+		if showUsage && len(cr.Status.AssignedInstances) > 0 {
+			fmt.Printf("         assigned instances (%d):\n", len(cr.Status.AssignedInstances))
+			for _, inst := range cr.Status.AssignedInstances {
+				fmt.Printf("           %s\n", gray(inst))
+			}
+		}
 	}
 }

@@ -586,7 +600,7 @@ func printSnapshot(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation,
 		printSummary(crs, reservations)
 	}
 	if views.has(viewCommitments) {
-		printCommitments(crs)
+		printCommitments(crs, views.has(viewUsage))
 	}
 	if views.has(viewReservations) {
 		printReservations(crs, reservations, views.has(viewAllocations))
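With viewUsage registered in allViews, the per-instance listing is on by default and is toggled like any other view. A hypothetical invocation, following the flag documentation in the file header:

	go run ./tools/visualize-committed-resources --views=commitments,usage --watch=5s

Because --hide is applied after --views, passing --hide=usage keeps the commitments table but makes printSnapshot call printCommitments with showUsage=false, dropping only the assigned-instances block.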