From 11695132419b5ffcae046db8468f0faaedd36851 Mon Sep 17 00:00:00 2001
From: Markus Wieland
Date: Thu, 23 Apr 2026 13:02:57 +0200
Subject: [PATCH 1/2] Fall back to physical capacity in cortex capacity KPI

---
 .../plugins/compute/resource_capacity_kvm.go  |  26 +++-
 .../compute/resource_capacity_kvm_test.go     | 124 ++++++++++++++++++
 2 files changed, 144 insertions(+), 6 deletions(-)
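Notes: the fallback below prefers the effective (overcommit-adjusted) capacity
and only reports the raw physical capacity when the effective values are
missing or zero, so the metric stays conservative instead of disappearing for
the host. A minimal sketch of the selection order, assuming the hv1 types used
in the diff (pickCapacity is a hypothetical helper for illustration, not part
of this patch):

	// pickCapacity returns the capacity map that should back the KPI:
	// the overcommit-adjusted effective capacity when the operator has
	// published one, otherwise the raw physical capacity.
	func pickCapacity(status hv1.HypervisorStatus) map[hv1.ResourceName]resource.Quantity {
		if status.EffectiveCapacity != nil {
			return status.EffectiveCapacity
		}
		// Physical capacity ignores overcommit, but a conservative
		// total beats dropping the host from the metric entirely.
		return status.Capacity // may still be nil; the caller must check
	}

Zero-valued effective entries are handled separately in the loop below: they
trigger a fall back to Status.Capacity for both resources before the host is
skipped.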
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
index 1d10d5c30..e250892a2 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
+++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
@@ -183,13 +183,18 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
 	failoverByHost, committedNotInUseByHost := aggregateReservationsByHost(reservations.Items)
 
 	for _, hypervisor := range hvs.Items {
-		if hypervisor.Status.EffectiveCapacity == nil {
-			slog.Warn("hypervisor with nil effective capacity, skipping", "host", hypervisor.Name)
+		capacityMap := hypervisor.Status.EffectiveCapacity
+		if capacityMap == nil {
+			slog.Warn("hypervisor with nil effective capacity, falling back to physical capacity (overcommit not considered)", "host", hypervisor.Name)
+			capacityMap = hypervisor.Status.Capacity
+		}
+		if capacityMap == nil {
+			slog.Warn("hypervisor with nil capacity, skipping", "host", hypervisor.Name)
 			continue
 		}
 
-		cpuTotal, hasCPUTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceCPU]
-		ramTotal, hasRAMTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceMemory]
+		cpuTotal, hasCPUTotal := capacityMap[hv1.ResourceCPU]
+		ramTotal, hasRAMTotal := capacityMap[hv1.ResourceMemory]
 
 		if !hasCPUTotal || !hasRAMTotal {
 			slog.Error("hypervisor missing cpu or ram total capacity", "hypervisor", hypervisor.Name)
@@ -197,8 +202,17 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
 		}
 
 		if cpuTotal.IsZero() || ramTotal.IsZero() {
-			slog.Warn("hypervisor with zero cpu or ram total capacity, skipping", "host", hypervisor.Name)
-			continue
+			slog.Warn("hypervisor with zero effective capacity, falling back to physical capacity (overcommit not considered)", "host", hypervisor.Name)
+			if hypervisor.Status.Capacity == nil {
+				slog.Warn("hypervisor with nil physical capacity, skipping", "host", hypervisor.Name)
+				continue
+			}
+			cpuTotal = hypervisor.Status.Capacity[hv1.ResourceCPU]
+			ramTotal = hypervisor.Status.Capacity[hv1.ResourceMemory]
+			if cpuTotal.IsZero() || ramTotal.IsZero() {
+				slog.Warn("hypervisor with zero physical capacity, skipping", "host", hypervisor.Name)
+				continue
+			}
 		}
 
 		cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation[hv1.ResourceCPU]
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
index e692df04f..724436724 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
+++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
@@ -125,6 +125,130 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) {
 			},
 			expectedMetrics: []kvmExpectedMetric{},
 		},
+		{
+			name: "nil effective capacity falls back to physical capacity",
+			hypervisors: []hv1.Hypervisor{
+				{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node001-bb088",
+						Labels: map[string]string{
+							"topology.kubernetes.io/zone": "qa-1a",
+						},
+					},
+					Status: hv1.HypervisorStatus{
+						EffectiveCapacity: nil,
+						Capacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("128"),
+							hv1.ResourceMemory: resource.MustParse("512Gi"),
+						},
+						Allocation: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("64"),
+							hv1.ResourceMemory: resource.MustParse("256Gi"),
+						},
+						Traits: []string{},
+					},
+				},
+			},
+			expectedMetrics: []kvmExpectedMetric{
+				totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128),
+				totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi
+				usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64),
+				usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi
+				usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0
+				usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi
+			},
+		},
+		{
+			name: "zero effective capacity falls back to physical capacity",
+			hypervisors: []hv1.Hypervisor{
+				{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node001-bb088",
+						Labels: map[string]string{
+							"topology.kubernetes.io/zone": "qa-1a",
+						},
+					},
+					Status: hv1.HypervisorStatus{
+						EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("0"),
+							hv1.ResourceMemory: resource.MustParse("0"),
+						},
+						Capacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("128"),
+							hv1.ResourceMemory: resource.MustParse("512Gi"),
+						},
+						Allocation: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("64"),
+							hv1.ResourceMemory: resource.MustParse("256Gi"),
+						},
+						Traits: []string{},
+					},
+				},
+			},
+			expectedMetrics: []kvmExpectedMetric{
+				totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128),
+				totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi
+				usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64),
+				usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi
+				usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0
+				usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi
+			},
+		},
+		{
+			name: "zero effective capacity with nil physical capacity skips",
+			hypervisors: []hv1.Hypervisor{
+				{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node001-bb088",
+						Labels: map[string]string{
+							"topology.kubernetes.io/zone": "qa-1a",
+						},
+					},
+					Status: hv1.HypervisorStatus{
+						EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("0"),
+							hv1.ResourceMemory: resource.MustParse("0"),
+						},
+						Capacity: nil,
+						Traits:   []string{},
+					},
+				},
+			},
+			expectedMetrics: []kvmExpectedMetric{},
+		},
+		{
+			name: "zero effective capacity with zero physical capacity skips",
+			hypervisors: []hv1.Hypervisor{
+				{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node001-bb088",
+						Labels: map[string]string{
+							"topology.kubernetes.io/zone": "qa-1a",
+						},
+					},
+					Status: hv1.HypervisorStatus{
+						EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("0"),
+							hv1.ResourceMemory: resource.MustParse("0"),
+						},
+						Capacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("0"),
+							hv1.ResourceMemory: resource.MustParse("0"),
+						},
+						Traits: []string{},
+					},
+				},
+			},
+			expectedMetrics: []kvmExpectedMetric{},
+		},
 		{
 			name: "single hypervisor with default traits, no reservations",
 			hypervisors: []hv1.Hypervisor{
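Taken together, the four new cases pin down the complete fallback matrix for
the capacity totals (values from the test fixtures above):

	effective capacity | physical capacity | reported total
	-------------------+-------------------+------------------------
	nil                | 128 / 512Gi       | physical (128 / 512Gi)
	zero               | 128 / 512Gi       | physical (128 / 512Gi)
	zero               | nil               | none, host skipped
	zero               | zero              | none, host skipped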
From 3d843a8d18c39ec446dedfd3d4d92dbc2010af3f Mon Sep 17 00:00:00 2001
From: Markus Wieland
Date: Thu, 23 Apr 2026 14:21:19 +0200
Subject: [PATCH 2/2] Refactor KVM capacity handling into kvmHost helpers and
 clamp payg capacity at zero

---
 .../plugins/compute/resource_capacity_kvm.go  | 166 ++++++++++--------
 .../compute/resource_capacity_kvm_test.go     |  75 ++++++++
 2 files changed, 169 insertions(+), 72 deletions(-)
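Notes: getResourceCapacity folds the two-step fallback from the previous patch
into a single per-resource lookup (note that CPU and memory now fall back
independently of each other), and Collect clamps the derived payg capacity so
reservations that exceed the remaining headroom can no longer push the metric
below zero. A minimal usage sketch, assuming the kvmHost type from the diff
below; the host literal is illustrative only:

	host := kvmHost{Hypervisor: hv1.Hypervisor{
		Status: hv1.HypervisorStatus{
			// Effective capacity is zero, so the lookup falls back
			// to the physical capacity.
			EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
				hv1.ResourceCPU: resource.MustParse("0"),
			},
			Capacity: map[hv1.ResourceName]resource.Quantity{
				hv1.ResourceCPU: resource.MustParse("128"),
			},
		},
	}}
	if cpu, ok := host.getResourceCapacity(hv1.ResourceCPU); ok {
		payg := cpu.DeepCopy()              // cpu == 128 here
		payg.Sub(resource.MustParse("150")) // overcommitted: now -22
		if payg.Sign() < 0 {                // same clamp as Collect, via Sign instead of Cmp
			payg = resource.MustParse("0")
		}
		_ = payg // what Collect would emit as the payg usage metric
	}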
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
index e250892a2..7d3bb33ef 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
+++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go
@@ -23,6 +23,72 @@ import (
 	hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
 )
 
+type kvmHost struct {
+	hv1.Hypervisor
+}
+
+// getResourceCapacity retrieves the effective capacity for the specified resource from the hypervisor status, falling back to the physical capacity if the effective capacity is missing or zero. It returns the capacity quantity and a boolean indicating whether any usable capacity was found.
+func (k kvmHost) getResourceCapacity(resourceName hv1.ResourceName) (capacity resource.Quantity, ok bool) {
+	if k.Status.EffectiveCapacity != nil {
+		qty, exists := k.Status.EffectiveCapacity[resourceName]
+		if exists && !qty.IsZero() {
+			return qty, true
+		}
+	}
+	if k.Status.Capacity == nil {
+		return resource.Quantity{}, false
+	}
+	qty, exists := k.Status.Capacity[resourceName]
+	if !exists || qty.IsZero() {
+		return resource.Quantity{}, false
+	}
+	return qty, true
+}
+
+func (k kvmHost) getResourceAllocation(resourceName hv1.ResourceName) (allocation resource.Quantity) {
+	if k.Status.Allocation == nil {
+		return resource.MustParse("0")
+	}
+
+	qty, exists := k.Status.Allocation[resourceName]
+	if !exists {
+		return resource.MustParse("0")
+	}
+	return qty
+}
+
+func (k kvmHost) getLabels() kvmHostLabels {
+	decommissioned := false
+	externalCustomer := false
+	workloadType := "general-purpose"
+	cpuArchitecture := "cascade-lake"
+
+	for _, trait := range k.Status.Traits {
+		switch trait {
+		case "CUSTOM_HW_SAPPHIRE_RAPIDS":
+			cpuArchitecture = "sapphire-rapids"
+		case "CUSTOM_HANA_EXCLUSIVE_HOST":
+			workloadType = "hana"
+		case "CUSTOM_DECOMMISSIONING":
+			decommissioned = true
+		case "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE":
+			externalCustomer = true
+		}
+	}
+
+	return kvmHostLabels{
+		computeHost:      k.Name,
+		availabilityZone: k.Labels["topology.kubernetes.io/zone"],
+		buildingBlock:    getBuildingBlock(k.Name),
+		cpuArchitecture:  cpuArchitecture,
+		workloadType:     workloadType,
+		enabled:          strconv.FormatBool(true),
+		decommissioned:   strconv.FormatBool(decommissioned),
+		externalCustomer: strconv.FormatBool(externalCustomer),
+		maintenance:      strconv.FormatBool(false),
+	}
+}
+
 // Assuming hypervisor names are in the format nodeXXX-bbYY
 func getBuildingBlock(hostName string) string {
 	parts := strings.Split(hostName, "-")
@@ -167,10 +233,23 @@ func aggregateReservationsByHost(reservations []v1alpha1.Reservation) (
 	return failoverByHost, committedNotInUseByHost
 }
 
-func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
+func (k *KVMResourceCapacityKPI) getHypervisors() ([]kvmHost, error) {
 	hvs := &hv1.HypervisorList{}
 	if err := k.Client.List(context.Background(), hvs); err != nil {
-		slog.Error("failed to list hypervisors", "error", err)
+		return nil, err
+	}
+
+	hosts := make([]kvmHost, len(hvs.Items))
+	for i, hv := range hvs.Items {
+		hosts[i] = kvmHost{Hypervisor: hv}
+	}
+	return hosts, nil
+}
+
+func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
+	hypervisors, err := k.getHypervisors()
+	if err != nil {
+		slog.Error("failed to get hypervisors", "error", err)
 		return
 	}
 
@@ -182,48 +261,17 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
 
 	failoverByHost, committedNotInUseByHost := aggregateReservationsByHost(reservations.Items)
 
-	for _, hypervisor := range hvs.Items {
-		capacityMap := hypervisor.Status.EffectiveCapacity
-		if capacityMap == nil {
-			slog.Warn("hypervisor with nil effective capacity, falling back to physical capacity (overcommit not considered)", "host", hypervisor.Name)
-			capacityMap = hypervisor.Status.Capacity
-		}
-		if capacityMap == nil {
-			slog.Warn("hypervisor with nil capacity, skipping", "host", hypervisor.Name)
-			continue
-		}
-
-		cpuTotal, hasCPUTotal := capacityMap[hv1.ResourceCPU]
-		ramTotal, hasRAMTotal := capacityMap[hv1.ResourceMemory]
+	for _, hypervisor := range hypervisors {
+		cpuTotal, hasCPUTotal := hypervisor.getResourceCapacity(hv1.ResourceCPU)
+		ramTotal, hasRAMTotal := hypervisor.getResourceCapacity(hv1.ResourceMemory)
 
 		if !hasCPUTotal || !hasRAMTotal {
-			slog.Error("hypervisor missing cpu or ram total capacity", "hypervisor", hypervisor.Name)
+			slog.Warn("hypervisor missing cpu or ram capacity, skipping", "host", hypervisor.Name)
 			continue
 		}
 
-		if cpuTotal.IsZero() || ramTotal.IsZero() {
-			slog.Warn("hypervisor with zero effective capacity, falling back to physical capacity (overcommit not considered)", "host", hypervisor.Name)
-			if hypervisor.Status.Capacity == nil {
-				slog.Warn("hypervisor with nil physical capacity, skipping", "host", hypervisor.Name)
-				continue
-			}
-			cpuTotal = hypervisor.Status.Capacity[hv1.ResourceCPU]
-			ramTotal = hypervisor.Status.Capacity[hv1.ResourceMemory]
-			if cpuTotal.IsZero() || ramTotal.IsZero() {
-				slog.Warn("hypervisor with zero physical capacity, skipping", "host", hypervisor.Name)
-				continue
-			}
-		}
-
-		cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation[hv1.ResourceCPU]
-		if !hasCPUUtilized {
-			cpuUsed = resource.MustParse("0")
-		}
-
-		ramUsed, hasRAMUtilized := hypervisor.Status.Allocation[hv1.ResourceMemory]
-		if !hasRAMUtilized {
-			ramUsed = resource.MustParse("0")
-		}
+		cpuUsed := hypervisor.getResourceAllocation(hv1.ResourceCPU)
+		ramUsed := hypervisor.getResourceAllocation(hv1.ResourceMemory)
 
 		// Get reservation data for this hypervisor (zero-value if absent).
 		failoverRes := failoverByHost[hypervisor.Name]
@@ -234,7 +282,7 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
 		cpuFailover := failoverRes.cpu
 		ramFailover := failoverRes.memory
 
-		labels := hostLabelsFromHypervisor(hypervisor)
+		labels := hypervisor.getLabels()
 
 		k.emitTotal(ch, "cpu", cpuTotal.AsApproximateFloat64(), labels)
 		k.emitTotal(ch, "ram", ramTotal.AsApproximateFloat64(), labels)
@@ -253,11 +301,17 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) {
 		paygCPU.Sub(cpuUsed)
 		paygCPU.Sub(cpuReserved)
 		paygCPU.Sub(cpuFailover)
+		if paygCPU.Cmp(resource.MustParse("0")) < 0 {
+			paygCPU = resource.MustParse("0")
+		}
 
 		paygRAM := ramTotal.DeepCopy()
 		paygRAM.Sub(ramUsed)
 		paygRAM.Sub(ramReserved)
 		paygRAM.Sub(ramFailover)
+		if paygRAM.Cmp(resource.MustParse("0")) < 0 {
+			paygRAM = resource.MustParse("0")
+		}
 
 		k.emitUsage(ch, "cpu", paygCPU.AsApproximateFloat64(), "payg", labels)
 		k.emitUsage(ch, "ram", paygRAM.AsApproximateFloat64(), "payg", labels)
@@ -277,38 +331,6 @@ type kvmHostLabels struct {
 	maintenance string
 }
 
-func hostLabelsFromHypervisor(hypervisor hv1.Hypervisor) kvmHostLabels {
-	decommissioned := false
-	externalCustomer := false
-	workloadType := "general-purpose"
-	cpuArchitecture := "cascade-lake"
-
-	for _, trait := range hypervisor.Status.Traits {
-		switch trait {
-		case "CUSTOM_HW_SAPPHIRE_RAPIDS":
-			cpuArchitecture = "sapphire-rapids"
-		case "CUSTOM_HANA_EXCLUSIVE_HOST":
-			workloadType = "hana"
-		case "CUSTOM_DECOMMISSIONING":
-			decommissioned = true
-		case "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE":
-			externalCustomer = true
-		}
-	}
-
-	return kvmHostLabels{
-		computeHost:      hypervisor.Name,
-		availabilityZone: hypervisor.Labels["topology.kubernetes.io/zone"],
-		buildingBlock:    getBuildingBlock(hypervisor.Name),
-		cpuArchitecture:  cpuArchitecture,
-		workloadType:     workloadType,
-		enabled:          strconv.FormatBool(true),
-		decommissioned:   strconv.FormatBool(decommissioned),
-		externalCustomer: strconv.FormatBool(externalCustomer),
-		maintenance:      strconv.FormatBool(false),
-	}
-}
-
 func (k *KVMResourceCapacityKPI) emitTotal(ch chan<- prometheus.Metric, resourceName string, value float64, l kvmHostLabels) {
 	ch <- prometheus.MustNewConstMetric(
 		k.totalCapacityPerHost,
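The table-driven cases below assert on every emitted series. For ad-hoc checks
of a collector like this one, client_golang's testutil helpers are useful as
well; a minimal sketch, assuming kpi is a fully wired *KVMResourceCapacityKPI
(fake client, descriptors registered):

	import "github.com/prometheus/client_golang/prometheus/testutil"

	// CollectAndLint runs the collector once and applies promlint checks.
	if problems, err := testutil.CollectAndLint(kpi); err != nil || len(problems) > 0 {
		t.Fatalf("lint problems: %v, err: %v", problems, err)
	}
	// CollectAndCount returns the number of series the collector emitted.
	if n := testutil.CollectAndCount(kpi); n == 0 {
		t.Fatal("expected the collector to emit at least one series")
	}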
diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
index 724436724..6e9d38c7b 100644
--- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
+++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go
@@ -1199,6 +1199,81 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) {
 				usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 188978561024), // 512Gi-256Gi-0-80Gi = 176Gi
 			},
 		},
+		{
+			name: "payg capacity clamped to zero when overcommitted",
+			hypervisors: []hv1.Hypervisor{
+				{
+					ObjectMeta: v1.ObjectMeta{
+						Name:   "node001-bb088",
+						Labels: map[string]string{"topology.kubernetes.io/zone": "qa-1a"},
+					},
+					Status: hv1.HypervisorStatus{
+						EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("100"),
+							hv1.ResourceMemory: resource.MustParse("200Gi"),
+						},
+						Allocation: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("80"),
+							hv1.ResourceMemory: resource.MustParse("150Gi"),
+						},
+						Traits: []string{},
+					},
+				},
+			},
+			// failover=20 CPU/40Gi RAM, committed reserved=20 CPU/40Gi RAM (no allocations)
+			// CPU: 100 - 80 - 20 - 20 = -20 → clamped to 0
+			// RAM: 200Gi - 150Gi - 40Gi - 40Gi = -30Gi → clamped to 0
+			reservations: []v1alpha1.Reservation{
+				{
+					ObjectMeta: v1.ObjectMeta{Name: "failover-1"},
+					Spec: v1alpha1.ReservationSpec{
+						Type:             v1alpha1.ReservationTypeFailover,
+						SchedulingDomain: v1alpha1.SchedulingDomainNova,
+						Resources: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("20"),
+							hv1.ResourceMemory: resource.MustParse("40Gi"),
+						},
+						FailoverReservation: &v1alpha1.FailoverReservationSpec{},
+					},
+					Status: v1alpha1.ReservationStatus{
+						Host: "node001-bb088",
+						Conditions: []v1.Condition{
+							{Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue},
+						},
+					},
+				},
+				{
+					ObjectMeta: v1.ObjectMeta{Name: "committed-1"},
+					Spec: v1alpha1.ReservationSpec{
+						Type:             v1alpha1.ReservationTypeCommittedResource,
+						SchedulingDomain: v1alpha1.SchedulingDomainNova,
+						Resources: map[hv1.ResourceName]resource.Quantity{
+							hv1.ResourceCPU:    resource.MustParse("20"),
+							hv1.ResourceMemory: resource.MustParse("40Gi"),
+						},
+						CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{},
+					},
+					Status: v1alpha1.ReservationStatus{
+						Host: "node001-bb088",
+						Conditions: []v1.Condition{
+							{Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue},
+						},
+					},
+				},
+			},
+			expectedMetrics: []kvmExpectedMetric{
+				totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 100),
+				totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 214748364800), // 200Gi
+				usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 80),
+				usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 161061273600), // 150Gi
+				usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 20),
+				usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 42949672960), // 40Gi
+				usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 20),
+				usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 42949672960), // 40Gi
+				usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 0),
+				usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 0),
+			},
+		},
 	}
 
 	for _, tt := range tests {