diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go index 1d10d5c30..7d3bb33ef 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go @@ -23,6 +23,72 @@ import ( hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" ) +type kvmHost struct { + hv1.Hypervisor +} + +// getResourceCapacity attempts to retrieve the effective capacity for the specified resource from the hypervisor status, falling back to the physical capacity if effective capacity is not available. It returns the capacity quantity and a boolean indicating whether any capacity information was found. +func (k kvmHost) getResourceCapacity(resourceName hv1.ResourceName) (capacity resource.Quantity, ok bool) { + if k.Status.EffectiveCapacity != nil { + qty, exists := k.Status.EffectiveCapacity[resourceName] + if exists && !qty.IsZero() { + return qty, true + } + } + if k.Status.Capacity == nil { + return resource.Quantity{}, false + } + qty, exists := k.Status.Capacity[resourceName] + if !exists || qty.IsZero() { + return resource.Quantity{}, false + } + return qty, true +} + +func (k kvmHost) getResourceAllocation(resourceName hv1.ResourceName) (allocation resource.Quantity) { + if k.Status.Allocation == nil { + return resource.MustParse("0") + } + + qty, exists := k.Status.Allocation[resourceName] + if !exists { + return resource.MustParse("0") + } + return qty +} + +func (k kvmHost) getLabels() kvmHostLabels { + decommissioned := false + externalCustomer := false + workloadType := "general-purpose" + cpuArchitecture := "cascade-lake" + + for _, trait := range k.Status.Traits { + switch trait { + case "CUSTOM_HW_SAPPHIRE_RAPIDS": + cpuArchitecture = "sapphire-rapids" + case "CUSTOM_HANA_EXCLUSIVE_HOST": + workloadType = "hana" + case "CUSTOM_DECOMMISSIONING": + decommissioned = true + case "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE": + externalCustomer = true + } + } + + return kvmHostLabels{ + computeHost: k.Name, + availabilityZone: k.Labels["topology.kubernetes.io/zone"], + buildingBlock: getBuildingBlock(k.Name), + cpuArchitecture: cpuArchitecture, + workloadType: workloadType, + enabled: strconv.FormatBool(true), + decommissioned: strconv.FormatBool(decommissioned), + externalCustomer: strconv.FormatBool(externalCustomer), + maintenance: strconv.FormatBool(false), + } +} + // Assuming hypervisor names are in the format nodeXXX-bbYY func getBuildingBlock(hostName string) string { parts := strings.Split(hostName, "-") @@ -167,10 +233,23 @@ func aggregateReservationsByHost(reservations []v1alpha1.Reservation) ( return failoverByHost, committedNotInUseByHost } -func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { +func (k *KVMResourceCapacityKPI) getHypervisors() ([]kvmHost, error) { hvs := &hv1.HypervisorList{} if err := k.Client.List(context.Background(), hvs); err != nil { - slog.Error("failed to list hypervisors", "error", err) + return nil, err + } + + hosts := make([]kvmHost, len(hvs.Items)) + for i, hv := range hvs.Items { + hosts[i] = kvmHost{Hypervisor: hv} + } + return hosts, nil +} + +func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { + hypervisors, err := k.getHypervisors() + if err != nil { + slog.Error("failed to get hypervisors", "error", err) return } @@ -182,34 +261,17 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { failoverByHost, committedNotInUseByHost := aggregateReservationsByHost(reservations.Items) - for _, hypervisor := range hvs.Items { - if hypervisor.Status.EffectiveCapacity == nil { - slog.Warn("hypervisor with nil effective capacity, skipping", "host", hypervisor.Name) - continue - } - - cpuTotal, hasCPUTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceCPU] - ramTotal, hasRAMTotal := hypervisor.Status.EffectiveCapacity[hv1.ResourceMemory] + for _, hypervisor := range hypervisors { + cpuTotal, hasCPUTotal := hypervisor.getResourceCapacity(hv1.ResourceCPU) + ramTotal, hasRAMTotal := hypervisor.getResourceCapacity(hv1.ResourceMemory) if !hasCPUTotal || !hasRAMTotal { - slog.Error("hypervisor missing cpu or ram total capacity", "hypervisor", hypervisor.Name) + slog.Warn("hypervisor missing cpu or ram capacity, skipping", "host", hypervisor.Name) continue } - if cpuTotal.IsZero() || ramTotal.IsZero() { - slog.Warn("hypervisor with zero cpu or ram total capacity, skipping", "host", hypervisor.Name) - continue - } - - cpuUsed, hasCPUUtilized := hypervisor.Status.Allocation[hv1.ResourceCPU] - if !hasCPUUtilized { - cpuUsed = resource.MustParse("0") - } - - ramUsed, hasRAMUtilized := hypervisor.Status.Allocation[hv1.ResourceMemory] - if !hasRAMUtilized { - ramUsed = resource.MustParse("0") - } + cpuUsed := hypervisor.getResourceAllocation(hv1.ResourceCPU) + ramUsed := hypervisor.getResourceAllocation(hv1.ResourceMemory) // Get reservation data for this hypervisor (zero-value if absent). failoverRes := failoverByHost[hypervisor.Name] @@ -220,7 +282,7 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { cpuFailover := failoverRes.cpu ramFailover := failoverRes.memory - labels := hostLabelsFromHypervisor(hypervisor) + labels := hypervisor.getLabels() k.emitTotal(ch, "cpu", cpuTotal.AsApproximateFloat64(), labels) k.emitTotal(ch, "ram", ramTotal.AsApproximateFloat64(), labels) @@ -239,11 +301,17 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { paygCPU.Sub(cpuUsed) paygCPU.Sub(cpuReserved) paygCPU.Sub(cpuFailover) + if paygCPU.Cmp(resource.MustParse("0")) < 0 { + paygCPU = resource.MustParse("0") + } paygRAM := ramTotal.DeepCopy() paygRAM.Sub(ramUsed) paygRAM.Sub(ramReserved) paygRAM.Sub(ramFailover) + if paygRAM.Cmp(resource.MustParse("0")) < 0 { + paygRAM = resource.MustParse("0") + } k.emitUsage(ch, "cpu", paygCPU.AsApproximateFloat64(), "payg", labels) k.emitUsage(ch, "ram", paygRAM.AsApproximateFloat64(), "payg", labels) @@ -263,38 +331,6 @@ type kvmHostLabels struct { maintenance string } -func hostLabelsFromHypervisor(hypervisor hv1.Hypervisor) kvmHostLabels { - decommissioned := false - externalCustomer := false - workloadType := "general-purpose" - cpuArchitecture := "cascade-lake" - - for _, trait := range hypervisor.Status.Traits { - switch trait { - case "CUSTOM_HW_SAPPHIRE_RAPIDS": - cpuArchitecture = "sapphire-rapids" - case "CUSTOM_HANA_EXCLUSIVE_HOST": - workloadType = "hana" - case "CUSTOM_DECOMMISSIONING": - decommissioned = true - case "CUSTOM_EXTERNAL_CUSTOMER_EXCLUSIVE": - externalCustomer = true - } - } - - return kvmHostLabels{ - computeHost: hypervisor.Name, - availabilityZone: hypervisor.Labels["topology.kubernetes.io/zone"], - buildingBlock: getBuildingBlock(hypervisor.Name), - cpuArchitecture: cpuArchitecture, - workloadType: workloadType, - enabled: strconv.FormatBool(true), - decommissioned: strconv.FormatBool(decommissioned), - externalCustomer: strconv.FormatBool(externalCustomer), - maintenance: strconv.FormatBool(false), - } -} - func (k *KVMResourceCapacityKPI) emitTotal(ch chan<- prometheus.Metric, resourceName string, value float64, l kvmHostLabels) { ch <- prometheus.MustNewConstMetric( k.totalCapacityPerHost, diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index e692df04f..6e9d38c7b 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -125,6 +125,130 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, expectedMetrics: []kvmExpectedMetric{}, }, + { + name: "nil effective capacity falls back to physical capacity", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: nil, + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi + }, + }, + { + name: "zero effective capacity falls back to physical capacity", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi + }, + }, + { + name: "zero effective capacity with nil physical capacity skips", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Capacity: nil, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{}, + }, + { + name: "zero effective capacity with zero physical capacity skips", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Capacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), + }, + Traits: []string{}, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{}, + }, { name: "single hypervisor with default traits, no reservations", hypervisors: []hv1.Hypervisor{ @@ -1075,6 +1199,81 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 188978561024), // 512Gi-256Gi-0-80Gi = 176Gi }, }, + { + name: "payg capacity clamped to zero when overcommitted", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{"topology.kubernetes.io/zone": "qa-1a"}, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("100"), + hv1.ResourceMemory: resource.MustParse("200Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("80"), + hv1.ResourceMemory: resource.MustParse("150Gi"), + }, + Traits: []string{}, + }, + }, + }, + // failover=20 CPU/40Gi RAM, committed reserved=20 CPU/40Gi RAM (no allocations) + // CPU: 100 - 80 - 20 - 20 = -20 → clamped to 0 + // RAM: 200Gi - 150Gi - 40Gi - 40Gi = -30Gi → clamped to 0 + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{Name: "failover-1"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("20"), + hv1.ResourceMemory: resource.MustParse("40Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + { + ObjectMeta: v1.ObjectMeta{Name: "committed-1"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("20"), + hv1.ResourceMemory: resource.MustParse("40Gi"), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 100), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 214748364800), // 200Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 80), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 161061273600), // 150Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 20), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 42949672960), // 40Gi + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 20), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 42949672960), // 40Gi + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 0), + }, + }, } for _, tt := range tests {