From 6e8388b29283087b736d90897cdc5b259fd0e0c3 Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Wed, 29 Apr 2026 13:29:47 +0200 Subject: [PATCH 1/5] refactor: vmware host capacity kpi Co-authored-by: Copilot --- helm/bundles/cortex-nova/templates/kpis.yaml | 30 +- .../kpis/plugins/compute/refactor/shared.go | 150 ++++++ .../compute/resource_capacity_kvm_test.go | 11 + .../compute/resource_capacity_vmware.go | 201 ------- .../compute/resource_capacity_vmware_test.go | 503 ------------------ .../kpis/plugins/infrastructure/shared.go | 1 + .../infrastructure/vmware_host_capacity.go | 119 +++++ .../vmware_host_capacity_test.go | 335 ++++++++++++ internal/knowledge/kpis/supported_kpis.go | 2 +- 9 files changed, 632 insertions(+), 720 deletions(-) create mode 100644 internal/knowledge/kpis/plugins/compute/refactor/shared.go delete mode 100644 internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go delete mode 100644 internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go create mode 100644 internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index 22774c62a..6979b0e29 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -29,20 +29,6 @@ spec: --- apiVersion: cortex.cloud/v1alpha1 kind: KPI -metadata: - name: vmware-host-capacity -spec: - schedulingDomain: nova - impl: vmware_host_capacity_kpi - dependencies: - knowledges: - - name: host-details - - name: host-utilization - description: | - This KPI tracks the total, utilized, reserved and failover capacity of VMware hosts. ---- -apiVersion: cortex.cloud/v1alpha1 -kind: KPI metadata: name: host-running-vms spec: @@ -215,4 +201,18 @@ spec: - name: nova-flavors - name: limes-project-commitments description: | - This KPI tracks the resource commitments of projects running VMs on VMware hosts. \ No newline at end of file + This KPI tracks the resource commitments of projects running VMs on VMware hosts. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: KPI +metadata: + name: vmware-host-capacity +spec: + schedulingDomain: nova + impl: vmware_host_capacity_kpi + dependencies: + knowledges: + - name: host-details + - name: host-utilization + description: | + This KPI tracks the capacity and utilization of VMware hosts in terms of CPU, RAM, and disk resources. \ No newline at end of file diff --git a/internal/knowledge/kpis/plugins/compute/refactor/shared.go b/internal/knowledge/kpis/plugins/compute/refactor/shared.go new file mode 100644 index 000000000..d50698b1f --- /dev/null +++ b/internal/knowledge/kpis/plugins/compute/refactor/shared.go @@ -0,0 +1,150 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package refactor + +import ( + "fmt" + "regexp" + "strconv" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + hypervisorTypeIronic = "ironic" + hypervisorFamilyVMware = "vmware" + hypervisorFamilyKVM = "kvm" + + cpuArchCascadeLake = "cascade-lake" + cpuArchSapphireRapids = "sapphire-rapids" + + workloadTypeHANA = "hana" + workloadTypeGeneralPurpose = "general-purpose" + + hostDetailsKnowledge = "host-details" + hostUtilizationKnowledge = "host-utilization" + + commitmentStatusConfirmed = "confirmed" + commitmentStatusGuaranteed = "guaranteed" + + limesServiceCompute = "compute" + limesResourceCores = "cores" + limesResourceRAM = "ram" + + unitVCPU = "vCPU" + unitBytes = "B" + + // Flavor suffix indicating sapphire-rapids CPU architecture. + sapphireRapidsFlavorSuffix = "_v2" +) + +// kvmFlavorRegex matches KVM flavors where the second underscore-delimited segment is "k", e.g. "m1_k_small". +var kvmFlavorRegex = regexp.MustCompile(`^[^_]+_k_`) + +var vmwareHostLabels = []string{ + "availability_zone", + "compute_host", + "cpu_architecture", + "workload_type", + "hypervisor_family", + "enabled", + "decommissioned", + "external_customer", + "disabled_reason", + "pinned_projects", + "pinned_project_ids", +} + +var vmwareHostCapacityLabels = append(append([]string{}, vmwareHostLabels...), "resource", "unit") + +var vmwareProjectLabels = append(append([]string{}, vmwareHostLabels...), "project_id", "project_name", "flavor_name") + +var vmwareProjectCapacityLabels = append(append([]string{}, vmwareProjectLabels...), "resource", "unit") + +var kvmHostLabels = []string{ + "compute_host", + "availability_zone", + "building_block", + "cpu_architecture", + "workload_type", + "enabled", + "decommissioned", + "external_customer", + "maintenance", +} + +var kvmHostCapacityLabels = append(append([]string{}, kvmHostLabels...), "resource", "unit") + +var kvmProjectLabels = append(append([]string{}, kvmHostLabels...), "project_id", "project_name", "flavor_name") + +var kvmProjectCapacityLabels = append(append([]string{}, kvmProjectLabels...), "resource", "unit") + +// vmwareHost wraps HostDetails with Prometheus metric helpers. +type vmwareHost struct { + compute.HostDetails +} + +func (h vmwareHost) getHostLabels() []string { + pinnedProjectIds := "" + pinnedProjects := false + if h.PinnedProjects != nil { + pinnedProjectIds = *h.PinnedProjects + pinnedProjects = true + } + disabledReason := "-" + if h.DisabledReason != nil { + disabledReason = *h.DisabledReason + } + return []string{ + h.AvailabilityZone, + h.ComputeHost, + h.CPUArchitecture, + h.WorkloadType, + h.HypervisorFamily, + strconv.FormatBool(h.Enabled), + strconv.FormatBool(h.Decommissioned), + strconv.FormatBool(h.ExternalCustomer), + disabledReason, + strconv.FormatBool(pinnedProjects), + pinnedProjectIds, + } +} + +func (h vmwareHost) toCapacityMetric(desc *prometheus.Desc, resource, unit string, value float64) prometheus.Metric { + labels := append(h.getHostLabels(), resource, unit) + return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labels...) +} + +func (h vmwareHost) toInstanceCountMetric(desc *prometheus.Desc, value float64) prometheus.Metric { + return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, h.getHostLabels()...) +} + +// convertLimesMemoryToBytes converts a limes memory amount and its unit string to bytes. +func convertLimesMemoryToBytes(amount uint64, unit string) (float64, error) { + switch unit { + case "B", "": + return float64(amount), nil + case "KiB": + return float64(amount) * 1024, nil + case "MiB": + return float64(amount) * 1024 * 1024, nil + case "GiB": + return float64(amount) * 1024 * 1024 * 1024, nil + case "TiB": + return float64(amount) * 1024 * 1024 * 1024 * 1024, nil + default: + return 0, fmt.Errorf("unknown limes memory unit: %s", unit) + } +} + +// cpuArchForFlavor derives the CPU architecture from a flavor name. +// Flavors with a "_v2" suffix run on sapphire-rapids; all others on cascade-lake. +func cpuArchForFlavor(flavorName string) string { + if len(flavorName) >= len(sapphireRapidsFlavorSuffix) && + flavorName[len(flavorName)-len(sapphireRapidsFlavorSuffix):] == sapphireRapidsFlavorSuffix { + return cpuArchSapphireRapids + } + return cpuArchCascadeLake +} diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index 6e9d38c7b..c233cfd4c 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -4,6 +4,7 @@ package compute import ( + "regexp" "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -38,6 +39,16 @@ type kvmMetricLabels struct { Maintenance string } +var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`) + +func getMetricName(desc string) string { + match := fqNameRe.FindStringSubmatch(desc) + if len(match) > 1 { + return match[1] + } + return "" +} + type kvmExpectedMetric struct { Name string // metric family name (e.g. "cortex_kvm_host_capacity_total") Labels kvmMetricLabels diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go deleted file mode 100644 index 8bd2d4177..000000000 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "context" - "log/slog" - "strconv" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" -) - -type VMwareResourceCapacityKPI struct { - // Common base for all KPIs that provides standard functionality. - plugins.BaseKPI[struct{}] // No options passed through yaml config - - availableCapacityPerHost *prometheus.Desc - totalCapacityPerHost *prometheus.Desc -} - -func (VMwareResourceCapacityKPI) GetName() string { - return "vmware_host_capacity_kpi" -} - -func (k *VMwareResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { - if err := k.BaseKPI.Init(db, client, opts); err != nil { - return err - } - k.availableCapacityPerHost = prometheus.NewDesc( - "cortex_vmware_host_capacity_available", - "Available capacity per resource on the hosts currently (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "pinned_projects", - "disabled_reason", - "pinned_project_ids", - }, - nil, - ) - k.totalCapacityPerHost = prometheus.NewDesc( - "cortex_vmware_host_capacity_total", - "Total resources available on the hosts currently (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "pinned_projects", - "pinned_project_ids", - }, - nil, - ) - return nil -} - -func (k *VMwareResourceCapacityKPI) Describe(ch chan<- *prometheus.Desc) { - ch <- k.availableCapacityPerHost - ch <- k.totalCapacityPerHost -} - -func (k *VMwareResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { - hostDetailsKnowledge := &v1alpha1.Knowledge{} - if err := k.Client.Get( - context.Background(), - client.ObjectKey{Name: "host-details"}, - hostDetailsKnowledge, - ); err != nil { - slog.Error("failed to get knowledge host-details", "err", err) - return - } - hostDetails, err := v1alpha1. - UnboxFeatureList[compute.HostDetails](hostDetailsKnowledge.Status.Raw) - if err != nil { - slog.Error("failed to unbox storage pool cpu usage", "err", err) - return - } - detailsByComputeHost := make(map[string]compute.HostDetails) - for _, detail := range hostDetails { - detailsByComputeHost[detail.ComputeHost] = detail - } - - hostUtilizationKnowledge := &v1alpha1.Knowledge{} - if err := k.Client.Get( - context.Background(), - client.ObjectKey{Name: "host-utilization"}, - hostUtilizationKnowledge, - ); err != nil { - slog.Error("failed to get knowledge host-utilization", "err", err) - return - } - hostUtilizations, err := v1alpha1. - UnboxFeatureList[compute.HostUtilization](hostUtilizationKnowledge.Status.Raw) - if err != nil { - slog.Error("failed to unbox host utilization", "err", err) - return - } - - for _, utilization := range hostUtilizations { - detail, exists := detailsByComputeHost[utilization.ComputeHost] - if !exists { - slog.Warn("host_available_capacity: missing host details for compute host", "compute_host", utilization.ComputeHost) - continue - } - if detail.HypervisorType == "ironic" { - continue // Ironic hosts do not run VMs/instances - } - - if detail.HypervisorFamily != "vmware" { - continue - } - - if utilization.TotalRAMAllocatableMB == 0 || utilization.TotalVCPUsAllocatable == 0 || utilization.TotalDiskAllocatableGB == 0 { - slog.Info( - "Skipping host since placement is reporting zero allocatable resources", - "metric", "cortex_available_capacity_per_host", - "host", utilization.ComputeHost, - "cpu", utilization.TotalVCPUsAllocatable, - "ram", utilization.TotalRAMAllocatableMB, - "disk", utilization.TotalDiskAllocatableGB, - ) - continue - } - - availableCPUs := float64(utilization.TotalVCPUsAllocatable - utilization.VCPUsUsed) - availableRAMMB := float64(utilization.TotalRAMAllocatableMB - utilization.RAMUsedMB) - availableDiskGB := float64(utilization.TotalDiskAllocatableGB - utilization.DiskUsedGB) - - k.exportCapacityMetricVMware(ch, "cpu", availableCPUs, utilization.TotalVCPUsAllocatable, detail) - k.exportCapacityMetricVMware(ch, "ram", availableRAMMB, utilization.TotalRAMAllocatableMB, detail) - k.exportCapacityMetricVMware(ch, "disk", availableDiskGB, utilization.TotalDiskAllocatableGB, detail) - } -} - -func (k *VMwareResourceCapacityKPI) exportCapacityMetricVMware(ch chan<- prometheus.Metric, resource string, available, total float64, host compute.HostDetails) { - enabled := strconv.FormatBool(host.Enabled) - decommissioned := strconv.FormatBool(host.Decommissioned) - externalCustomer := strconv.FormatBool(host.ExternalCustomer) - pinnedProjectIds := "" - pinnedProjects := "false" - if host.PinnedProjects != nil { - pinnedProjectIds = *host.PinnedProjects - pinnedProjects = "true" - } - - disabledReason := "-" - if host.DisabledReason != nil { - disabledReason = *host.DisabledReason - } - - ch <- prometheus.MustNewConstMetric( - k.availableCapacityPerHost, - prometheus.GaugeValue, - available, - host.ComputeHost, - resource, - host.AvailabilityZone, - host.CPUArchitecture, - host.WorkloadType, - enabled, - decommissioned, - externalCustomer, - pinnedProjects, - disabledReason, - pinnedProjectIds, - ) - - ch <- prometheus.MustNewConstMetric( - k.totalCapacityPerHost, - prometheus.GaugeValue, - total, - host.ComputeHost, - resource, - host.AvailabilityZone, - host.CPUArchitecture, - host.WorkloadType, - enabled, - decommissioned, - externalCustomer, - pinnedProjects, - pinnedProjectIds, - ) -} diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go deleted file mode 100644 index 875be6357..000000000 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go +++ /dev/null @@ -1,503 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "reflect" - "regexp" - "testing" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "github.com/cobaltcore-dev/cortex/pkg/conf" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestVMwareResourceCapacityKPI_Init(t *testing.T) { - kpi := &VMwareResourceCapacityKPI{} - if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } -} - -var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`) - -func getMetricName(desc string) string { - match := fqNameRe.FindStringSubmatch(desc) - if len(match) > 1 { - return match[1] - } - return "" -} - -func TestVMwareResourceCapacityKPI_Collect_AbsoluteMetric(t *testing.T) { - scheme, err := v1alpha1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostDetails, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostDetails{ - ComputeHost: "vmware-host", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "vcenter", - HypervisorFamily: "vmware", - WorkloadType: "general-purpose", - Enabled: true, - Decommissioned: true, - ExternalCustomer: true, - DisabledReason: nil, - PinnedProjects: nil, - }, - // Skip this because it's not a VMware host - &compute.HostDetails{ - ComputeHost: "kvm-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "kvm", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because placement doesn't report any capacity for this host - &compute.HostDetails{ - ComputeHost: "vmware-host-2", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because it's a ironic host - &compute.HostDetails{ - ComputeHost: "ironic-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "ironic", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1"), - }, - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostUtilizations, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostUtilization{ - ComputeHost: "vmware-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 200, - TotalDiskAllocatableGB: 300, - VCPUsUsed: 40, - RAMUsedMB: 40, - DiskUsedGB: 40, - }, - &compute.HostUtilization{ - ComputeHost: "kvm-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 100, - TotalDiskAllocatableGB: 100, - VCPUsUsed: 75, - RAMUsedMB: 80, - DiskUsedGB: 85, - }, - &compute.HostUtilization{ - ComputeHost: "ironic-host", - TotalVCPUsAllocatable: 0, - TotalRAMAllocatableMB: 0, - TotalDiskAllocatableGB: 0, - VCPUsUsed: 0, - RAMUsedMB: 0, - DiskUsedGB: 0, - }, - // No Capacity reported for host kvm-host-2 - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &VMwareResourceCapacityKPI{} - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(&v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-details"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostDetails}, - }, &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-utilization"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations}, - }). - Build() - if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type HostResourceMetric struct { - ComputeHost string - Resource string - AvailabilityZone string - Enabled string - Decommissioned string - ExternalCustomer string - CPUArchitecture string - WorkloadType string - DisabledReason string - PinnedProjects string - PinnedProjectIds string - Value float64 - } - - actualMetrics := make(map[string]HostResourceMetric, 0) - - for metric := range ch { - desc := metric.Desc().String() - metricName := getMetricName(desc) - - // Only consider cortex_vmware_host_capacity_available metric in this test - if metricName != "cortex_vmware_host_capacity_available" { - continue - } - - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - - key := labels["compute_host"] + "-" + labels["resource"] - - actualMetrics[key] = HostResourceMetric{ - ComputeHost: labels["compute_host"], - Resource: labels["resource"], - AvailabilityZone: labels["availability_zone"], - Enabled: labels["enabled"], - Decommissioned: labels["decommissioned"], - ExternalCustomer: labels["external_customer"], - CPUArchitecture: labels["cpu_architecture"], - WorkloadType: labels["workload_type"], - DisabledReason: labels["disabled_reason"], - PinnedProjects: labels["pinned_projects"], - PinnedProjectIds: labels["pinned_project_ids"], - Value: m.GetGauge().GetValue(), - } - } - - expectedMetrics := map[string]HostResourceMetric{ - "vmware-host-cpu": { - ComputeHost: "vmware-host", - Resource: "cpu", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - DisabledReason: "-", - PinnedProjects: "false", - PinnedProjectIds: "", - Value: 60, // 100 - 40 - }, - "vmware-host-ram": { - ComputeHost: "vmware-host", - Resource: "ram", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - DisabledReason: "-", - PinnedProjects: "false", - PinnedProjectIds: "", - Value: 160, // 200 - 40 - }, - "vmware-host-disk": { - ComputeHost: "vmware-host", - Resource: "disk", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - DisabledReason: "-", - PinnedProjects: "false", - PinnedProjectIds: "", - Value: 260, // 300 - 40 - }, - } - - if len(expectedMetrics) != len(actualMetrics) { - t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics)) - } - - for key, expected := range expectedMetrics { - actual, ok := actualMetrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - - if !reflect.DeepEqual(expected, actual) { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} - -func TestVMwareResourceCapacityKPI_Collect_TotalMetric(t *testing.T) { - scheme, err := v1alpha1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostDetails, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostDetails{ - ComputeHost: "vmware-host", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "vcenter", - HypervisorFamily: "vmware", - WorkloadType: "general-purpose", - Enabled: true, - Decommissioned: true, - ExternalCustomer: true, - DisabledReason: nil, - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because it's not a VMware host - &compute.HostDetails{ - ComputeHost: "kvm-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "kvm", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because placement doesn't report any capacity for this host - &compute.HostDetails{ - ComputeHost: "vmware-host-2", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because it's a ironic host - &compute.HostDetails{ - ComputeHost: "ironic-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "ironic", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1"), - }, - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostUtilizations, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostUtilization{ - ComputeHost: "vmware-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 200, - TotalDiskAllocatableGB: 300, - VCPUsUsed: 40, - RAMUsedMB: 40, - DiskUsedGB: 40, - }, - &compute.HostUtilization{ - ComputeHost: "kvm-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 100, - TotalDiskAllocatableGB: 100, - VCPUsUsed: 75, - RAMUsedMB: 80, - DiskUsedGB: 85, - }, - &compute.HostUtilization{ - ComputeHost: "ironic-host", - TotalVCPUsAllocatable: 0, - TotalRAMAllocatableMB: 0, - TotalDiskAllocatableGB: 0, - VCPUsUsed: 0, - RAMUsedMB: 0, - DiskUsedGB: 0, - }, - // No Capacity reported for host kvm-host-2 - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &VMwareResourceCapacityKPI{} - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(&v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-details"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostDetails}, - }, &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-utilization"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations}, - }). - Build() - if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type HostResourceMetric struct { - ComputeHost string - Resource string - AvailabilityZone string - Enabled string - Decommissioned string - ExternalCustomer string - CPUArchitecture string - WorkloadType string - PinnedProjects string - PinnedProjectIds string - Value float64 - } - - actualMetrics := make(map[string]HostResourceMetric, 0) - - for metric := range ch { - desc := metric.Desc().String() - metricName := getMetricName(desc) - - // Only consider cortex_vmware_host_capacity_total metric in this test - if metricName != "cortex_vmware_host_capacity_total" { - continue - } - - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - - key := labels["compute_host"] + "-" + labels["resource"] - - actualMetrics[key] = HostResourceMetric{ - ComputeHost: labels["compute_host"], - Resource: labels["resource"], - AvailabilityZone: labels["availability_zone"], - Enabled: labels["enabled"], - Decommissioned: labels["decommissioned"], - ExternalCustomer: labels["external_customer"], - CPUArchitecture: labels["cpu_architecture"], - WorkloadType: labels["workload_type"], - PinnedProjects: labels["pinned_projects"], - PinnedProjectIds: labels["pinned_project_ids"], - Value: m.GetGauge().GetValue(), - } - } - - expectedMetrics := map[string]HostResourceMetric{ - "vmware-host-cpu": { - ComputeHost: "vmware-host", - Resource: "cpu", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - PinnedProjects: "true", - PinnedProjectIds: "project1,project2", - Value: 100, - }, - "vmware-host-ram": { - ComputeHost: "vmware-host", - Resource: "ram", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - PinnedProjects: "true", - PinnedProjectIds: "project1,project2", - Value: 200, - }, - "vmware-host-disk": { - ComputeHost: "vmware-host", - Resource: "disk", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - PinnedProjects: "true", - PinnedProjectIds: "project1,project2", - Value: 300, - }, - } - - if len(expectedMetrics) != len(actualMetrics) { - t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics)) - } - - for key, expected := range expectedMetrics { - actual, ok := actualMetrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - - if !reflect.DeepEqual(expected, actual) { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go index 4c011492c..77bb2546b 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared.go @@ -13,6 +13,7 @@ import ( const ( hostDetailsKnowledgeName = "host-details" + hostUtilizationKnowledgeName = "host-utilization" vmwareIronicHypervisorType = "ironic" hypervisorFamilyVMware = "vmware" vmwareComputeHostPattern = "nova-compute-%" diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go new file mode 100644 index 000000000..c7976db3a --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go @@ -0,0 +1,119 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "context" + "log/slog" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type VMwareHostCapacityKPI struct { + plugins.BaseKPI[struct{}] + + capacityUsagePerHost *prometheus.Desc + capacityTotalPerHost *prometheus.Desc +} + +func (k *VMwareHostCapacityKPI) GetName() string { + return "vmware_host_capacity_kpi" +} + +func (k *VMwareHostCapacityKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error { + if err := k.BaseKPI.Init(dbConn, c, opts); err != nil { + return err + } + k.capacityUsagePerHost = prometheus.NewDesc( + "cortex_vmware_host_capacity_usage", + "Capacity usage per VMware host. CPU in vCPUs, memory and disk in bytes.", + append(vmwareHostLabels, "resource"), nil, + ) + k.capacityTotalPerHost = prometheus.NewDesc( + "cortex_vmware_host_capacity_total", + "Total allocatable capacity per VMware host. CPU in vCPUs, memory and disk in bytes.", + append(vmwareHostLabels, "resource"), nil, + ) + return nil +} + +func (k *VMwareHostCapacityKPI) Describe(ch chan<- *prometheus.Desc) { + ch <- k.capacityUsagePerHost + ch <- k.capacityTotalPerHost +} + +func (k *VMwareHostCapacityKPI) Collect(ch chan<- prometheus.Metric) { + hosts, err := k.getVMwareHosts() + if err != nil { + slog.Error("vmware_host_capacity: failed to get vmware hosts", "error", err) + return + } + utilizations, err := k.getHostUtilizations() + if err != nil { + slog.Error("vmware_host_capacity: failed to get host utilizations", "error", err) + return + } + for _, host := range hosts { + util, ok := utilizations[host.ComputeHost] + if !ok { + slog.Warn("vmware_host_capacity: missing utilization for host", "compute_host", host.ComputeHost) + continue + } + + labels := host.getHostLabels() + + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.VCPUsUsed, append(labels, "cpu")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.RAMUsedMB*1024*1024, append(labels, "ram")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.DiskUsedGB*1024*1024*1024, append(labels, "disk")...) + + ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalVCPUsAllocatable, append(labels, "cpu")...) + ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalRAMAllocatableMB*1024*1024, append(labels, "ram")...) + ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalDiskAllocatableGB*1024*1024*1024, append(labels, "disk")...) + } +} + +func (k *VMwareHostCapacityKPI) getVMwareHosts() ([]vmwareHost, error) { + knowledge := &v1alpha1.Knowledge{} + if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostDetailsKnowledgeName}, knowledge); err != nil { + return nil, err + } + details, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw) + if err != nil { + return nil, err + } + hosts := make([]vmwareHost, 0, len(details)) + for _, d := range details { + if d.HypervisorType == vmwareIronicHypervisorType || d.HypervisorFamily != hypervisorFamilyVMware { + continue + } + hosts = append(hosts, vmwareHost{HostDetails: d}) + } + return hosts, nil +} + +func (k *VMwareHostCapacityKPI) getHostUtilizations() (map[string]compute.HostUtilization, error) { + knowledge := &v1alpha1.Knowledge{} + if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostUtilizationKnowledgeName}, knowledge); err != nil { + return nil, err + } + utils, err := v1alpha1.UnboxFeatureList[compute.HostUtilization](knowledge.Status.Raw) + if err != nil { + return nil, err + } + m := make(map[string]compute.HostUtilization, len(utils)) + for _, u := range utils { + if u.TotalVCPUsAllocatable == 0 || u.TotalRAMAllocatableMB == 0 || u.TotalDiskAllocatableGB == 0 { + slog.Warn("vmware_host_capacity: skipping host with zero allocatable resources", "compute_host", u.ComputeHost) + continue + } + m[u.ComputeHost] = u + } + return m, nil +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go new file mode 100644 index 000000000..4a55efdb5 --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go @@ -0,0 +1,335 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func buildHostCapacityClient(t *testing.T, hostDetails []compute.HostDetails, utilizations []compute.HostUtilization) *fake.ClientBuilder { + t.Helper() + scheme, err := v1alpha1.SchemeBuilder.Build() + if err != nil { + t.Fatalf("failed to build scheme: %v", err) + } + rawDetails, err := v1alpha1.BoxFeatureList(hostDetails) + if err != nil { + t.Fatalf("failed to box host details: %v", err) + } + rawUtils, err := v1alpha1.BoxFeatureList(utilizations) + if err != nil { + t.Fatalf("failed to box host utilizations: %v", err) + } + return fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects( + &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: hostDetailsKnowledgeName}, + Status: v1alpha1.KnowledgeStatus{Raw: rawDetails}, + }, + &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: hostUtilizationKnowledgeName}, + Status: v1alpha1.KnowledgeStatus{Raw: rawUtils}, + }, + ) +} + +func TestVMwareHostCapacityKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + kpi := &VMwareHostCapacityKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} + +func TestVMwareHostCapacityKPI_getVMwareHosts(t *testing.T) { + hostDetails := []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware}, + {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware}, + {ComputeHost: "nova-compute-ironic-1", HypervisorType: vmwareIronicHypervisorType, HypervisorFamily: hypervisorFamilyVMware}, + {ComputeHost: "nova-compute-3", HypervisorFamily: "other"}, + } + + client := buildHostCapacityClient(t, hostDetails, nil) + kpi := &VMwareHostCapacityKPI{} + kpi.Client = client.Build() + + hosts, err := kpi.getVMwareHosts() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if len(hosts) != 2 { + t.Fatalf("expected 2 hosts, got %d", len(hosts)) + } + seen := make(map[string]bool) + for _, h := range hosts { + seen[h.ComputeHost] = true + } + for _, name := range []string{"nova-compute-1", "nova-compute-2"} { + if !seen[name] { + t.Errorf("expected host %q in result", name) + } + } +} + +func TestVMwareHostCapacityKPI_getHostUtilizations(t *testing.T) { + tests := []struct { + name string + utilizations []compute.HostUtilization + expectedHosts []string + }{ + { + name: "normal utilizations are returned", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + {ComputeHost: "h2", TotalVCPUsAllocatable: 20, TotalRAMAllocatableMB: 2048, TotalDiskAllocatableGB: 200}, + }, + expectedHosts: []string{"h1", "h2"}, + }, + { + name: "zero TotalVCPUsAllocatable is skipped", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 0, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + }, + expectedHosts: []string{}, + }, + { + name: "zero TotalRAMAllocatableMB is skipped", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 0, TotalDiskAllocatableGB: 100}, + }, + expectedHosts: []string{}, + }, + { + name: "zero TotalDiskAllocatableGB is skipped", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 0}, + }, + expectedHosts: []string{}, + }, + { + name: "mix of valid and zero-allocatable entries", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + {ComputeHost: "h2", TotalVCPUsAllocatable: 0, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + }, + expectedHosts: []string{"h1"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client := buildHostCapacityClient(t, nil, tt.utilizations) + kpi := &VMwareHostCapacityKPI{} + kpi.Client = client.Build() + + m, err := kpi.getHostUtilizations() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(m) != len(tt.expectedHosts) { + t.Fatalf("expected %d entries, got %d: %v", len(tt.expectedHosts), len(m), m) + } + for _, host := range tt.expectedHosts { + if _, ok := m[host]; !ok { + t.Errorf("expected host %q in result", host) + } + } + }) + } +} + +func TestVMwareHostCapacityKPI_Collect(t *testing.T) { + tests := []struct { + name string + hostDetails []compute.HostDetails + utilizations []compute.HostUtilization + expectedMetrics []collectedVMwareMetric + }{ + { + name: "single host emits usage and total metrics", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + { + ComputeHost: "nova-compute-1", + VCPUsUsed: 4, + TotalVCPUsAllocatable: 16, + RAMUsedMB: 2048, + TotalRAMAllocatableMB: 8192, + DiskUsedGB: 50, + TotalDiskAllocatableGB: 500, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 4}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 50 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 16}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 8192 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 500 * 1024 * 1024 * 1024}, + }, + }, + { + name: "multiple hosts each emit their own metrics", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + {ComputeHost: "nova-compute-2", VCPUsUsed: 6, TotalVCPUsAllocatable: 12, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200}, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "cpu"), Value: 6}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "ram"), Value: 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "disk"), Value: 20 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "cpu"), Value: 12}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "ram"), Value: 4096 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "disk"), Value: 200 * 1024 * 1024 * 1024}, + }, + }, + { + name: "ironic hosts are excluded", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + {ComputeHost: "nova-compute-ironic-1", HypervisorType: vmwareIronicHypervisorType, HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + {ComputeHost: "nova-compute-ironic-1", VCPUsUsed: 4, TotalVCPUsAllocatable: 16, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200}, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024}, + }, + }, + { + name: "non-vmware hosts are excluded", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + {ComputeHost: "nova-compute-2", HypervisorFamily: "kvm", AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + {ComputeHost: "nova-compute-2", VCPUsUsed: 4, TotalVCPUsAllocatable: 16, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200}, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024}, + }, + }, + { + name: "host without matching utilization produces no metrics", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{}, + expectedMetrics: []collectedVMwareMetric{}, + }, + { + name: "utilization with zero allocatable resources is skipped", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 0, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + }, + expectedMetrics: []collectedVMwareMetric{}, + }, + { + name: "no hosts produces no metrics", + hostDetails: []compute.HostDetails{}, + utilizations: []compute.HostUtilization{}, + expectedMetrics: []collectedVMwareMetric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + client := buildHostCapacityClient(t, tt.hostDetails, tt.utilizations) + kpi := &VMwareHostCapacityKPI{} + if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + + ch := make(chan prometheus.Metric, 200) + kpi.Collect(ch) + close(ch) + + actual := make(map[string]collectedVMwareMetric) + for m := range ch { + var pm prometheusgo.Metric + if err := m.Write(&pm); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + labels := make(map[string]string) + for _, lbl := range pm.Label { + labels[lbl.GetName()] = lbl.GetValue() + } + name := getMetricName(m.Desc().String()) + key := name + "|" + labels["compute_host"] + "|" + labels["resource"] + if _, exists := actual[key]; exists { + t.Fatalf("duplicate metric key %q", key) + } + actual[key] = collectedVMwareMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()} + } + + if len(actual) != len(tt.expectedMetrics) { + t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual) + } + for _, exp := range tt.expectedMetrics { + key := exp.Name + "|" + exp.Labels["compute_host"] + "|" + exp.Labels["resource"] + got, ok := actual[key] + if !ok { + t.Errorf("missing metric %q", key) + continue + } + if got.Value != exp.Value { + t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value) + } + if !reflect.DeepEqual(exp.Labels, got.Labels) { + t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels) + } + } + }) + } +} + +func hostCapacityLabels(computeHost, az, resource string) map[string]string { + labels := hostLabels(computeHost, az) + labels["resource"] = resource + return labels +} diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index 19726a488..63a35866b 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -16,7 +16,6 @@ var supportedKPIs = map[string]plugins.KPI{ "kvm_host_capacity_kpi": &compute.KVMResourceCapacityKPI{}, "vmware_host_contention_kpi": &compute.VMwareHostContentionKPI{}, "vmware_project_noisiness_kpi": &compute.VMwareProjectNoisinessKPI{}, - "vmware_host_capacity_kpi": &compute.VMwareResourceCapacityKPI{}, "host_running_vms_kpi": &compute.HostRunningVMsKPI{}, "flavor_running_vms_kpi": &compute.FlavorRunningVMsKPI{}, "vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{}, @@ -26,6 +25,7 @@ var supportedKPIs = map[string]plugins.KPI{ "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, "vmware_resource_commitments_kpi": &infrastructure.VMwareResourceCommitmentsKPI{}, + "vmware_host_capacity_kpi": &infrastructure.VMwareHostCapacityKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{}, From f587fe2a11bcd24af09335b1337540db9c33f59b Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Wed, 29 Apr 2026 13:33:32 +0200 Subject: [PATCH 2/5] refactor: remove unused shared.go file for vmware host capacity --- .../kpis/plugins/compute/refactor/shared.go | 150 ------------------ 1 file changed, 150 deletions(-) delete mode 100644 internal/knowledge/kpis/plugins/compute/refactor/shared.go diff --git a/internal/knowledge/kpis/plugins/compute/refactor/shared.go b/internal/knowledge/kpis/plugins/compute/refactor/shared.go deleted file mode 100644 index d50698b1f..000000000 --- a/internal/knowledge/kpis/plugins/compute/refactor/shared.go +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package refactor - -import ( - "fmt" - "regexp" - "strconv" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "github.com/prometheus/client_golang/prometheus" -) - -const ( - hypervisorTypeIronic = "ironic" - hypervisorFamilyVMware = "vmware" - hypervisorFamilyKVM = "kvm" - - cpuArchCascadeLake = "cascade-lake" - cpuArchSapphireRapids = "sapphire-rapids" - - workloadTypeHANA = "hana" - workloadTypeGeneralPurpose = "general-purpose" - - hostDetailsKnowledge = "host-details" - hostUtilizationKnowledge = "host-utilization" - - commitmentStatusConfirmed = "confirmed" - commitmentStatusGuaranteed = "guaranteed" - - limesServiceCompute = "compute" - limesResourceCores = "cores" - limesResourceRAM = "ram" - - unitVCPU = "vCPU" - unitBytes = "B" - - // Flavor suffix indicating sapphire-rapids CPU architecture. - sapphireRapidsFlavorSuffix = "_v2" -) - -// kvmFlavorRegex matches KVM flavors where the second underscore-delimited segment is "k", e.g. "m1_k_small". -var kvmFlavorRegex = regexp.MustCompile(`^[^_]+_k_`) - -var vmwareHostLabels = []string{ - "availability_zone", - "compute_host", - "cpu_architecture", - "workload_type", - "hypervisor_family", - "enabled", - "decommissioned", - "external_customer", - "disabled_reason", - "pinned_projects", - "pinned_project_ids", -} - -var vmwareHostCapacityLabels = append(append([]string{}, vmwareHostLabels...), "resource", "unit") - -var vmwareProjectLabels = append(append([]string{}, vmwareHostLabels...), "project_id", "project_name", "flavor_name") - -var vmwareProjectCapacityLabels = append(append([]string{}, vmwareProjectLabels...), "resource", "unit") - -var kvmHostLabels = []string{ - "compute_host", - "availability_zone", - "building_block", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "maintenance", -} - -var kvmHostCapacityLabels = append(append([]string{}, kvmHostLabels...), "resource", "unit") - -var kvmProjectLabels = append(append([]string{}, kvmHostLabels...), "project_id", "project_name", "flavor_name") - -var kvmProjectCapacityLabels = append(append([]string{}, kvmProjectLabels...), "resource", "unit") - -// vmwareHost wraps HostDetails with Prometheus metric helpers. -type vmwareHost struct { - compute.HostDetails -} - -func (h vmwareHost) getHostLabels() []string { - pinnedProjectIds := "" - pinnedProjects := false - if h.PinnedProjects != nil { - pinnedProjectIds = *h.PinnedProjects - pinnedProjects = true - } - disabledReason := "-" - if h.DisabledReason != nil { - disabledReason = *h.DisabledReason - } - return []string{ - h.AvailabilityZone, - h.ComputeHost, - h.CPUArchitecture, - h.WorkloadType, - h.HypervisorFamily, - strconv.FormatBool(h.Enabled), - strconv.FormatBool(h.Decommissioned), - strconv.FormatBool(h.ExternalCustomer), - disabledReason, - strconv.FormatBool(pinnedProjects), - pinnedProjectIds, - } -} - -func (h vmwareHost) toCapacityMetric(desc *prometheus.Desc, resource, unit string, value float64) prometheus.Metric { - labels := append(h.getHostLabels(), resource, unit) - return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labels...) -} - -func (h vmwareHost) toInstanceCountMetric(desc *prometheus.Desc, value float64) prometheus.Metric { - return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, h.getHostLabels()...) -} - -// convertLimesMemoryToBytes converts a limes memory amount and its unit string to bytes. -func convertLimesMemoryToBytes(amount uint64, unit string) (float64, error) { - switch unit { - case "B", "": - return float64(amount), nil - case "KiB": - return float64(amount) * 1024, nil - case "MiB": - return float64(amount) * 1024 * 1024, nil - case "GiB": - return float64(amount) * 1024 * 1024 * 1024, nil - case "TiB": - return float64(amount) * 1024 * 1024 * 1024 * 1024, nil - default: - return 0, fmt.Errorf("unknown limes memory unit: %s", unit) - } -} - -// cpuArchForFlavor derives the CPU architecture from a flavor name. -// Flavors with a "_v2" suffix run on sapphire-rapids; all others on cascade-lake. -func cpuArchForFlavor(flavorName string) string { - if len(flavorName) >= len(sapphireRapidsFlavorSuffix) && - flavorName[len(flavorName)-len(sapphireRapidsFlavorSuffix):] == sapphireRapidsFlavorSuffix { - return cpuArchSapphireRapids - } - return cpuArchCascadeLake -} From 7aeaf55c8a1dfda1fa6b05339ef456638656afcd Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Wed, 29 Apr 2026 13:39:46 +0200 Subject: [PATCH 3/5] refactor: remove hypervisor family from vmware host labels --- internal/knowledge/kpis/plugins/infrastructure/shared.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go index 77bb2546b..62eb44e9c 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared.go @@ -41,7 +41,6 @@ func (h vmwareHost) getHostLabels() []string { h.ComputeHost, h.CPUArchitecture, h.WorkloadType, - h.HypervisorFamily, strconv.FormatBool(h.Enabled), strconv.FormatBool(h.Decommissioned), strconv.FormatBool(h.ExternalCustomer), @@ -56,7 +55,6 @@ var vmwareHostLabels = []string{ "compute_host", "cpu_architecture", "workload_type", - "hypervisor_family", "enabled", "decommissioned", "external_customer", From 2fc583e04107bb16e27710c8b4ef6e2b812ad92f Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Wed, 29 Apr 2026 14:06:10 +0200 Subject: [PATCH 4/5] fix: failing tests Co-authored-by: Copilot --- .../plugins/infrastructure/shared_test.go | 15 ++++++++++++++ .../vmware_host_capacity_test.go | 2 +- .../vmware_project_utilization_test.go | 20 ++----------------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go index dc720d159..e8d995ab0 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go @@ -5,6 +5,21 @@ package infrastructure import "testing" +func mockVMwareHostLabels(computeHost, az string) map[string]string { + return map[string]string{ + "availability_zone": az, + "compute_host": computeHost, + "cpu_architecture": "", + "workload_type": "", + "enabled": "false", + "decommissioned": "false", + "external_customer": "false", + "disabled_reason": "-", + "pinned_projects": "false", + "pinned_project_ids": "", + } +} + func TestIsKVMFlavor(t *testing.T) { tests := []struct { flavor string diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go index 4a55efdb5..f0a025db4 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go @@ -329,7 +329,7 @@ func TestVMwareHostCapacityKPI_Collect(t *testing.T) { } func hostCapacityLabels(computeHost, az, resource string) map[string]string { - labels := hostLabels(computeHost, az) + labels := mockVMwareHostLabels(computeHost, az) labels["resource"] = resource return labels } diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go index 9f6d84786..4c43c893b 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go @@ -33,24 +33,8 @@ func buildMetricKey(name string, labels map[string]string) string { } } -func hostLabels(computeHost, az string) map[string]string { - return map[string]string{ - "availability_zone": az, - "compute_host": computeHost, - "cpu_architecture": "", - "workload_type": "", - "hypervisor_family": "vmware", - "enabled": "false", - "decommissioned": "false", - "external_customer": "false", - "disabled_reason": "-", - "pinned_projects": "false", - "pinned_project_ids": "", - } -} - func instanceMetric(computeHost, az, projectID, projectName, flavorName string, value float64) collectedVMwareMetric { - labels := hostLabels(computeHost, az) + labels := mockVMwareHostLabels(computeHost, az) labels["project_id"] = projectID labels["project_name"] = projectName labels["flavor_name"] = flavorName @@ -58,7 +42,7 @@ func instanceMetric(computeHost, az, projectID, projectName, flavorName string, } func capacityMetric(computeHost, az, projectID, projectName, resource string, value float64) collectedVMwareMetric { - labels := hostLabels(computeHost, az) + labels := mockVMwareHostLabels(computeHost, az) labels["project_id"] = projectID labels["project_name"] = projectName labels["resource"] = resource From e62606f02eaf6017cce98bcf8a15c4484b33fe1d Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Wed, 29 Apr 2026 14:31:44 +0200 Subject: [PATCH 5/5] test: add unit tests for vmware host label generation --- .../plugins/infrastructure/shared_test.go | 74 ++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go index e8d995ab0..351fedc50 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go @@ -3,7 +3,11 @@ package infrastructure -import "testing" +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" +) func mockVMwareHostLabels(computeHost, az string) map[string]string { return map[string]string{ @@ -20,6 +24,74 @@ func mockVMwareHostLabels(computeHost, az string) map[string]string { } } +func TestVMwareHostGetHostLabels(t *testing.T) { + str := func(s string) *string { return &s } + + tests := []struct { + name string + host vmwareHost + want []string + }{ + { + name: "all optional fields nil", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az1", + ComputeHost: "nova-compute-1", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: true, + Decommissioned: false, + ExternalCustomer: false, + DisabledReason: nil, + PinnedProjects: nil, + }}, + want: []string{"az1", "nova-compute-1", "cascade-lake", "general-purpose", "true", "false", "false", "-", "false", ""}, + }, + { + name: "disabled reason set", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az2", + ComputeHost: "nova-compute-2", + DisabledReason: str("scheduled-maintenance"), + }}, + want: []string{"az2", "nova-compute-2", "", "", "false", "false", "false", "scheduled-maintenance", "false", ""}, + }, + { + name: "pinned projects set", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az1", + ComputeHost: "nova-compute-3", + PinnedProjects: str("proj-a,proj-b"), + }}, + want: []string{"az1", "nova-compute-3", "", "", "false", "false", "false", "-", "true", "proj-a,proj-b"}, + }, + { + name: "decommissioned and external customer", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az3", + ComputeHost: "nova-compute-4", + Decommissioned: true, + ExternalCustomer: true, + }}, + want: []string{"az3", "nova-compute-4", "", "", "false", "true", "true", "-", "false", ""}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.host.getHostLabels() + if len(got) != len(vmwareHostLabels) { + t.Fatalf("getHostLabels() returned %d values, want %d (matching vmwareHostLabels)", len(got), len(vmwareHostLabels)) + } + for i, want := range tt.want { + if got[i] != want { + t.Errorf("label[%d] (%s) = %q, want %q", i, vmwareHostLabels[i], got[i], want) + } + } + }) + } +} + func TestIsKVMFlavor(t *testing.T) { tests := []struct { flavor string