diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index 22774c62a..6979b0e29 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -29,20 +29,6 @@ spec: --- apiVersion: cortex.cloud/v1alpha1 kind: KPI -metadata: - name: vmware-host-capacity -spec: - schedulingDomain: nova - impl: vmware_host_capacity_kpi - dependencies: - knowledges: - - name: host-details - - name: host-utilization - description: | - This KPI tracks the total, utilized, reserved and failover capacity of VMware hosts. ---- -apiVersion: cortex.cloud/v1alpha1 -kind: KPI metadata: name: host-running-vms spec: @@ -215,4 +201,18 @@ spec: - name: nova-flavors - name: limes-project-commitments description: | - This KPI tracks the resource commitments of projects running VMs on VMware hosts. \ No newline at end of file + This KPI tracks the resource commitments of projects running VMs on VMware hosts. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: KPI +metadata: + name: vmware-host-capacity +spec: + schedulingDomain: nova + impl: vmware_host_capacity_kpi + dependencies: + knowledges: + - name: host-details + - name: host-utilization + description: | + This KPI tracks the capacity and utilization of VMware hosts in terms of CPU, RAM, and disk resources. \ No newline at end of file diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index 6e9d38c7b..c233cfd4c 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -4,6 +4,7 @@ package compute import ( + "regexp" "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -38,6 +39,16 @@ type kvmMetricLabels struct { Maintenance string } +var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`) + +func getMetricName(desc string) string { + match := fqNameRe.FindStringSubmatch(desc) + if len(match) > 1 { + return match[1] + } + return "" +} + type kvmExpectedMetric struct { Name string // metric family name (e.g. "cortex_kvm_host_capacity_total") Labels kvmMetricLabels diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go deleted file mode 100644 index 8bd2d4177..000000000 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware.go +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "context" - "log/slog" - "strconv" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" -) - -type VMwareResourceCapacityKPI struct { - // Common base for all KPIs that provides standard functionality. - plugins.BaseKPI[struct{}] // No options passed through yaml config - - availableCapacityPerHost *prometheus.Desc - totalCapacityPerHost *prometheus.Desc -} - -func (VMwareResourceCapacityKPI) GetName() string { - return "vmware_host_capacity_kpi" -} - -func (k *VMwareResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { - if err := k.BaseKPI.Init(db, client, opts); err != nil { - return err - } - k.availableCapacityPerHost = prometheus.NewDesc( - "cortex_vmware_host_capacity_available", - "Available capacity per resource on the hosts currently (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "pinned_projects", - "disabled_reason", - "pinned_project_ids", - }, - nil, - ) - k.totalCapacityPerHost = prometheus.NewDesc( - "cortex_vmware_host_capacity_total", - "Total resources available on the hosts currently (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "pinned_projects", - "pinned_project_ids", - }, - nil, - ) - return nil -} - -func (k *VMwareResourceCapacityKPI) Describe(ch chan<- *prometheus.Desc) { - ch <- k.availableCapacityPerHost - ch <- k.totalCapacityPerHost -} - -func (k *VMwareResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { - hostDetailsKnowledge := &v1alpha1.Knowledge{} - if err := k.Client.Get( - context.Background(), - client.ObjectKey{Name: "host-details"}, - hostDetailsKnowledge, - ); err != nil { - slog.Error("failed to get knowledge host-details", "err", err) - return - } - hostDetails, err := v1alpha1. - UnboxFeatureList[compute.HostDetails](hostDetailsKnowledge.Status.Raw) - if err != nil { - slog.Error("failed to unbox storage pool cpu usage", "err", err) - return - } - detailsByComputeHost := make(map[string]compute.HostDetails) - for _, detail := range hostDetails { - detailsByComputeHost[detail.ComputeHost] = detail - } - - hostUtilizationKnowledge := &v1alpha1.Knowledge{} - if err := k.Client.Get( - context.Background(), - client.ObjectKey{Name: "host-utilization"}, - hostUtilizationKnowledge, - ); err != nil { - slog.Error("failed to get knowledge host-utilization", "err", err) - return - } - hostUtilizations, err := v1alpha1. - UnboxFeatureList[compute.HostUtilization](hostUtilizationKnowledge.Status.Raw) - if err != nil { - slog.Error("failed to unbox host utilization", "err", err) - return - } - - for _, utilization := range hostUtilizations { - detail, exists := detailsByComputeHost[utilization.ComputeHost] - if !exists { - slog.Warn("host_available_capacity: missing host details for compute host", "compute_host", utilization.ComputeHost) - continue - } - if detail.HypervisorType == "ironic" { - continue // Ironic hosts do not run VMs/instances - } - - if detail.HypervisorFamily != "vmware" { - continue - } - - if utilization.TotalRAMAllocatableMB == 0 || utilization.TotalVCPUsAllocatable == 0 || utilization.TotalDiskAllocatableGB == 0 { - slog.Info( - "Skipping host since placement is reporting zero allocatable resources", - "metric", "cortex_available_capacity_per_host", - "host", utilization.ComputeHost, - "cpu", utilization.TotalVCPUsAllocatable, - "ram", utilization.TotalRAMAllocatableMB, - "disk", utilization.TotalDiskAllocatableGB, - ) - continue - } - - availableCPUs := float64(utilization.TotalVCPUsAllocatable - utilization.VCPUsUsed) - availableRAMMB := float64(utilization.TotalRAMAllocatableMB - utilization.RAMUsedMB) - availableDiskGB := float64(utilization.TotalDiskAllocatableGB - utilization.DiskUsedGB) - - k.exportCapacityMetricVMware(ch, "cpu", availableCPUs, utilization.TotalVCPUsAllocatable, detail) - k.exportCapacityMetricVMware(ch, "ram", availableRAMMB, utilization.TotalRAMAllocatableMB, detail) - k.exportCapacityMetricVMware(ch, "disk", availableDiskGB, utilization.TotalDiskAllocatableGB, detail) - } -} - -func (k *VMwareResourceCapacityKPI) exportCapacityMetricVMware(ch chan<- prometheus.Metric, resource string, available, total float64, host compute.HostDetails) { - enabled := strconv.FormatBool(host.Enabled) - decommissioned := strconv.FormatBool(host.Decommissioned) - externalCustomer := strconv.FormatBool(host.ExternalCustomer) - pinnedProjectIds := "" - pinnedProjects := "false" - if host.PinnedProjects != nil { - pinnedProjectIds = *host.PinnedProjects - pinnedProjects = "true" - } - - disabledReason := "-" - if host.DisabledReason != nil { - disabledReason = *host.DisabledReason - } - - ch <- prometheus.MustNewConstMetric( - k.availableCapacityPerHost, - prometheus.GaugeValue, - available, - host.ComputeHost, - resource, - host.AvailabilityZone, - host.CPUArchitecture, - host.WorkloadType, - enabled, - decommissioned, - externalCustomer, - pinnedProjects, - disabledReason, - pinnedProjectIds, - ) - - ch <- prometheus.MustNewConstMetric( - k.totalCapacityPerHost, - prometheus.GaugeValue, - total, - host.ComputeHost, - resource, - host.AvailabilityZone, - host.CPUArchitecture, - host.WorkloadType, - enabled, - decommissioned, - externalCustomer, - pinnedProjects, - pinnedProjectIds, - ) -} diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go deleted file mode 100644 index 875be6357..000000000 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_vmware_test.go +++ /dev/null @@ -1,503 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "reflect" - "regexp" - "testing" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - "github.com/cobaltcore-dev/cortex/pkg/conf" - testlib "github.com/cobaltcore-dev/cortex/pkg/testing" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestVMwareResourceCapacityKPI_Init(t *testing.T) { - kpi := &VMwareResourceCapacityKPI{} - if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } -} - -var fqNameRe = regexp.MustCompile(`fqName: "([^"]+)"`) - -func getMetricName(desc string) string { - match := fqNameRe.FindStringSubmatch(desc) - if len(match) > 1 { - return match[1] - } - return "" -} - -func TestVMwareResourceCapacityKPI_Collect_AbsoluteMetric(t *testing.T) { - scheme, err := v1alpha1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostDetails, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostDetails{ - ComputeHost: "vmware-host", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "vcenter", - HypervisorFamily: "vmware", - WorkloadType: "general-purpose", - Enabled: true, - Decommissioned: true, - ExternalCustomer: true, - DisabledReason: nil, - PinnedProjects: nil, - }, - // Skip this because it's not a VMware host - &compute.HostDetails{ - ComputeHost: "kvm-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "kvm", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because placement doesn't report any capacity for this host - &compute.HostDetails{ - ComputeHost: "vmware-host-2", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because it's a ironic host - &compute.HostDetails{ - ComputeHost: "ironic-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "ironic", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1"), - }, - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostUtilizations, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostUtilization{ - ComputeHost: "vmware-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 200, - TotalDiskAllocatableGB: 300, - VCPUsUsed: 40, - RAMUsedMB: 40, - DiskUsedGB: 40, - }, - &compute.HostUtilization{ - ComputeHost: "kvm-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 100, - TotalDiskAllocatableGB: 100, - VCPUsUsed: 75, - RAMUsedMB: 80, - DiskUsedGB: 85, - }, - &compute.HostUtilization{ - ComputeHost: "ironic-host", - TotalVCPUsAllocatable: 0, - TotalRAMAllocatableMB: 0, - TotalDiskAllocatableGB: 0, - VCPUsUsed: 0, - RAMUsedMB: 0, - DiskUsedGB: 0, - }, - // No Capacity reported for host kvm-host-2 - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &VMwareResourceCapacityKPI{} - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(&v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-details"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostDetails}, - }, &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-utilization"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations}, - }). - Build() - if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type HostResourceMetric struct { - ComputeHost string - Resource string - AvailabilityZone string - Enabled string - Decommissioned string - ExternalCustomer string - CPUArchitecture string - WorkloadType string - DisabledReason string - PinnedProjects string - PinnedProjectIds string - Value float64 - } - - actualMetrics := make(map[string]HostResourceMetric, 0) - - for metric := range ch { - desc := metric.Desc().String() - metricName := getMetricName(desc) - - // Only consider cortex_vmware_host_capacity_available metric in this test - if metricName != "cortex_vmware_host_capacity_available" { - continue - } - - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - - key := labels["compute_host"] + "-" + labels["resource"] - - actualMetrics[key] = HostResourceMetric{ - ComputeHost: labels["compute_host"], - Resource: labels["resource"], - AvailabilityZone: labels["availability_zone"], - Enabled: labels["enabled"], - Decommissioned: labels["decommissioned"], - ExternalCustomer: labels["external_customer"], - CPUArchitecture: labels["cpu_architecture"], - WorkloadType: labels["workload_type"], - DisabledReason: labels["disabled_reason"], - PinnedProjects: labels["pinned_projects"], - PinnedProjectIds: labels["pinned_project_ids"], - Value: m.GetGauge().GetValue(), - } - } - - expectedMetrics := map[string]HostResourceMetric{ - "vmware-host-cpu": { - ComputeHost: "vmware-host", - Resource: "cpu", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - DisabledReason: "-", - PinnedProjects: "false", - PinnedProjectIds: "", - Value: 60, // 100 - 40 - }, - "vmware-host-ram": { - ComputeHost: "vmware-host", - Resource: "ram", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - DisabledReason: "-", - PinnedProjects: "false", - PinnedProjectIds: "", - Value: 160, // 200 - 40 - }, - "vmware-host-disk": { - ComputeHost: "vmware-host", - Resource: "disk", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - DisabledReason: "-", - PinnedProjects: "false", - PinnedProjectIds: "", - Value: 260, // 300 - 40 - }, - } - - if len(expectedMetrics) != len(actualMetrics) { - t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics)) - } - - for key, expected := range expectedMetrics { - actual, ok := actualMetrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - - if !reflect.DeepEqual(expected, actual) { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} - -func TestVMwareResourceCapacityKPI_Collect_TotalMetric(t *testing.T) { - scheme, err := v1alpha1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostDetails, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostDetails{ - ComputeHost: "vmware-host", - AvailabilityZone: "az1", - CPUArchitecture: "cascade-lake", - HypervisorType: "vcenter", - HypervisorFamily: "vmware", - WorkloadType: "general-purpose", - Enabled: true, - Decommissioned: true, - ExternalCustomer: true, - DisabledReason: nil, - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because it's not a VMware host - &compute.HostDetails{ - ComputeHost: "kvm-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "kvm", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because placement doesn't report any capacity for this host - &compute.HostDetails{ - ComputeHost: "vmware-host-2", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "qemu", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1,project2"), - }, - // Skip this because it's a ironic host - &compute.HostDetails{ - ComputeHost: "ironic-host", - AvailabilityZone: "az2", - CPUArchitecture: "cascade-lake", - HypervisorType: "ironic", - HypervisorFamily: "vmware", - WorkloadType: "hana", - Enabled: false, - Decommissioned: false, - ExternalCustomer: false, - DisabledReason: testlib.Ptr("test"), - PinnedProjects: testlib.Ptr("project1"), - }, - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - hostUtilizations, err := v1alpha1.BoxFeatureList([]any{ - &compute.HostUtilization{ - ComputeHost: "vmware-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 200, - TotalDiskAllocatableGB: 300, - VCPUsUsed: 40, - RAMUsedMB: 40, - DiskUsedGB: 40, - }, - &compute.HostUtilization{ - ComputeHost: "kvm-host", - TotalVCPUsAllocatable: 100, - TotalRAMAllocatableMB: 100, - TotalDiskAllocatableGB: 100, - VCPUsUsed: 75, - RAMUsedMB: 80, - DiskUsedGB: 85, - }, - &compute.HostUtilization{ - ComputeHost: "ironic-host", - TotalVCPUsAllocatable: 0, - TotalRAMAllocatableMB: 0, - TotalDiskAllocatableGB: 0, - VCPUsUsed: 0, - RAMUsedMB: 0, - DiskUsedGB: 0, - }, - // No Capacity reported for host kvm-host-2 - }) - if err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &VMwareResourceCapacityKPI{} - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(&v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-details"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostDetails}, - }, &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{Name: "host-utilization"}, - Status: v1alpha1.KnowledgeStatus{Raw: hostUtilizations}, - }). - Build() - if err := kpi.Init(nil, client, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type HostResourceMetric struct { - ComputeHost string - Resource string - AvailabilityZone string - Enabled string - Decommissioned string - ExternalCustomer string - CPUArchitecture string - WorkloadType string - PinnedProjects string - PinnedProjectIds string - Value float64 - } - - actualMetrics := make(map[string]HostResourceMetric, 0) - - for metric := range ch { - desc := metric.Desc().String() - metricName := getMetricName(desc) - - // Only consider cortex_vmware_host_capacity_total metric in this test - if metricName != "cortex_vmware_host_capacity_total" { - continue - } - - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - - key := labels["compute_host"] + "-" + labels["resource"] - - actualMetrics[key] = HostResourceMetric{ - ComputeHost: labels["compute_host"], - Resource: labels["resource"], - AvailabilityZone: labels["availability_zone"], - Enabled: labels["enabled"], - Decommissioned: labels["decommissioned"], - ExternalCustomer: labels["external_customer"], - CPUArchitecture: labels["cpu_architecture"], - WorkloadType: labels["workload_type"], - PinnedProjects: labels["pinned_projects"], - PinnedProjectIds: labels["pinned_project_ids"], - Value: m.GetGauge().GetValue(), - } - } - - expectedMetrics := map[string]HostResourceMetric{ - "vmware-host-cpu": { - ComputeHost: "vmware-host", - Resource: "cpu", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - PinnedProjects: "true", - PinnedProjectIds: "project1,project2", - Value: 100, - }, - "vmware-host-ram": { - ComputeHost: "vmware-host", - Resource: "ram", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - PinnedProjects: "true", - PinnedProjectIds: "project1,project2", - Value: 200, - }, - "vmware-host-disk": { - ComputeHost: "vmware-host", - Resource: "disk", - AvailabilityZone: "az1", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - PinnedProjects: "true", - PinnedProjectIds: "project1,project2", - Value: 300, - }, - } - - if len(expectedMetrics) != len(actualMetrics) { - t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(actualMetrics)) - } - - for key, expected := range expectedMetrics { - actual, ok := actualMetrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - - if !reflect.DeepEqual(expected, actual) { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go index 4c011492c..62eb44e9c 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared.go @@ -13,6 +13,7 @@ import ( const ( hostDetailsKnowledgeName = "host-details" + hostUtilizationKnowledgeName = "host-utilization" vmwareIronicHypervisorType = "ironic" hypervisorFamilyVMware = "vmware" vmwareComputeHostPattern = "nova-compute-%" @@ -40,7 +41,6 @@ func (h vmwareHost) getHostLabels() []string { h.ComputeHost, h.CPUArchitecture, h.WorkloadType, - h.HypervisorFamily, strconv.FormatBool(h.Enabled), strconv.FormatBool(h.Decommissioned), strconv.FormatBool(h.ExternalCustomer), @@ -55,7 +55,6 @@ var vmwareHostLabels = []string{ "compute_host", "cpu_architecture", "workload_type", - "hypervisor_family", "enabled", "decommissioned", "external_customer", diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go index dc720d159..351fedc50 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go @@ -3,7 +3,94 @@ package infrastructure -import "testing" +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" +) + +func mockVMwareHostLabels(computeHost, az string) map[string]string { + return map[string]string{ + "availability_zone": az, + "compute_host": computeHost, + "cpu_architecture": "", + "workload_type": "", + "enabled": "false", + "decommissioned": "false", + "external_customer": "false", + "disabled_reason": "-", + "pinned_projects": "false", + "pinned_project_ids": "", + } +} + +func TestVMwareHostGetHostLabels(t *testing.T) { + str := func(s string) *string { return &s } + + tests := []struct { + name string + host vmwareHost + want []string + }{ + { + name: "all optional fields nil", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az1", + ComputeHost: "nova-compute-1", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: true, + Decommissioned: false, + ExternalCustomer: false, + DisabledReason: nil, + PinnedProjects: nil, + }}, + want: []string{"az1", "nova-compute-1", "cascade-lake", "general-purpose", "true", "false", "false", "-", "false", ""}, + }, + { + name: "disabled reason set", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az2", + ComputeHost: "nova-compute-2", + DisabledReason: str("scheduled-maintenance"), + }}, + want: []string{"az2", "nova-compute-2", "", "", "false", "false", "false", "scheduled-maintenance", "false", ""}, + }, + { + name: "pinned projects set", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az1", + ComputeHost: "nova-compute-3", + PinnedProjects: str("proj-a,proj-b"), + }}, + want: []string{"az1", "nova-compute-3", "", "", "false", "false", "false", "-", "true", "proj-a,proj-b"}, + }, + { + name: "decommissioned and external customer", + host: vmwareHost{compute.HostDetails{ + AvailabilityZone: "az3", + ComputeHost: "nova-compute-4", + Decommissioned: true, + ExternalCustomer: true, + }}, + want: []string{"az3", "nova-compute-4", "", "", "false", "true", "true", "-", "false", ""}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.host.getHostLabels() + if len(got) != len(vmwareHostLabels) { + t.Fatalf("getHostLabels() returned %d values, want %d (matching vmwareHostLabels)", len(got), len(vmwareHostLabels)) + } + for i, want := range tt.want { + if got[i] != want { + t.Errorf("label[%d] (%s) = %q, want %q", i, vmwareHostLabels[i], got[i], want) + } + } + }) + } +} func TestIsKVMFlavor(t *testing.T) { tests := []struct { diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go new file mode 100644 index 000000000..c7976db3a --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity.go @@ -0,0 +1,119 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "context" + "log/slog" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type VMwareHostCapacityKPI struct { + plugins.BaseKPI[struct{}] + + capacityUsagePerHost *prometheus.Desc + capacityTotalPerHost *prometheus.Desc +} + +func (k *VMwareHostCapacityKPI) GetName() string { + return "vmware_host_capacity_kpi" +} + +func (k *VMwareHostCapacityKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error { + if err := k.BaseKPI.Init(dbConn, c, opts); err != nil { + return err + } + k.capacityUsagePerHost = prometheus.NewDesc( + "cortex_vmware_host_capacity_usage", + "Capacity usage per VMware host. CPU in vCPUs, memory and disk in bytes.", + append(vmwareHostLabels, "resource"), nil, + ) + k.capacityTotalPerHost = prometheus.NewDesc( + "cortex_vmware_host_capacity_total", + "Total allocatable capacity per VMware host. CPU in vCPUs, memory and disk in bytes.", + append(vmwareHostLabels, "resource"), nil, + ) + return nil +} + +func (k *VMwareHostCapacityKPI) Describe(ch chan<- *prometheus.Desc) { + ch <- k.capacityUsagePerHost + ch <- k.capacityTotalPerHost +} + +func (k *VMwareHostCapacityKPI) Collect(ch chan<- prometheus.Metric) { + hosts, err := k.getVMwareHosts() + if err != nil { + slog.Error("vmware_host_capacity: failed to get vmware hosts", "error", err) + return + } + utilizations, err := k.getHostUtilizations() + if err != nil { + slog.Error("vmware_host_capacity: failed to get host utilizations", "error", err) + return + } + for _, host := range hosts { + util, ok := utilizations[host.ComputeHost] + if !ok { + slog.Warn("vmware_host_capacity: missing utilization for host", "compute_host", host.ComputeHost) + continue + } + + labels := host.getHostLabels() + + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.VCPUsUsed, append(labels, "cpu")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.RAMUsedMB*1024*1024, append(labels, "ram")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerHost, prometheus.GaugeValue, util.DiskUsedGB*1024*1024*1024, append(labels, "disk")...) + + ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalVCPUsAllocatable, append(labels, "cpu")...) + ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalRAMAllocatableMB*1024*1024, append(labels, "ram")...) + ch <- prometheus.MustNewConstMetric(k.capacityTotalPerHost, prometheus.GaugeValue, util.TotalDiskAllocatableGB*1024*1024*1024, append(labels, "disk")...) + } +} + +func (k *VMwareHostCapacityKPI) getVMwareHosts() ([]vmwareHost, error) { + knowledge := &v1alpha1.Knowledge{} + if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostDetailsKnowledgeName}, knowledge); err != nil { + return nil, err + } + details, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw) + if err != nil { + return nil, err + } + hosts := make([]vmwareHost, 0, len(details)) + for _, d := range details { + if d.HypervisorType == vmwareIronicHypervisorType || d.HypervisorFamily != hypervisorFamilyVMware { + continue + } + hosts = append(hosts, vmwareHost{HostDetails: d}) + } + return hosts, nil +} + +func (k *VMwareHostCapacityKPI) getHostUtilizations() (map[string]compute.HostUtilization, error) { + knowledge := &v1alpha1.Knowledge{} + if err := k.Client.Get(context.Background(), client.ObjectKey{Name: hostUtilizationKnowledgeName}, knowledge); err != nil { + return nil, err + } + utils, err := v1alpha1.UnboxFeatureList[compute.HostUtilization](knowledge.Status.Raw) + if err != nil { + return nil, err + } + m := make(map[string]compute.HostUtilization, len(utils)) + for _, u := range utils { + if u.TotalVCPUsAllocatable == 0 || u.TotalRAMAllocatableMB == 0 || u.TotalDiskAllocatableGB == 0 { + slog.Warn("vmware_host_capacity: skipping host with zero allocatable resources", "compute_host", u.ComputeHost) + continue + } + m[u.ComputeHost] = u + } + return m, nil +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go new file mode 100644 index 000000000..f0a025db4 --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_host_capacity_test.go @@ -0,0 +1,335 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func buildHostCapacityClient(t *testing.T, hostDetails []compute.HostDetails, utilizations []compute.HostUtilization) *fake.ClientBuilder { + t.Helper() + scheme, err := v1alpha1.SchemeBuilder.Build() + if err != nil { + t.Fatalf("failed to build scheme: %v", err) + } + rawDetails, err := v1alpha1.BoxFeatureList(hostDetails) + if err != nil { + t.Fatalf("failed to box host details: %v", err) + } + rawUtils, err := v1alpha1.BoxFeatureList(utilizations) + if err != nil { + t.Fatalf("failed to box host utilizations: %v", err) + } + return fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects( + &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: hostDetailsKnowledgeName}, + Status: v1alpha1.KnowledgeStatus{Raw: rawDetails}, + }, + &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: hostUtilizationKnowledgeName}, + Status: v1alpha1.KnowledgeStatus{Raw: rawUtils}, + }, + ) +} + +func TestVMwareHostCapacityKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + kpi := &VMwareHostCapacityKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} + +func TestVMwareHostCapacityKPI_getVMwareHosts(t *testing.T) { + hostDetails := []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware}, + {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware}, + {ComputeHost: "nova-compute-ironic-1", HypervisorType: vmwareIronicHypervisorType, HypervisorFamily: hypervisorFamilyVMware}, + {ComputeHost: "nova-compute-3", HypervisorFamily: "other"}, + } + + client := buildHostCapacityClient(t, hostDetails, nil) + kpi := &VMwareHostCapacityKPI{} + kpi.Client = client.Build() + + hosts, err := kpi.getVMwareHosts() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if len(hosts) != 2 { + t.Fatalf("expected 2 hosts, got %d", len(hosts)) + } + seen := make(map[string]bool) + for _, h := range hosts { + seen[h.ComputeHost] = true + } + for _, name := range []string{"nova-compute-1", "nova-compute-2"} { + if !seen[name] { + t.Errorf("expected host %q in result", name) + } + } +} + +func TestVMwareHostCapacityKPI_getHostUtilizations(t *testing.T) { + tests := []struct { + name string + utilizations []compute.HostUtilization + expectedHosts []string + }{ + { + name: "normal utilizations are returned", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + {ComputeHost: "h2", TotalVCPUsAllocatable: 20, TotalRAMAllocatableMB: 2048, TotalDiskAllocatableGB: 200}, + }, + expectedHosts: []string{"h1", "h2"}, + }, + { + name: "zero TotalVCPUsAllocatable is skipped", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 0, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + }, + expectedHosts: []string{}, + }, + { + name: "zero TotalRAMAllocatableMB is skipped", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 0, TotalDiskAllocatableGB: 100}, + }, + expectedHosts: []string{}, + }, + { + name: "zero TotalDiskAllocatableGB is skipped", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 0}, + }, + expectedHosts: []string{}, + }, + { + name: "mix of valid and zero-allocatable entries", + utilizations: []compute.HostUtilization{ + {ComputeHost: "h1", TotalVCPUsAllocatable: 10, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + {ComputeHost: "h2", TotalVCPUsAllocatable: 0, TotalRAMAllocatableMB: 1024, TotalDiskAllocatableGB: 100}, + }, + expectedHosts: []string{"h1"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client := buildHostCapacityClient(t, nil, tt.utilizations) + kpi := &VMwareHostCapacityKPI{} + kpi.Client = client.Build() + + m, err := kpi.getHostUtilizations() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(m) != len(tt.expectedHosts) { + t.Fatalf("expected %d entries, got %d: %v", len(tt.expectedHosts), len(m), m) + } + for _, host := range tt.expectedHosts { + if _, ok := m[host]; !ok { + t.Errorf("expected host %q in result", host) + } + } + }) + } +} + +func TestVMwareHostCapacityKPI_Collect(t *testing.T) { + tests := []struct { + name string + hostDetails []compute.HostDetails + utilizations []compute.HostUtilization + expectedMetrics []collectedVMwareMetric + }{ + { + name: "single host emits usage and total metrics", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + { + ComputeHost: "nova-compute-1", + VCPUsUsed: 4, + TotalVCPUsAllocatable: 16, + RAMUsedMB: 2048, + TotalRAMAllocatableMB: 8192, + DiskUsedGB: 50, + TotalDiskAllocatableGB: 500, + }, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 4}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 50 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 16}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 8192 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 500 * 1024 * 1024 * 1024}, + }, + }, + { + name: "multiple hosts each emit their own metrics", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + {ComputeHost: "nova-compute-2", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az2"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + {ComputeHost: "nova-compute-2", VCPUsUsed: 6, TotalVCPUsAllocatable: 12, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200}, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "cpu"), Value: 6}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "ram"), Value: 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-2", "az2", "disk"), Value: 20 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "cpu"), Value: 12}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "ram"), Value: 4096 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-2", "az2", "disk"), Value: 200 * 1024 * 1024 * 1024}, + }, + }, + { + name: "ironic hosts are excluded", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + {ComputeHost: "nova-compute-ironic-1", HypervisorType: vmwareIronicHypervisorType, HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + {ComputeHost: "nova-compute-ironic-1", VCPUsUsed: 4, TotalVCPUsAllocatable: 16, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200}, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024}, + }, + }, + { + name: "non-vmware hosts are excluded", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + {ComputeHost: "nova-compute-2", HypervisorFamily: "kvm", AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 8, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + {ComputeHost: "nova-compute-2", VCPUsUsed: 4, TotalVCPUsAllocatable: 16, RAMUsedMB: 1024, TotalRAMAllocatableMB: 4096, DiskUsedGB: 20, TotalDiskAllocatableGB: 200}, + }, + expectedMetrics: []collectedVMwareMetric{ + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 2}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 512 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_usage", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 10 * 1024 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "cpu"), Value: 8}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "ram"), Value: 2048 * 1024 * 1024}, + {Name: "cortex_vmware_host_capacity_total", Labels: hostCapacityLabels("nova-compute-1", "az1", "disk"), Value: 100 * 1024 * 1024 * 1024}, + }, + }, + { + name: "host without matching utilization produces no metrics", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{}, + expectedMetrics: []collectedVMwareMetric{}, + }, + { + name: "utilization with zero allocatable resources is skipped", + hostDetails: []compute.HostDetails{ + {ComputeHost: "nova-compute-1", HypervisorFamily: hypervisorFamilyVMware, AvailabilityZone: "az1"}, + }, + utilizations: []compute.HostUtilization{ + {ComputeHost: "nova-compute-1", VCPUsUsed: 2, TotalVCPUsAllocatable: 0, RAMUsedMB: 512, TotalRAMAllocatableMB: 2048, DiskUsedGB: 10, TotalDiskAllocatableGB: 100}, + }, + expectedMetrics: []collectedVMwareMetric{}, + }, + { + name: "no hosts produces no metrics", + hostDetails: []compute.HostDetails{}, + utilizations: []compute.HostUtilization{}, + expectedMetrics: []collectedVMwareMetric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + client := buildHostCapacityClient(t, tt.hostDetails, tt.utilizations) + kpi := &VMwareHostCapacityKPI{} + if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + + ch := make(chan prometheus.Metric, 200) + kpi.Collect(ch) + close(ch) + + actual := make(map[string]collectedVMwareMetric) + for m := range ch { + var pm prometheusgo.Metric + if err := m.Write(&pm); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + labels := make(map[string]string) + for _, lbl := range pm.Label { + labels[lbl.GetName()] = lbl.GetValue() + } + name := getMetricName(m.Desc().String()) + key := name + "|" + labels["compute_host"] + "|" + labels["resource"] + if _, exists := actual[key]; exists { + t.Fatalf("duplicate metric key %q", key) + } + actual[key] = collectedVMwareMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()} + } + + if len(actual) != len(tt.expectedMetrics) { + t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual) + } + for _, exp := range tt.expectedMetrics { + key := exp.Name + "|" + exp.Labels["compute_host"] + "|" + exp.Labels["resource"] + got, ok := actual[key] + if !ok { + t.Errorf("missing metric %q", key) + continue + } + if got.Value != exp.Value { + t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value) + } + if !reflect.DeepEqual(exp.Labels, got.Labels) { + t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels) + } + } + }) + } +} + +func hostCapacityLabels(computeHost, az, resource string) map[string]string { + labels := mockVMwareHostLabels(computeHost, az) + labels["resource"] = resource + return labels +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go index 9f6d84786..4c43c893b 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go @@ -33,24 +33,8 @@ func buildMetricKey(name string, labels map[string]string) string { } } -func hostLabels(computeHost, az string) map[string]string { - return map[string]string{ - "availability_zone": az, - "compute_host": computeHost, - "cpu_architecture": "", - "workload_type": "", - "hypervisor_family": "vmware", - "enabled": "false", - "decommissioned": "false", - "external_customer": "false", - "disabled_reason": "-", - "pinned_projects": "false", - "pinned_project_ids": "", - } -} - func instanceMetric(computeHost, az, projectID, projectName, flavorName string, value float64) collectedVMwareMetric { - labels := hostLabels(computeHost, az) + labels := mockVMwareHostLabels(computeHost, az) labels["project_id"] = projectID labels["project_name"] = projectName labels["flavor_name"] = flavorName @@ -58,7 +42,7 @@ func instanceMetric(computeHost, az, projectID, projectName, flavorName string, } func capacityMetric(computeHost, az, projectID, projectName, resource string, value float64) collectedVMwareMetric { - labels := hostLabels(computeHost, az) + labels := mockVMwareHostLabels(computeHost, az) labels["project_id"] = projectID labels["project_name"] = projectName labels["resource"] = resource diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index 19726a488..63a35866b 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -16,7 +16,6 @@ var supportedKPIs = map[string]plugins.KPI{ "kvm_host_capacity_kpi": &compute.KVMResourceCapacityKPI{}, "vmware_host_contention_kpi": &compute.VMwareHostContentionKPI{}, "vmware_project_noisiness_kpi": &compute.VMwareProjectNoisinessKPI{}, - "vmware_host_capacity_kpi": &compute.VMwareResourceCapacityKPI{}, "host_running_vms_kpi": &compute.HostRunningVMsKPI{}, "flavor_running_vms_kpi": &compute.FlavorRunningVMsKPI{}, "vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{}, @@ -26,6 +25,7 @@ var supportedKPIs = map[string]plugins.KPI{ "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, "vmware_resource_commitments_kpi": &infrastructure.VMwareResourceCommitmentsKPI{}, + "vmware_host_capacity_kpi": &infrastructure.VMwareHostCapacityKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},