diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index 3234fcc4a..22774c62a 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -188,31 +188,31 @@ spec: apiVersion: cortex.cloud/v1alpha1 kind: KPI metadata: - name: vmware-commitments + name: vmware-project-utilization spec: schedulingDomain: nova - impl: vmware_commitments_kpi + impl: vmware_project_utilization_kpi dependencies: datasources: - - name: limes-project-commitments - - name: nova-flavors - name: nova-servers + - name: nova-flavors + - name: identity-projects + knowledges: + - name: host-details description: | - This KPI tracks unused VMware commitments based on project commitments and usage. + This KPI tracks the resource utilization of projects running VMs on VMware hosts. --- apiVersion: cortex.cloud/v1alpha1 kind: KPI metadata: - name: vmware-project-utilization + name: vmware-resource-commitments spec: schedulingDomain: nova - impl: vmware_project_utilization_kpi + impl: vmware_resource_commitments_kpi dependencies: datasources: - name: nova-servers - name: nova-flavors - - name: identity-projects - knowledges: - - name: host-details + - name: limes-project-commitments description: | - This KPI tracks the resource utilization of projects running VMs on VMware hosts. \ No newline at end of file + This KPI tracks the resource commitments of projects running VMs on VMware hosts. 
\ No newline at end of file diff --git a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go b/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go deleted file mode 100644 index 74cde06a0..000000000 --- a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware.go +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "log/slog" - "strings" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes" - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type VMwareResourceCommitmentsKPI struct { - // Common base for all KPIs that provides standard functionality. 
- plugins.BaseKPI[struct{}] // No options passed through yaml config - - unusedInstanceCommitments *prometheus.Desc -} - -func (VMwareResourceCommitmentsKPI) GetName() string { - return "vmware_commitments_kpi" -} - -func (k *VMwareResourceCommitmentsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { - if err := k.BaseKPI.Init(db, client, opts); err != nil { - return err - } - k.unusedInstanceCommitments = prometheus.NewDesc( - "cortex_vmware_hana_unused_instance_commitments", - "Unused instance commitment capacity summed across all projects (vcpus / ram_mb / disk_gb).", - []string{ - "resource", // "cpu", "ram", "disk" - "availability_zone", - "cpu_architecture", // "sapphire-rapids" (_v2 suffix) or "cascade-lake" - }, - nil, - ) - return nil -} - -func (k *VMwareResourceCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) { - ch <- k.unusedInstanceCommitments -} - -func (k *VMwareResourceCommitmentsKPI) Collect(ch chan<- prometheus.Metric) { - k.collectUnusedCommitments(ch) -} - -// getRunningHANAServers loads all running HANA servers from the database. We consider a server "running" if its status is not DELETED or ERROR. -func (k *VMwareResourceCommitmentsKPI) getRunningHANAServers() ([]nova.Server, error) { - // Load running HANA servers (non-deleted, non-error). - var servers []nova.Server - if _, err := k.DB.Select(&servers, ` - SELECT * FROM `+nova.Server{}.TableName()+` - WHERE flavor_name LIKE 'hana_%' - AND status NOT IN ('DELETED', 'ERROR') - `); err != nil { - return nil, err - } - return servers, nil -} - -// getFlavorsByName loads all flavors from the database and returns a map of flavor name to flavor struct for easy lookup. 
-func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) { - var flavors []nova.Flavor - if _, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()); err != nil { - return nil, err - } - flavorsByName := make(map[string]nova.Flavor, len(flavors)) - for _, flavor := range flavors { - flavorsByName[flavor.Name] = flavor - } - return flavorsByName, nil -} - -// getInstanceCommitments loads all confirmed or guaranteed instance commitments from the database. -func (k *VMwareResourceCommitmentsKPI) getInstanceCommitments() ([]limes.Commitment, error) { - var commitments []limes.Commitment - if _, err := k.DB.Select(&commitments, ` - SELECT * FROM `+limes.Commitment{}.TableName()+` - WHERE service_type = 'compute' - AND resource_name LIKE 'instances_%' - AND status IN ('confirmed', 'guaranteed') - `); err != nil { - return nil, err - } - return commitments, nil -} - -// cpuArchitectureForFlavor returns the CPU architecture label for a HANA flavor name. -// Flavors with a "_v2" suffix run on sapphire-rapids; all others are cascade-lake. -func cpuArchitectureForFlavor(flavorName string) string { - if strings.HasSuffix(flavorName, "_v2") { - return "sapphire-rapids" - } - return "cascade-lake" -} - -// resourceKey identifies an aggregated capacity bucket by (resource, az, architecture). -type resourceKey struct{ resource, az, architecture string } - -// calculateUnusedInstanceCapacity computes per-(resource, az, architecture) capacity sums for unused -// HANA VMware commitments. It filters out non-HANA and KVM (hana_k_) commitments, then for each -// (project, flavor, az, architecture) bucket subtracts running servers from committed amount; over-used -// buckets are clamped to zero and omitted from the result. 
-func calculateUnusedInstanceCapacity( - commitments []limes.Commitment, - servers []nova.Server, - flavorsByName map[string]nova.Flavor, -) map[resourceKey]float64 { - // running: (project_id, flavor_name, az) -> count of non-deleted/non-error servers. - type serverCountKey struct{ projectID, flavorName, az string } - running := make(map[serverCountKey]uint64, len(servers)) - for _, s := range servers { - running[serverCountKey{s.TenantID, s.FlavorName, s.OSEXTAvailabilityZone}]++ - } - - // committed: (project_id, flavor_name, az, cpuArchitecture) -> total committed amount. - type commitmentKey struct{ projectID, flavorName, az, cpuArchitecture string } - committed := make(map[commitmentKey]uint64) - for _, c := range commitments { - flavorName := strings.TrimPrefix(c.ResourceName, "instances_") - if !strings.HasPrefix(flavorName, "hana_") { - continue - } - if strings.HasPrefix(flavorName, "hana_k_") { - slog.Debug("unused_commitments: skipping hana kvm commitment", "flavor", flavorName, "project_id", c.ProjectID) - continue - } - key := commitmentKey{c.ProjectID, flavorName, c.AvailabilityZone, cpuArchitectureForFlavor(flavorName)} - committed[key] += c.Amount - } - - sum := make(map[resourceKey]float64) - for ck, total := range committed { - run := running[serverCountKey{ck.projectID, ck.flavorName, ck.az}] - if run >= total { - continue - } - unused := total - run - flavor, ok := flavorsByName[ck.flavorName] - if !ok { - slog.Warn("unused_commitments: flavor not found in flavor table", "flavor", ck.flavorName) - continue - } - sum[resourceKey{"cpu", ck.az, ck.cpuArchitecture}] += float64(unused) * float64(flavor.VCPUs) - sum[resourceKey{"ram", ck.az, ck.cpuArchitecture}] += float64(unused) * float64(flavor.RAM) - sum[resourceKey{"disk", ck.az, ck.cpuArchitecture}] += float64(unused) * float64(flavor.Disk) - } - return sum -} - -func (k *VMwareResourceCommitmentsKPI) collectUnusedCommitments(ch chan<- prometheus.Metric) { - if k.DB == nil { - return - } - - // 
Load confirmed/guaranteed instance commitments. - commitments, err := k.getInstanceCommitments() - if err != nil { - slog.Error("unused_commitments: failed to load commitments", "err", err) - return - } - - // Load flavors for capacity lookup. - flavorsByName, err := k.getFlavorsByName() - if err != nil { - slog.Error("unused_commitments: failed to load flavors", "err", err) - return - } - - // Load running HANA servers. - servers, err := k.getRunningHANAServers() - if err != nil { - slog.Error("unused_commitments: failed to get running HANA servers", "err", err) - return - } - - sumByResource := calculateUnusedInstanceCapacity(commitments, servers, flavorsByName) - - for rk, value := range sumByResource { - ch <- prometheus.MustNewConstMetric( - k.unusedInstanceCommitments, - prometheus.GaugeValue, - value, - rk.resource, - rk.az, - rk.architecture, - ) - } -} diff --git a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go b/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go deleted file mode 100644 index 90a1abd3b..000000000 --- a/internal/knowledge/kpis/plugins/compute/resource_commitments_vmware_test.go +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "reflect" - "testing" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes" - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestCPUArchitectureForFlavor(t *testing.T) { - 
tests := []struct { - flavorName string - want string - }{ - {"hana_small", "cascade-lake"}, - {"hana_large", "cascade-lake"}, - {"hana_small_v2", "sapphire-rapids"}, - {"hana_large_v2", "sapphire-rapids"}, - {"hana_v2_extra", "cascade-lake"}, // _v2 must be a suffix - {"hana_x_v2", "sapphire-rapids"}, - } - for _, tt := range tests { - t.Run(tt.flavorName, func(t *testing.T) { - got := cpuArchitectureForFlavor(tt.flavorName) - if got != tt.want { - t.Errorf("cpuArchitectureForFlavor(%q) = %q, want %q", tt.flavorName, got, tt.want) - } - }) - } -} - -func TestCalculateUnusedInstanceCapacity(t *testing.T) { - flavors := map[string]nova.Flavor{ - "hana_small": {VCPUs: 4, RAM: 16384, Disk: 100}, - "hana_large_v2": {VCPUs: 16, RAM: 65536, Disk: 400}, - } - - t.Run("basic unused", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3}, - } - servers := []nova.Server{ - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, // 1 running -> 2 unused - } - got := calculateUnusedInstanceCapacity(commitments, servers, flavors) - - if got[resourceKey{"cpu", "az1", "cascade-lake"}] != 8 { // 2 * 4 - t.Errorf("expected cpu=8, got %v", got[resourceKey{"cpu", "az1", "cascade-lake"}]) - } - if got[resourceKey{"ram", "az1", "cascade-lake"}] != 32768 { // 2 * 16384 - t.Errorf("expected ram=32768, got %v", got[resourceKey{"ram", "az1", "cascade-lake"}]) - } - if got[resourceKey{"disk", "az1", "cascade-lake"}] != 200 { // 2 * 100 - t.Errorf("expected disk=200, got %v", got[resourceKey{"disk", "az1", "cascade-lake"}]) - } - }) - - t.Run("non-hana and kvm commitments are skipped", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_k_foo", AvailabilityZone: "az1", Amount: 5}, - {ProjectID: "p2", ResourceName: "instances_general_medium", AvailabilityZone: "az1", Amount: 3}, - } - got := 
calculateUnusedInstanceCapacity(commitments, nil, flavors) - if len(got) != 0 { - t.Errorf("expected no metrics for kvm/non-hana commitments, got %v", got) - } - }) - - t.Run("amounts for the same key are summed", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3}, - {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2}, - } - got := calculateUnusedInstanceCapacity(commitments, nil, flavors) // nil servers -> all unused - if got[resourceKey{"cpu", "az1", "cascade-lake"}] != 20 { // 5 * 4 - t.Errorf("expected cpu=20 for summed commitments, got %v", got[resourceKey{"cpu", "az1", "cascade-lake"}]) - } - }) - - t.Run("over-used bucket emits no metric", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2}, - } - servers := []nova.Server{ // 5 running > 2 committed - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - } - got := calculateUnusedInstanceCapacity(commitments, servers, flavors) - if len(got) != 0 { - t.Errorf("expected no metrics for over-used bucket, got %v", got) - } - }) - - t.Run("exactly-used bucket emits no metric", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3}, - } - servers := []nova.Server{ // 3 running == 3 committed - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - {TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - {TenantID: "p1", 
FlavorName: "hana_small", OSEXTAvailabilityZone: "az1"}, - } - got := calculateUnusedInstanceCapacity(commitments, servers, flavors) - if len(got) != 0 { - t.Errorf("expected no metrics for fully-used bucket, got %v", got) - } - }) - - t.Run("unknown flavor is skipped", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_unknown", AvailabilityZone: "az1", Amount: 3}, - } - got := calculateUnusedInstanceCapacity(commitments, nil, flavors) - if len(got) != 0 { - t.Errorf("expected no metrics for unknown flavor, got %v", got) - } - }) - - t.Run("multiple keys aggregated correctly", func(t *testing.T) { - commitments := []limes.Commitment{ - {ProjectID: "p1", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2}, - {ProjectID: "p2", ResourceName: "instances_hana_large_v2", AvailabilityZone: "az1", Amount: 1}, - } - got := calculateUnusedInstanceCapacity(commitments, nil, flavors) // nil running -> all unused - - if got[resourceKey{"cpu", "az1", "cascade-lake"}] != 8 { // 2 * 4 - t.Errorf("expected cpu cascade-lake=8, got %v", got[resourceKey{"cpu", "az1", "cascade-lake"}]) - } - if got[resourceKey{"cpu", "az1", "sapphire-rapids"}] != 16 { // 1 * 16 - t.Errorf("expected cpu sapphire-rapids=16, got %v", got[resourceKey{"cpu", "az1", "sapphire-rapids"}]) - } - }) -} - -func TestVMwareResourceCommitmentsKPI_CollectHanaUnusedCommitments(t *testing.T) { - scheme, err := v1alpha1.SchemeBuilder.Build() - if err != nil { - t.Fatalf("expected no error building scheme, got %v", err) - } - - dbEnv := testlibDB.SetupDBEnv(t) - testDB := db.DB{DbMap: dbEnv.DbMap} - defer dbEnv.Close() - - if err := testDB.CreateTable( - testDB.AddTable(limes.Commitment{}), - testDB.AddTable(nova.Flavor{}), - testDB.AddTable(nova.Server{}), - ); err != nil { - t.Fatalf("expected no error creating tables, got %v", err) - } - - // Flavors: hana_small (4 vcpu, 16384 MB ram, 100 GB disk) - // hana_large_v2 (16 vcpu, 65536 MB ram, 
400 GB disk) - if err := testDB.Insert( - &nova.Flavor{ID: "f1", Name: "hana_small", VCPUs: 4, RAM: 16384, Disk: 100}, - &nova.Flavor{ID: "f2", Name: "hana_large_v2", VCPUs: 16, RAM: 65536, Disk: 400}, - &nova.Flavor{ID: "f3", Name: "general_medium", VCPUs: 8, RAM: 32768, Disk: 200}, - ); err != nil { - t.Fatalf("expected no error inserting flavors, got %v", err) - } - - // Commitments across two AZs to verify per-AZ aggregation: - // project-A: 3 x hana_small in az1 (cascade-lake) - // project-B: 2 x hana_large_v2 in az1 (sapphire-rapids) - // project-A: 4 x hana_small in az2 (cascade-lake) — separate AZ bucket - // project-C: 1 x hana_k_foo in az1 — hana_k_ prefix, should be skipped - // project-D: 1 x general_medium — not hana_, should be skipped - // project-A: 10 x hana_small pending — should be excluded (wrong status) - // project-E: 2 x hana_small in az1 — running will exceed this (over-used, no metric) - // project-F: 3 x hana_large_v2 in az2 — running exactly equals this (fully used, no metric) - if err := testDB.Insert( - &limes.Commitment{ID: 1, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3, Status: "confirmed", ProjectID: "project-A"}, - &limes.Commitment{ID: 2, ServiceType: "compute", ResourceName: "instances_hana_large_v2", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "project-B"}, - &limes.Commitment{ID: 3, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az2", Amount: 4, Status: "guaranteed", ProjectID: "project-A"}, - &limes.Commitment{ID: 4, ServiceType: "compute", ResourceName: "instances_hana_k_foo", AvailabilityZone: "az1", Amount: 5, Status: "confirmed", ProjectID: "project-C"}, - &limes.Commitment{ID: 5, ServiceType: "compute", ResourceName: "instances_general_medium", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "project-D"}, - &limes.Commitment{ID: 6, ServiceType: "compute", ResourceName: "instances_hana_small", 
AvailabilityZone: "az1", Amount: 10, Status: "pending", ProjectID: "project-A"}, - &limes.Commitment{ID: 7, ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "project-E"}, - &limes.Commitment{ID: 8, ServiceType: "compute", ResourceName: "instances_hana_large_v2", AvailabilityZone: "az2", Amount: 3, Status: "confirmed", ProjectID: "project-F"}, - ); err != nil { - t.Fatalf("expected no error inserting commitments, got %v", err) - } - - // Running servers: - // project-A/az1: 1 hana_small ACTIVE, 1 DELETED (ignored) -> 2 unused in az1 - // project-B/az1: 0 hana_large_v2 -> 2 unused in az1 - // project-A/az2: 1 hana_small ACTIVE -> 3 unused in az2 - // project-E/az1: 5 hana_small ACTIVE -> 5 > 2 committed -> 0 unused (over-used, clamped) - // project-F/az2: 3 hana_large_v2 ACTIVE -> 3 == 3 committed -> 0 unused (fully used, clamped) - if err := testDB.Insert( - &nova.Server{ID: "s1", TenantID: "project-A", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, - &nova.Server{ID: "s2", TenantID: "project-A", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "DELETED"}, - &nova.Server{ID: "s3", TenantID: "project-A", FlavorName: "hana_small", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"}, - &nova.Server{ID: "s4", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, - &nova.Server{ID: "s5", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, - &nova.Server{ID: "s6", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, - &nova.Server{ID: "s7", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, - &nova.Server{ID: "s8", TenantID: "project-E", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, - &nova.Server{ID: "s9", TenantID: "project-F", FlavorName: 
"hana_large_v2", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"}, - &nova.Server{ID: "s10", TenantID: "project-F", FlavorName: "hana_large_v2", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"}, - &nova.Server{ID: "s11", TenantID: "project-F", FlavorName: "hana_large_v2", OSEXTAvailabilityZone: "az2", Status: "ACTIVE"}, - ); err != nil { - t.Fatalf("expected no error inserting servers, got %v", err) - } - - k8sClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects( - &v1alpha1.Knowledge{ObjectMeta: v1.ObjectMeta{Name: "host-details"}}, - ). - Build() - - kpi := &VMwareResourceCommitmentsKPI{} - if err := kpi.Init(&testDB, k8sClient, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type UnusedMetric struct { - Resource string - AZ string - Arch string - Value float64 - } - - actual := make(map[string]UnusedMetric) - for metric := range ch { - if getMetricName(metric.Desc().String()) != "cortex_vmware_hana_unused_instance_commitments" { - continue - } - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - labels := make(map[string]string) - for _, lbl := range m.Label { - labels[lbl.GetName()] = lbl.GetValue() - } - key := labels["resource"] + "/" + labels["availability_zone"] + "/" + labels["cpu_architecture"] - if _, exists := actual[key]; exists { - t.Fatalf("duplicate metric key %q (resource=%q, availability_zone=%q, cpu_architecture=%q)", - key, labels["resource"], labels["availability_zone"], labels["cpu_architecture"]) - } - actual[key] = UnusedMetric{ - Resource: labels["resource"], - AZ: labels["availability_zone"], - Arch: labels["cpu_architecture"], - Value: m.GetGauge().GetValue(), - } - } - - // project-A/az1: 2 unused hana_small (cascade-lake) -> cpu=2*4=8, ram=2*16384=32768, disk=2*100=200 - // project-B/az1: 2 unused hana_large_v2 (sapphire-rapids) -> 
cpu=2*16=32, ram=2*65536=131072, disk=2*400=800 - // project-A/az2: 3 unused hana_small (cascade-lake) -> cpu=3*4=12, ram=3*16384=49152, disk=3*100=300 - // project-E/az1: 5 running > 2 committed hana_small -> clamped to 0, no metric emitted - // project-F/az2: 3 running == 3 committed hana_large_v2 -> clamped to 0, no metric emitted - expected := map[string]UnusedMetric{ - "cpu/az1/cascade-lake": {Resource: "cpu", AZ: "az1", Arch: "cascade-lake", Value: 8}, - "ram/az1/cascade-lake": {Resource: "ram", AZ: "az1", Arch: "cascade-lake", Value: 32768}, - "disk/az1/cascade-lake": {Resource: "disk", AZ: "az1", Arch: "cascade-lake", Value: 200}, - "cpu/az1/sapphire-rapids": {Resource: "cpu", AZ: "az1", Arch: "sapphire-rapids", Value: 32}, - "ram/az1/sapphire-rapids": {Resource: "ram", AZ: "az1", Arch: "sapphire-rapids", Value: 131072}, - "disk/az1/sapphire-rapids": {Resource: "disk", AZ: "az1", Arch: "sapphire-rapids", Value: 800}, - "cpu/az2/cascade-lake": {Resource: "cpu", AZ: "az2", Arch: "cascade-lake", Value: 12}, - "ram/az2/cascade-lake": {Resource: "ram", AZ: "az2", Arch: "cascade-lake", Value: 49152}, - "disk/az2/cascade-lake": {Resource: "disk", AZ: "az2", Arch: "cascade-lake", Value: 300}, - } - - if len(actual) != len(expected) { - t.Errorf("expected %d metrics, got %d: %v", len(expected), len(actual), actual) - } - for key, exp := range expected { - got, ok := actual[key] - if !ok { - t.Errorf("missing metric %q", key) - continue - } - if !reflect.DeepEqual(exp, got) { - t.Errorf("metric %q: expected %+v, got %+v", key, exp, got) - } - } -} diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go b/internal/knowledge/kpis/plugins/infrastructure/shared.go similarity index 50% rename from internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go rename to internal/knowledge/kpis/plugins/infrastructure/shared.go index d92e8c3c2..4c011492c 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_metrics.go +++ 
b/internal/knowledge/kpis/plugins/infrastructure/shared.go @@ -4,6 +4,7 @@ package infrastructure import ( + "fmt" "regexp" "strconv" @@ -78,3 +79,57 @@ type collectedVMwareMetric struct { Labels map[string]string Value float64 } + +// kvmFlavorPattern matches KVM flavors where the second underscore-delimited +// segment is "k" (e.g. "m1_k_small", "hana_k_large"). +var kvmFlavorPattern = regexp.MustCompile(`^[^_]+_k_`) + +// isKVMFlavor reports whether name is a KVM flavor, i.e. whether it matches kvmFlavorPattern. +func isKVMFlavor(name string) bool { + return kvmFlavorPattern.MatchString(name) +} + +// cpuArchitectureRule maps a flavor name regex to a CPU architecture label. +type cpuArchitectureRule struct { + pattern *regexp.Regexp + arch string +} + +// flavorCPUArchitectureRules maps flavor name patterns to CPU architecture labels in priority order. +// The first matching rule wins; defaultCPUArchitecture is used when none match. +var flavorCPUArchitectureRules = []cpuArchitectureRule{ + {regexp.MustCompile(`_v2$`), "sapphire-rapids"}, +} + +const defaultCPUArchitecture = "cascade-lake" + +// flavorCPUArchitecture derives the CPU architecture label from a flavor name. +func flavorCPUArchitecture(flavorName string) string { + for _, rule := range flavorCPUArchitectureRules { + if rule.pattern.MatchString(flavorName) { + return rule.arch + } + } + return defaultCPUArchitecture +} + +// bytesPerUnit maps memory unit strings to their byte multipliers. Note that "MB" and "GB" use the same 1024-based multipliers as "MiB" and "GiB". +var bytesPerUnit = map[string]float64{ + "": 1, + "B": 1, + "KiB": 1024, + "MB": 1024 * 1024, + "MiB": 1024 * 1024, + "GB": 1024 * 1024 * 1024, + "GiB": 1024 * 1024 * 1024, + "TiB": 1024 * 1024 * 1024 * 1024, +} + +// bytesFromUnit converts an amount in the given unit to bytes. 
+func bytesFromUnit(amount float64, unit string) (float64, error) { + multiplier, ok := bytesPerUnit[unit] + if !ok { + return 0, fmt.Errorf("unknown memory unit: %s", unit) + } + return amount * multiplier, nil +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/shared_test.go b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go new file mode 100644 index 000000000..dc720d159 --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/shared_test.go @@ -0,0 +1,78 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import "testing" + +func TestIsKVMFlavor(t *testing.T) { + tests := []struct { + flavor string + want bool + }{ + {"m1_k_small", true}, + {"hana_k_large", true}, + {"hana_small", false}, + {"hana_c128_m1600", false}, + {"hana_c128_m1600_v2", false}, + {"small", false}, + {"m1_large", false}, + } + for _, tt := range tests { + if got := isKVMFlavor(tt.flavor); got != tt.want { + t.Errorf("isKVMFlavor(%q) = %v, want %v", tt.flavor, got, tt.want) + } + } +} + +func TestFlavorCPUArchitecture(t *testing.T) { + tests := []struct { + flavor string + want string + }{ + {"hana_c128_m1600_v2", "sapphire-rapids"}, + {"hana_c256_m3200_v2", "sapphire-rapids"}, + {"hana_c128_m1600", "cascade-lake"}, + {"hana_small", "cascade-lake"}, + } + for _, tt := range tests { + if got := flavorCPUArchitecture(tt.flavor); got != tt.want { + t.Errorf("flavorCPUArchitecture(%q) = %q, want %q", tt.flavor, got, tt.want) + } + } +} + +func TestVmwareBytesFromUnit(t *testing.T) { + tests := []struct { + amount float64 + unit string + want float64 + errMsg string + }{ + {1024, "MiB", 1024 * 1024 * 1024, ""}, + {1024, "MB", 1024 * 1024 * 1024, ""}, + {2, "GiB", 2 * 1024 * 1024 * 1024, ""}, + {2, "GB", 2 * 1024 * 1024 * 1024, ""}, + {1, "TiB", 1024 * 1024 * 1024 * 1024, ""}, + {512, "KiB", 512 * 1024, ""}, + {100, "B", 100, ""}, + {100, "", 100, ""}, + {1, "TB", 0, "unknown memory unit: TB"}, + } + for _, tt := range 
tests { + got, err := bytesFromUnit(tt.amount, tt.unit) + if tt.errMsg != "" { + if err == nil || err.Error() != tt.errMsg { + t.Errorf("vmwareBytesFromUnit(%v, %q): expected error %q, got %v", tt.amount, tt.unit, tt.errMsg, err) + } + continue + } + if err != nil { + t.Errorf("vmwareBytesFromUnit(%v, %q): unexpected error: %v", tt.amount, tt.unit, err) + continue + } + if got != tt.want { + t.Errorf("vmwareBytesFromUnit(%v, %q) = %f, want %f", tt.amount, tt.unit, got, tt.want) + } + } +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go index 2d48b9737..16fcac857 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go @@ -115,8 +115,16 @@ func (k *VMwareProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) { hostLabels := host.getHostLabels() hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName) - memoryUsageBytes := projectCapacityUsage.TotalRAMMB * 1024 * 1024 - diskUsageBytes := projectCapacityUsage.TotalDiskGB * 1024 * 1024 * 1024 + memoryUsageBytes, err := bytesFromUnit(projectCapacityUsage.TotalRAMMB, "MB") + if err != nil { + slog.Error("vmware_project_utilization: failed to convert memory to bytes", "err", err) + continue + } + diskUsageBytes, err := bytesFromUnit(projectCapacityUsage.TotalDiskGB, "GB") + if err != nil { + slog.Error("vmware_project_utilization: failed to convert disk to bytes", "err", err) + continue + } ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...) ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, memoryUsageBytes, append(hostLabels, "memory")...) 
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go new file mode 100644 index 000000000..0d3d5d3ed --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments.go @@ -0,0 +1,271 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "log/slog" + "strings" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// This KPI tracks committed resources in the VMware environment, based on commitments provided by Limes. +// For KVM we can map a commitment to a reservation on a specific host. In VMware this is not possible. +// For general purpose workloads, customers can commit specific amounts of resources. +// For HANA workloads customers commit a certain number of HANA instances (based on flavor). +// This makes it possible to determine the workload type of a commitment. +// For general purpose workloads it is not possible to differentiate the CPU architecture. To avoid weird behavior in a dashboard we don't export this label for the metric. +// For HANA flavors the CPU architecture is part of the flavor name (_v2 suffix for sapphire rapids, without suffix for cascade lake). +// For both types of workload, however, we cannot determine on which host the commitment is fulfilled. +type VMwareResourceCommitmentsKPI struct { + // BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client. 
+ plugins.BaseKPI[struct{}] + + unusedGeneralPurposeCommitmentsPerProject *prometheus.Desc + unusedHanaCommittedResourcesPerProject *prometheus.Desc +} + +func (k *VMwareResourceCommitmentsKPI) GetName() string { + return "vmware_resource_commitments_kpi" +} + +func (k *VMwareResourceCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error { + if err := k.BaseKPI.Init(dbConn, c, opts); err != nil { + return err + } + + k.unusedGeneralPurposeCommitmentsPerProject = prometheus.NewDesc( + "cortex_vmware_commitments_general_purpose", + "Committed general purpose resources that are currently unused. CPU (resource=cpu) in vCPUs, memory (resource=ram) in bytes.", + []string{"availability_zone", "resource", "project_id"}, nil, + ) + k.unusedHanaCommittedResourcesPerProject = prometheus.NewDesc( + "cortex_vmware_commitments_hana_resources", + "Total committed HANA instances capacity that is currently unused, translated to resources. CPU in vCPUs, memory and disk in bytes.", + []string{"availability_zone", "cpu_architecture", "resource", "project_id"}, nil, + ) + return nil +} + +func (k *VMwareResourceCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) { + ch <- k.unusedGeneralPurposeCommitmentsPerProject + ch <- k.unusedHanaCommittedResourcesPerProject +} + +func (k *VMwareResourceCommitmentsKPI) Collect(ch chan<- prometheus.Metric) { + if k.DB == nil { + return + } + + flavorsByName, err := k.getFlavorsByName() + if err != nil { + slog.Error("vmware_resource_commitments: failed to load flavors", "err", err) + return + } + + k.collectGeneralPurpose(ch, flavorsByName) + k.collectHana(ch, flavorsByName) +} + +// getFlavorsByName loads all flavors and returns them keyed by name. 
+func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) {
+	query := "SELECT * FROM " + nova.Flavor{}.TableName()
+
+	var rows []nova.Flavor
+	_, err := k.DB.Select(&rows, query)
+	if err != nil {
+		return nil, err
+	}
+
+	// Index by name; should two rows share a name, the later row wins.
+	index := make(map[string]nova.Flavor, len(rows))
+	for i := range rows {
+		index[rows[i].Name] = rows[i]
+	}
+	return index, nil
+}
+
+// getGeneralPurposeCommitments returns the compute commitments for the "cores" and
+// "ram" resources whose status is confirmed or guaranteed.
+func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.Commitment, error) {
+	query := `
+		SELECT * FROM ` + limes.Commitment{}.TableName() + `
+		WHERE service_type = 'compute'
+		  AND resource_name IN ('cores', 'ram')
+		  AND status IN ('confirmed', 'guaranteed')
+	`
+
+	var rows []limes.Commitment
+	_, err := k.DB.Select(&rows, query)
+	if err != nil {
+		return nil, err
+	}
+	return rows, nil
+}
+
+// getGeneralPurposeServers returns the servers used for general purpose usage
+// accounting: everything that is neither DELETED nor ERROR and whose flavor is
+// neither a HANA flavor nor a KVM flavor.
+// The KVM check happens in Go since SQL LIKE cannot express the segment-exact pattern.
+func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server, error) {
+	// Note: "_" is a single-character wildcard in LIKE, so 'hana_%' matches every
+	// name that starts with "hana" followed by at least one more character.
+	query := `
+		SELECT * FROM ` + nova.Server{}.TableName() + `
+		WHERE status NOT IN ('DELETED', 'ERROR')
+		  AND flavor_name NOT LIKE 'hana_%'
+	`
+
+	var rows []nova.Server
+	_, err := k.DB.Select(&rows, query)
+	if err != nil {
+		return nil, err
+	}
+
+	vmware := make([]nova.Server, 0, len(rows))
+	for _, server := range rows {
+		if isKVMFlavor(server.FlavorName) {
+			continue
+		}
+		vmware = append(vmware, server)
+	}
+	return vmware, nil
+}
+
+// getHanaInstanceCommitments loads confirmed/guaranteed HANA instance commitments.
+func (k *VMwareResourceCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Commitment, error) {
+	// Note: "_" is a single-character wildcard in LIKE, not a literal underscore.
+	query := `
+		SELECT * FROM ` + limes.Commitment{}.TableName() + `
+		WHERE service_type = 'compute'
+		  AND resource_name LIKE 'instances_hana_%'
+		  AND status IN ('confirmed', 'guaranteed')
+	`
+
+	var rows []limes.Commitment
+	_, err := k.DB.Select(&rows, query)
+	if err != nil {
+		return nil, err
+	}
+	return rows, nil
+}
+
+// getRunningHanaServers returns all servers with a HANA flavor that are neither
+// DELETED nor ERROR. Servers with a KVM HANA flavor are dropped in Go.
+func (k *VMwareResourceCommitmentsKPI) getRunningHanaServers() ([]nova.Server, error) {
+	query := `
+		SELECT * FROM ` + nova.Server{}.TableName() + `
+		WHERE status NOT IN ('DELETED', 'ERROR')
+		  AND flavor_name LIKE 'hana_%'
+	`
+
+	var rows []nova.Server
+	_, err := k.DB.Select(&rows, query)
+	if err != nil {
+		return nil, err
+	}
+
+	vmware := make([]nova.Server, 0, len(rows))
+	for _, server := range rows {
+		if isKVMFlavor(server.FlavorName) {
+			continue
+		}
+		vmware = append(vmware, server)
+	}
+	return vmware, nil
+}
+
+// collectGeneralPurpose computes and emits unused general purpose committed resources per project.
+// Unused = committed - in-use; results that are zero or negative are not emitted.
+func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) {
+	commitments, err := k.getGeneralPurposeCommitments()
+	if err != nil {
+		slog.Error("vmware_resource_commitments: failed to load gp commitments", "err", err)
+		return
+	}
+	servers, err := k.getGeneralPurposeServers()
+	if err != nil {
+		slog.Error("vmware_resource_commitments: failed to load gp servers", "err", err)
+		return
+	}
+
+	// Both sides of the subtraction are bucketed by project, availability zone and resource.
+	type usageKey struct{ projectID, az, resource string }
+
+	// Sum up what was committed; cores count as vCPUs, ram is converted to bytes.
+	committed := make(map[usageKey]float64)
+	for _, commitment := range commitments {
+		project, zone := commitment.ProjectID, commitment.AvailabilityZone
+		if commitment.ResourceName == "cores" {
+			committed[usageKey{project, zone, "cpu"}] += float64(commitment.Amount)
+			continue
+		}
+		if commitment.ResourceName != "ram" {
+			continue
+		}
+		ramBytes, err := bytesFromUnit(float64(commitment.Amount), commitment.Unit)
+		if err != nil {
+			slog.Warn("vmware_resource_commitments: unknown ram unit", "unit", commitment.Unit, "err", err)
+			continue
+		}
+		committed[usageKey{project, zone, "ram"}] += ramBytes
+	}
+
+	// Sum up what the servers consume according to their flavor.
+	used := make(map[usageKey]float64)
+	for _, server := range servers {
+		flavor, found := flavorsByName[server.FlavorName]
+		if !found {
+			slog.Warn("vmware_resource_commitments: gp flavor not found", "flavor", server.FlavorName)
+			continue
+		}
+		project, zone := server.TenantID, server.OSEXTAvailabilityZone
+		used[usageKey{project, zone, "cpu"}] += float64(flavor.VCPUs)
+		// The flavor RAM value is scaled by 1024*1024 to obtain bytes.
+		used[usageKey{project, zone, "ram"}] += float64(flavor.RAM) * 1024 * 1024
+	}
+
+	// Only buckets that have a commitment are reported; usage without a
+	// commitment never shows up here.
+	for key, committedAmount := range committed {
+		unused := committedAmount - used[key]
+		if unused <= 0 {
+			continue
+		}
+		labelValues := []string{key.az, key.resource, key.projectID}
+		ch <- prometheus.MustNewConstMetric(
+			k.unusedGeneralPurposeCommitmentsPerProject,
+			prometheus.GaugeValue,
+			unused,
+			labelValues...,
+		)
+	}
+}
+
+// collectHana computes and emits unused committed HANA instance resources per project.
+// Each HANA instance commitment is compared against running servers; the remainder is
+// translated to cpu/ram/disk capacity using the flavor spec.
+func (k *VMwareResourceCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) {
+	commitments, err := k.getHanaInstanceCommitments()
+	if err != nil {
+		slog.Error("vmware_resource_commitments: failed to load hana commitments", "err", err)
+		return
+	}
+	servers, err := k.getRunningHanaServers()
+	if err != nil {
+		slog.Error("vmware_resource_commitments: failed to load hana servers", "err", err)
+		return
+	}
+
+	// Count the running servers per project, flavor and availability zone.
+	type runningKey struct{ projectID, flavorName, az string }
+	runningCount := make(map[runningKey]uint64, len(servers))
+	for i := range servers {
+		server := &servers[i]
+		runningCount[runningKey{
+			projectID:  server.TenantID,
+			flavorName: server.FlavorName,
+			az:         server.OSEXTAvailabilityZone,
+		}]++
+	}
+
+	// Sum the committed instances per project, flavor, availability zone and CPU
+	// architecture. The flavor name is the resource name without "instances_".
+	type instanceKey struct{ projectID, flavorName, az, cpuArch string }
+	committedCount := make(map[instanceKey]uint64)
+	for _, commitment := range commitments {
+		flavorName := strings.TrimPrefix(commitment.ResourceName, "instances_")
+		if !isKVMFlavor(flavorName) {
+			committedCount[instanceKey{
+				projectID:  commitment.ProjectID,
+				flavorName: flavorName,
+				az:         commitment.AvailabilityZone,
+				cpuArch:    flavorCPUArchitecture(flavorName),
+			}] += commitment.Amount
+		}
+	}
+
+	// Translate the instances that are committed but not running into capacity.
+	type capacityKey struct{ projectID, az, cpuArch, resource string }
+	capacity := make(map[capacityKey]float64)
+	for ik, committed := range committedCount {
+		running := runningCount[runningKey{ik.projectID, ik.flavorName, ik.az}]
+		// Compare before subtracting: the counters are unsigned.
+		if running >= committed {
+			continue
+		}
+		flavor, found := flavorsByName[ik.flavorName]
+		if !found {
+			slog.Warn("vmware_resource_commitments: hana flavor not found", "flavor", ik.flavorName)
+			continue
+		}
+		instances := float64(committed - running)
+		add := func(resource string, amount float64) {
+			capacity[capacityKey{ik.projectID, ik.az, ik.cpuArch, resource}] += amount
+		}
+		add("cpu", instances*float64(flavor.VCPUs))
+		add("ram", instances*float64(flavor.RAM)*1024*1024)
+		add("disk", instances*float64(flavor.Disk)*1024*1024*1024)
+	}
+
+	for key, amount := range capacity {
+		labelValues := []string{key.az, key.cpuArch, key.resource, key.projectID}
+		ch <- prometheus.MustNewConstMetric(
+			k.unusedHanaCommittedResourcesPerProject,
+			prometheus.GaugeValue,
+			amount,
+			labelValues...,
+		)
+	}
+}
diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go
new file mode 100644
index 000000000..6616dc558
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_resource_commitments_test.go
@@ -0,0 +1,523 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package infrastructure
+
+import (
+	"testing"
+
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+	testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+	"github.com/cobaltcore-dev/cortex/pkg/conf"
+	"github.com/prometheus/client_golang/prometheus"
+	prometheusgo "github.com/prometheus/client_model/go"
+)
+
+// setupResourceCommitmentsDB creates a test database with the commitment, server and
+// flavor tables. It returns the database together with the function that closes the
+// test environment and fails the test when the tables cannot be created.
+func setupResourceCommitmentsDB(t *testing.T) (testDB *db.DB, cleanup func()) {
+	t.Helper()
+
+	env := testlibDB.SetupDBEnv(t)
+	testDB = &db.DB{DbMap: env.DbMap}
+
+	err := testDB.CreateTable(
+		testDB.AddTable(limes.Commitment{}),
+		testDB.AddTable(nova.Server{}),
+		testDB.AddTable(nova.Flavor{}),
+	)
+	if err != nil {
+		t.Fatalf("failed to create tables: %v", err)
+	}
+	return testDB, env.Close
+}
+
+// collectResourceCommitmentsMetrics runs the KPI and returns all emitted metrics keyed by
+// "metricName|az|cpu_architecture|resource|project_id". GP metrics have an empty cpu_architecture
+// segment since the descriptor does not include that label.
+func collectResourceCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]float64 {
+	t.Helper()
+
+	kpi := &VMwareResourceCommitmentsKPI{}
+	err := kpi.Init(testDB, nil, conf.NewRawOpts("{}"))
+	if err != nil {
+		t.Fatalf("failed to init KPI: %v", err)
+	}
+
+	// Collect runs synchronously, so the buffer has to hold every emitted metric.
+	metrics := make(chan prometheus.Metric, 200)
+	kpi.Collect(metrics)
+	close(metrics)
+
+	// Label order of the key; labels a metric does not carry yield an empty segment.
+	keyLabels := []string{"availability_zone", "cpu_architecture", "resource", "project_id"}
+
+	values := make(map[string]float64)
+	for metric := range metrics {
+		var written prometheusgo.Metric
+		if err := metric.Write(&written); err != nil {
+			t.Fatalf("failed to write metric: %v", err)
+		}
+
+		labels := make(map[string]string)
+		for _, pair := range written.Label {
+			labels[pair.GetName()] = pair.GetValue()
+		}
+
+		key := getMetricName(metric.Desc().String())
+		for _, label := range keyLabels {
+			key += "|" + labels[label]
+		}
+		values[key] = written.GetGauge().GetValue()
+	}
+	return values
+}
+
+// gpKey returns the key under which a general-purpose metric is expected in the result map.
+// The cpu_architecture segment stays empty because the GP metric descriptor omits that label.
+func gpKey(az, resource, projectID string) string {
+	const prefix = "cortex_vmware_commitments_general_purpose|"
+	return prefix + az + "||" + resource + "|" + projectID
+}
+
+// hKey builds the expected map key for a HANA metric.
+func hKey(az, cpuArch, resource, projectID string) string { + return "cortex_vmware_commitments_hana_resources|" + az + "|" + cpuArch + "|" + resource + "|" + projectID +} + +func TestVMwareResourceCommitmentsKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + kpi := &VMwareResourceCommitmentsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} +func TestVMwareResourceCommitmentsKPI_Collect_GeneralPurpose(t *testing.T) { + tests := []struct { + name string + commitments []limes.Commitment + servers []nova.Server + flavors []nova.Flavor + want map[string]float64 + }{ + { + name: "no commitments produces no metrics", + want: map[string]float64{}, + }, + { + name: "fully unused cores commitment", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 10, + }, + }, + { + name: "fully unused ram commitment with MiB unit", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 1024, Unit: "MiB", Status: "confirmed", ProjectID: "p1"}, + }, + want: map[string]float64{ + gpKey("az1", "ram", "p1"): 1024 * 1024 * 1024, + }, + }, + { + name: "fully unused ram commitment with GiB unit", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 2, Unit: "GiB", Status: "confirmed", ProjectID: "p1"}, + }, + want: map[string]float64{ + gpKey("az1", "ram", "p1"): 2 * 1024 * 1024 * 1024, + }, + }, + { + name: "partial cpu usage reduces unused", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: 
"confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + {ID: "s2", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "small", VCPUs: 3, RAM: 0, Disk: 0}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 4, // 10 - 2×3 = 4 + }, + }, + { + name: "fully covered cpu produces no metric", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 4, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "small", VCPUs: 4, RAM: 0, Disk: 0}, + }, + want: map[string]float64{}, + }, + { + name: "over-used cpu produces no metric", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "large", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "large", VCPUs: 8, RAM: 0, Disk: 0}, + }, + want: map[string]float64{}, + }, + { + name: "hana servers not counted against gp commitments", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_small", VCPUs: 8, RAM: 0, Disk: 0}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 10, + }, + }, + { + name: "kvm servers not counted 
against gp commitments", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "m1_k_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "m1_k_small", VCPUs: 4, RAM: 0, Disk: 0}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 10, + }, + }, + { + name: "DELETED and ERROR servers excluded from usage", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "DELETED"}, + {ID: "s2", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ERROR"}, + {ID: "s3", TenantID: "p1", FlavorName: "small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "small", VCPUs: 2, RAM: 0, Disk: 0}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 8, // only 1 ACTIVE × 2 subtracted + }, + }, + { + name: "guaranteed commitments counted", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 5, Status: "guaranteed", ProjectID: "p1"}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 5, + }, + }, + { + name: "pending commitments excluded", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 100, Status: "pending", ProjectID: "p1"}, + }, + want: map[string]float64{}, + }, + { + name: "non-compute service type excluded", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "network", ResourceName: "cores", 
AvailabilityZone: "az1", Amount: 100, Status: "confirmed", ProjectID: "p1"}, + }, + want: map[string]float64{}, + }, + { + name: "multiple commitments per project and AZ summed", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 10, Status: "confirmed", ProjectID: "p1"}, + {ID: 2, UUID: "c2", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 5, Status: "confirmed", ProjectID: "p1"}, + {ID: 3, UUID: "c3", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az2", Amount: 20, Status: "confirmed", ProjectID: "p1"}, + {ID: 4, UUID: "c4", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 8, Status: "confirmed", ProjectID: "p2"}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 15, + gpKey("az2", "cpu", "p1"): 20, + gpKey("az1", "cpu", "p2"): 8, + }, + }, + { + name: "cpu and ram unused reported separately", + commitments: []limes.Commitment{ + {ID: 1, UUID: "c1", ServiceType: "compute", ResourceName: "cores", AvailabilityZone: "az1", Amount: 8, Status: "confirmed", ProjectID: "p1"}, + {ID: 2, UUID: "c2", ServiceType: "compute", ResourceName: "ram", AvailabilityZone: "az1", Amount: 512, Unit: "MiB", Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "medium", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "medium", VCPUs: 2, RAM: 256, Disk: 0}, + }, + want: map[string]float64{ + gpKey("az1", "cpu", "p1"): 6, // 8 - 1×2 + gpKey("az1", "ram", "p1"): (512 - 256) * 1024 * 1024, // 512MiB - 256MB (flavor.RAM is in MB) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testDB, cleanup := setupResourceCommitmentsDB(t) + defer cleanup() + + var rows []any + for i := range tt.commitments { + rows = append(rows, &tt.commitments[i]) + } + for i := range tt.servers { + 
rows = append(rows, &tt.servers[i]) + } + for i := range tt.flavors { + rows = append(rows, &tt.flavors[i]) + } + if len(rows) > 0 { + if err := testDB.Insert(rows...); err != nil { + t.Fatalf("failed to insert test data: %v", err) + } + } + + got := collectResourceCommitmentsMetrics(t, testDB) + + if len(got) != len(tt.want) { + t.Errorf("expected %d metrics, got %d: %v", len(tt.want), len(got), got) + } + for k, wantVal := range tt.want { + gotVal, ok := got[k] + if !ok { + t.Errorf("missing metric %q", k) + continue + } + if gotVal != wantVal { + t.Errorf("metric %q: expected %f, got %f", k, wantVal, gotVal) + } + } + }) + } +} + +func TestVMwareResourceCommitmentsKPI_Collect_HANA(t *testing.T) { + tests := []struct { + name string + commitments []limes.Commitment + servers []nova.Server + flavors []nova.Flavor + want map[string]float64 + }{ + { + name: "no commitments produces no metrics", + want: map[string]float64{}, + }, + { + name: "fully unused hana instance commitment", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100}, + }, + want: map[string]float64{ + hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128, + hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024, + }, + }, + { + name: "partial hana usage reduces unused instances", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600", AvailabilityZone: "az1", Amount: 3, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "hana_c128_m1600", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: 
"hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100}, + }, + want: map[string]float64{ + hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128, + hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024, + }, + }, + { + name: "fully used hana produces no metric", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + {ID: "s2", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, + }, + want: map[string]float64{}, + }, + { + name: "over-used hana produces no metric", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + {ID: "s2", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, + }, + want: map[string]float64{}, + }, + { + name: "sapphire-rapids arch from _v2 suffix", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c256_m3200_v2", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p1"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_c256_m3200_v2", VCPUs: 256, RAM: 3276800, Disk: 200}, + }, + want: map[string]float64{ + hKey("az1", "sapphire-rapids", "cpu", "p1"): 256, + 
hKey("az1", "sapphire-rapids", "ram", "p1"): 3276800 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "disk", "p1"): 200 * 1024 * 1024 * 1024, + }, + }, + { + name: "cascade-lake and sapphire-rapids aggregated separately", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, + {ID: 2, UUID: "h2", ServiceType: "compute", ResourceName: "instances_hana_c128_m1600_v2", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p1"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_c128_m1600", VCPUs: 128, RAM: 1638400, Disk: 100}, + {ID: "f2", Name: "hana_c128_m1600_v2", VCPUs: 128, RAM: 1638400, Disk: 100}, + }, + want: map[string]float64{ + hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 128, + hKey("az1", "cascade-lake", "ram", "p1"): 2 * 1638400 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p1"): 2 * 100 * 1024 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "cpu", "p1"): 1 * 128, + hKey("az1", "sapphire-rapids", "ram", "p1"): 1 * 1638400 * 1024 * 1024, + hKey("az1", "sapphire-rapids", "disk", "p1"): 1 * 100 * 1024 * 1024 * 1024, + }, + }, + { + name: "kvm hana commitments excluded", + commitments: []limes.Commitment{ + // hana_k_large is a KVM HANA flavor — must be filtered out + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_k_large", AvailabilityZone: "az1", Amount: 5, Status: "confirmed", ProjectID: "p1"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_k_large", VCPUs: 64, RAM: 819200, Disk: 50}, + }, + want: map[string]float64{}, + }, + { + name: "DELETED and ERROR hana servers excluded from running count", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 3, Status: "confirmed", ProjectID: "p1"}, + }, + servers: []nova.Server{ + {ID: "s1", TenantID: "p1", 
FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "DELETED"}, + {ID: "s2", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ERROR"}, + {ID: "s3", TenantID: "p1", FlavorName: "hana_small", OSEXTAvailabilityZone: "az1", Status: "ACTIVE"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, + }, + want: map[string]float64{ + hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 64, // 3 committed - 1 ACTIVE = 2 unused + hKey("az1", "cascade-lake", "ram", "p1"): 2 * 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p1"): 2 * 50 * 1024 * 1024 * 1024, + }, + }, + { + name: "guaranteed hana commitments counted", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 1, Status: "guaranteed", ProjectID: "p1"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, + }, + want: map[string]float64{ + hKey("az1", "cascade-lake", "cpu", "p1"): 64, + hKey("az1", "cascade-lake", "ram", "p1"): 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p1"): 50 * 1024 * 1024 * 1024, + }, + }, + { + name: "unknown flavor is skipped without panic", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_nonexistent", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, + }, + want: map[string]float64{}, + }, + { + name: "multiple projects and AZs aggregated per bucket", + commitments: []limes.Commitment{ + {ID: 1, UUID: "h1", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 2, Status: "confirmed", ProjectID: "p1"}, + {ID: 2, UUID: "h2", ServiceType: "compute", ResourceName: "instances_hana_small", AvailabilityZone: "az2", Amount: 3, Status: "confirmed", ProjectID: "p1"}, + {ID: 3, UUID: "h3", ServiceType: "compute", 
ResourceName: "instances_hana_small", AvailabilityZone: "az1", Amount: 1, Status: "confirmed", ProjectID: "p2"}, + }, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_small", VCPUs: 64, RAM: 819200, Disk: 50}, + }, + want: map[string]float64{ + hKey("az1", "cascade-lake", "cpu", "p1"): 2 * 64, + hKey("az1", "cascade-lake", "ram", "p1"): 2 * 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p1"): 2 * 50 * 1024 * 1024 * 1024, + hKey("az2", "cascade-lake", "cpu", "p1"): 3 * 64, + hKey("az2", "cascade-lake", "ram", "p1"): 3 * 819200 * 1024 * 1024, + hKey("az2", "cascade-lake", "disk", "p1"): 3 * 50 * 1024 * 1024 * 1024, + hKey("az1", "cascade-lake", "cpu", "p2"): 1 * 64, + hKey("az1", "cascade-lake", "ram", "p2"): 1 * 819200 * 1024 * 1024, + hKey("az1", "cascade-lake", "disk", "p2"): 1 * 50 * 1024 * 1024 * 1024, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testDB, cleanup := setupResourceCommitmentsDB(t) + defer cleanup() + + var rows []any + for i := range tt.commitments { + rows = append(rows, &tt.commitments[i]) + } + for i := range tt.servers { + rows = append(rows, &tt.servers[i]) + } + for i := range tt.flavors { + rows = append(rows, &tt.flavors[i]) + } + if len(rows) > 0 { + if err := testDB.Insert(rows...); err != nil { + t.Fatalf("failed to insert test data: %v", err) + } + } + + got := collectResourceCommitmentsMetrics(t, testDB) + + if len(got) != len(tt.want) { + t.Errorf("expected %d metrics, got %d: %v", len(tt.want), len(got), got) + } + for k, wantVal := range tt.want { + gotVal, ok := got[k] + if !ok { + t.Errorf("missing metric %q", k) + continue + } + if gotVal != wantVal { + t.Errorf("metric %q: expected %f, got %f", k, wantVal, gotVal) + } + } + }) + } +} diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index c1a2b336c..19726a488 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -23,9 +23,9 
@@ var supportedKPIs = map[string]plugins.KPI{ "vm_life_span_kpi": &compute.VMLifeSpanKPI{}, "vm_commitments_kpi": &compute.VMCommitmentsKPI{}, "vm_faults_kpi": &compute.VMFaultsKPI{}, - "vmware_commitments_kpi": &compute.VMwareResourceCommitmentsKPI{}, - "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, + "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, + "vmware_resource_commitments_kpi": &infrastructure.VMwareResourceCommitmentsKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},