From f4acf63054eb7b45bb6dc904729946fa8952f04b Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 11:46:55 +0200 Subject: [PATCH 01/11] Add vm state kpi --- .../plugins/openstack/nova/nova_types.go | 46 ++++ .../kpis/plugins/compute/vm_state.go | 119 +++++++++ .../kpis/plugins/compute/vm_state_test.go | 240 ++++++++++++++++++ 3 files changed, 405 insertions(+) create mode 100644 internal/knowledge/kpis/plugins/compute/vm_state.go create mode 100644 internal/knowledge/kpis/plugins/compute/vm_state_test.go diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go index 322b05d69..2633c76fc 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go @@ -285,6 +285,52 @@ type Flavor struct { ExtraSpecs string `json:"extra_specs" db:"extra_specs"` } +// FlavorHypervisorType is a type alias for a string to represent the specific +// values the hypervisor type contained in flavor extra specs may have. +type FlavorHypervisorType string + +const ( + // FlavorHypervisorTypeQEMU maps a flavor for QEMU/KVM hypervisors. + FlavorHypervisorTypeQEMU FlavorHypervisorType = "QEMU" + // FlavorHypervisorTypeCH maps flavors to Cloud-Hypervisor/KVM hypervisors. + FlavorHypervisorTypeCH FlavorHypervisorType = "CH" + // FlavorHypervisorTypeVMware maps flavors to VMware hypervisors. + FlavorHypervisorTypeVMware FlavorHypervisorType = "VMware vCenter Server" + // FlavorHypervisorTypeIronic maps flavors to Ironic baremetal instances. + FlavorHypervisorTypeIronic FlavorHypervisorType = "Ironic" + // FlavorHypervisorTypeOther is a flavor for which the hypervisor type + // is set in the extra specs but has an unknown value. + FlavorHypervisorTypeOther FlavorHypervisorType = "Other" + // FlavorHypervisorTypeUnspecified is a flavor for which the hypervisor type + // is not set in the extra specs. + FlavorHypervisorTypeUnspecified FlavorHypervisorType = "Unspecified" +) + +// GetHypervisorType returns the hypervisor type of the flavor based on its +// extra specs. +func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) { + var extraSpecs map[string]string + if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil { + return "", err // Return an error if the extra specs cannot be parsed. + } + hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"] + if !ok { + return FlavorHypervisorTypeUnspecified, nil + } + switch hypervisorType { + case string(FlavorHypervisorTypeQEMU): + return FlavorHypervisorTypeQEMU, nil + case string(FlavorHypervisorTypeCH): + return FlavorHypervisorTypeCH, nil + case string(FlavorHypervisorTypeVMware): + return FlavorHypervisorTypeVMware, nil + case string(FlavorHypervisorTypeIronic): + return FlavorHypervisorTypeIronic, nil + default: + return FlavorHypervisorTypeOther, nil + } +} + // Custom unmarshaler for OpenStackFlavor to handle nested JSON. func (f *Flavor) UnmarshalJSON(data []byte) error { type Alias Flavor diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_state.go new file mode 100644 index 000000000..53e3d2fc4 --- /dev/null +++ b/internal/knowledge/kpis/plugins/compute/vm_state.go @@ -0,0 +1,119 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var vmStateKPIlogger = ctrl.Log.WithName("vm-state-kpi") + +// This kpi monitors the current state of vms, i.e. how many vms are running, +// stopped, paused, etc. It also exposes additional labels such as the vm's +// hypervisor type which can be used to define alerts on non-running vms. +type VMStateKPI struct { + // Common base for all KPIs that provides standard functionality. + plugins.BaseKPI[struct{}] // No options passed through yaml config + + // Current state of the VM, e.g. running, stopped, paused, etc. + vmStateDesc *prometheus.Desc +} + +// GetName returns a unique name for this kpi that is used for registration +// and configuration. +func (VMStateKPI) GetName() string { return "vm_state_kpi" } + +// Init initializes the kpi, e.g. by creating the necessary Prometheus +// descriptors. The base kpi is also initialized with the provided database, +// client and options. +func (k *VMStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { + if err := k.BaseKPI.Init(db, client, opts); err != nil { + return err + } + k.vmStateDesc = prometheus.NewDesc("cortex_vm_state", + "Current state of the VM, e.g. running, stopped, paused, etc.", + []string{"az", "hvtype", "state"}, nil, + ) + return nil +} + +// Describe sends the descriptor of this kpi to the provided channel. This is +// used by Prometheus to know which metrics this kpi exposes. +func (k *VMStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmStateDesc } + +// Collect collects the current state of vms from the database and sends it as +// Prometheus metrics to the provided channel. +func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) { + vmStateKPIlogger.Info("collecting vm state kpi") + + // This can happen when no datasource is provided that connects to a database. + if k.DB == nil { + vmStateKPIlogger.Error(nil, "no database connection, cannot collect vm state kpi") + return + } + + // Get all vms with their current state from the database. + var servers []nova.Server + nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName()) + if err != nil { + vmStateKPIlogger.Error(err, "failed to query servers from database") + return + } + vmStateKPIlogger.Info("queried servers from database", "nServers", nServers) + + // Get all flavors from the database to map them to the vms. + var flavors []nova.Flavor + nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()) + if err != nil { + vmStateKPIlogger.Error(err, "failed to query flavors from database") + return + } + vmStateKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors) + + flavorsByName := make(map[string]nova.Flavor, len(flavors)) + for _, flavor := range flavors { + flavorsByName[flavor.Name] = flavor + } + + type labels struct { + az string + hvtype string + state string + } + counts := make(map[labels]float64) + + // For each vm, get its hypervisor type and count up. + for _, server := range servers { + flavor, ok := flavorsByName[server.FlavorName] + if !ok { + vmStateKPIlogger.Error(nil, "flavor not found for server", "server", + server.ID, "flavor", server.FlavorName) + continue + } + hypervisorType, err := flavor.GetHypervisorType() + if err != nil { + vmStateKPIlogger.Error(err, "failed to get hypervisor type for server", + "server", server.ID, "flavor", flavor.Name) + continue + } + key := labels{ + az: server.OSEXTAvailabilityZone, + hvtype: string(hypervisorType), + state: server.Status, + } + counts[key]++ + } + + // Emit metrics to prometheus. + for key, count := range counts { + ch <- prometheus.MustNewConstMetric(k.vmStateDesc, prometheus.GaugeValue, count, + key.az, key.hvtype, key.state) + } +} diff --git a/internal/knowledge/kpis/plugins/compute/vm_state_test.go b/internal/knowledge/kpis/plugins/compute/vm_state_test.go new file mode 100644 index 000000000..57196e1de --- /dev/null +++ b/internal/knowledge/kpis/plugins/compute/vm_state_test.go @@ -0,0 +1,240 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" +) + +func TestVMStateKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + kpi := &VMStateKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} + +func TestVMStateKPI_Collect(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + mockData := []any{ + // Servers in different AZs, states, and with different flavors + &nova.Server{ + ID: "server-1", + FlavorName: "m1.small", + OSEXTAvailabilityZone: "az1", + Status: "ACTIVE", + }, + &nova.Server{ + ID: "server-2", + FlavorName: "m1.small", + OSEXTAvailabilityZone: "az1", + Status: "ACTIVE", + }, + &nova.Server{ + ID: "server-3", + FlavorName: "m1.small", + OSEXTAvailabilityZone: "az1", + Status: "STOPPED", + }, + &nova.Server{ + ID: "server-4", + FlavorName: "m1.vmware", + OSEXTAvailabilityZone: "az2", + Status: "ACTIVE", + }, + &nova.Server{ + ID: "server-5", + FlavorName: "m1.generic", + OSEXTAvailabilityZone: "az1", + Status: "PAUSED", + }, + // Flavors with different hypervisor types + &nova.Flavor{ + ID: "flavor-1", + Name: "m1.small", + ExtraSpecs: `{"capabilities:hypervisor_type": "QEMU"}`, + }, + &nova.Flavor{ + ID: "flavor-2", + Name: "m1.vmware", + ExtraSpecs: `{"capabilities:hypervisor_type": "VMware vCenter Server"}`, + }, + &nova.Flavor{ + ID: "flavor-3", + Name: "m1.generic", + ExtraSpecs: `{}`, + }, + } + + if err := testDB.Insert(mockData...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMStateKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + type vmStateMetric struct { + az string + hvtype string + state string + count float64 + } + + metrics := make(map[string]vmStateMetric) + for metric := range ch { + var m prometheusgo.Metric + if err := metric.Write(&m); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + labels := make(map[string]string) + for _, label := range m.Label { + labels[label.GetName()] = label.GetValue() + } + key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + metrics[key] = vmStateMetric{ + az: labels["az"], + hvtype: labels["hvtype"], + state: labels["state"], + count: m.GetGauge().GetValue(), + } + } + + expectedMetrics := map[string]vmStateMetric{ + "az1|QEMU|ACTIVE": { + az: "az1", + hvtype: "QEMU", + state: "ACTIVE", + count: 2, + }, + "az1|QEMU|STOPPED": { + az: "az1", + hvtype: "QEMU", + state: "STOPPED", + count: 1, + }, + "az2|VMware vCenter Server|ACTIVE": { + az: "az2", + hvtype: "VMware vCenter Server", + state: "ACTIVE", + count: 1, + }, + "az1|Unspecified|PAUSED": { + az: "az1", + hvtype: "Unspecified", + state: "PAUSED", + count: 1, + }, + } + + if len(expectedMetrics) != len(metrics) { + t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics)) + } + + for key, expected := range expectedMetrics { + actual, ok := metrics[key] + if !ok { + t.Errorf("expected metric %q not found", key) + continue + } + if expected != actual { + t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) + } + } +} + +func TestVMStateKPI_Collect_MissingFlavor(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + mockData := []any{ + &nova.Server{ + ID: "server-1", + FlavorName: "m1.existing", + OSEXTAvailabilityZone: "az1", + Status: "ACTIVE", + }, + &nova.Server{ + ID: "server-2", + FlavorName: "m1.missing", + OSEXTAvailabilityZone: "az1", + Status: "ACTIVE", + }, + &nova.Flavor{ + ID: "flavor-1", + Name: "m1.existing", + ExtraSpecs: `{}`, + }, + } + + if err := testDB.Insert(mockData...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMStateKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + var count int + for range ch { + count++ + } + if count != 1 { + t.Errorf("expected 1 metric (missing flavor should be skipped), got %d", count) + } +} + +func TestVMStateKPI_Collect_NoDB(t *testing.T) { + kpi := &VMStateKPI{} + if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) // Should not panic + close(ch) + + var count int + for range ch { + count++ + } + if count != 0 { + t.Errorf("expected 0 metrics when no DB, got %d", count) + } +} From 4ba03bb75444fb1ef67eb56b4b3a525a87268dbb Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 13:16:49 +0200 Subject: [PATCH 02/11] Add kpi to nova bundle --- helm/bundles/cortex-nova/templates/kpis.yaml | 16 ++++++++++++++++ internal/knowledge/kpis/supported_kpis.go | 1 + 2 files changed, 17 insertions(+) diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index af01c10c5..62ff7f499 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -110,6 +110,22 @@ spec: --- apiVersion: cortex.cloud/v1alpha1 kind: KPI +metadata: + name: vm-state +spec: + schedulingDomain: nova + impl: vm_state_kpi + dependencies: + datasources: + - name: nova-servers + - name: nova-flavors + description: | + This kpi monitors the current state of vms, i.e. how many vms are running, + stopped, paused, etc. It also exposes additional labels such as the vm's + hypervisor type which can be used to define alerts on non-running vms. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: KPI metadata: name: cortex-nova-datasource-state spec: diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index 274c5ace5..b469790cc 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -21,6 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{ "vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{}, "vm_life_span_kpi": &compute.VMLifeSpanKPI{}, "vm_commitments_kpi": &compute.VMCommitmentsKPI{}, + "vm_state_kpi": &compute.VMStateKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{}, From b613bdc7dbd692442945202c90a36901bad89304 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 13:17:00 +0200 Subject: [PATCH 03/11] Support syncing server faults from nova --- .../plugins/openstack/nova/nova_types.go | 74 +++++++++++++++---- .../compute/libvirt_domain_cpu_steal_pct.sql | 2 +- .../libvirt_domain_cpu_steal_pct_test.go | 2 +- .../plugins/compute/vm_host_residency.sql | 2 +- .../plugins/compute/vm_life_span.sql | 2 +- .../compute/vrops_hostsystem_resolver.sql | 2 +- .../compute/vrops_project_noisiness.sql | 2 +- .../reservations/commitments/controller.go | 2 +- tools/visualize-reservations/main.go | 5 +- 9 files changed, 71 insertions(+), 22 deletions(-) diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go index 2633c76fc..70e4fb02e 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go @@ -108,9 +108,24 @@ type Server struct { OSEXTSTSVmState string `json:"OS-EXT-STS:vm_state" db:"os_ext_sts_vm_state"` OSEXTSTSPowerState int `json:"OS-EXT-STS:power_state" db:"os_ext_sts_power_state"` - // From nested JSON + // From nested server.flavor JSON FlavorName string `json:"-" db:"flavor_name"` + // From nested server.fault JSON + + // The error response code. + FaultCode *uint `json:"-" db:"fault_code"` + // The date and time when the exception was raised. The date and time stamp + // format is ISO 8601 (CCYY-MM-DDThh:mm:ss±hh:mm). For example, + // 2015-08-27T09:49:58-05:00. The ±hh:mm value if included, is the time zone + // as an offset from UTC. In the previous example, the offset value is -05:00. + FaultCreated *string `json:"-" db:"fault_created"` + // The error message. + FaultMessage *string `json:"-" db:"fault_message"` + // The stack trace. It is available if the response code is not 500 or you + // have the administrator privilege. + FaultDetails *string `json:"-" db:"fault_details"` + // Note: there are some more fields that are omitted. To include them again, add // custom unmarshalers and marshalers for the struct below. } @@ -119,7 +134,8 @@ type Server struct { func (s *Server) UnmarshalJSON(data []byte) error { type Alias Server aux := &struct { - Flavor json.RawMessage `json:"flavor"` + Flavor json.RawMessage `json:"flavor"` + Fault *json.RawMessage `json:"fault,omitempty"` *Alias }{ Alias: (*Alias)(s), @@ -135,31 +151,63 @@ func (s *Server) UnmarshalJSON(data []byte) error { return err } s.FlavorName = flavor.Name + var fault struct { + Code uint `json:"code"` + Created string `json:"created"` + Message string `json:"message"` + Details *string `json:"details,omitempty"` + } + if aux.Fault != nil { + if err := json.Unmarshal(*aux.Fault, &fault); err != nil { + return err + } + s.FaultCode = &fault.Code + s.FaultCreated = &fault.Created + s.FaultMessage = &fault.Message + s.FaultDetails = fault.Details + } return nil } // Custom marshaler for OpenStackServer to handle nested JSON. func (s *Server) MarshalJSON() ([]byte, error) { type Alias Server + type flavor struct { + // Starting in microversion 2.47, "id" was removed... + Name string `json:"original_name"` + } + flavorVal := flavor{ + Name: s.FlavorName, + } + type fault struct { + Code uint `json:"code"` + Created string `json:"created"` + Message string `json:"message"` + Details *string `json:"details,omitempty"` + } + var faultVal *fault + if s.FaultCode != nil && s.FaultCreated != nil && s.FaultMessage != nil { + faultVal = &fault{ + Code: *s.FaultCode, + Created: *s.FaultCreated, + Message: *s.FaultMessage, + Details: s.FaultDetails, + } + } aux := &struct { - Flavor struct { - // Starting in microversion 2.47, "id" was removed... - Name string `json:"original_name"` - } `json:"flavor"` + Flavor flavor `json:"flavor"` + Fault *fault `json:"fault,omitempty"` *Alias }{ - Alias: (*Alias)(s), - Flavor: struct { - Name string `json:"original_name"` - }{ - Name: s.FlavorName, - }, + Alias: (*Alias)(s), + Flavor: flavorVal, + Fault: faultVal, } return json.Marshal(aux) } // Table in which the openstack model is stored. -func (Server) TableName() string { return "openstack_servers" } +func (Server) TableName() string { return "openstack_servers_v2" } // Index for the openstack model. func (Server) Indexes() map[string][]string { return nil } diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql index ea2b9c97a..ab3c7b8a7 100644 --- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql +++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql @@ -3,6 +3,6 @@ SELECT os.os_ext_srv_attr_host AS host, MAX(value) AS max_steal_time_pct FROM kvm_libvirt_domain_metrics kvm -JOIN openstack_servers os ON os.os_ext_srv_attr_instance_name = kvm.domain +JOIN openstack_servers_v2 os ON os.os_ext_srv_attr_instance_name = kvm.domain WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL GROUP BY os.os_ext_srv_attr_host, os.id; \ No newline at end of file diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go index b9f84b188..bc28218b5 100644 --- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go +++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go @@ -56,7 +56,7 @@ func TestLibvirtDomainCPUStealPctExtractor_Extract(t *testing.T) { t.Fatalf("expected no error, got %v", err) } - // Insert mock data into the openstack_servers table + // Insert mock data into the openstack servers table servers := []any{ &nova.Server{ ID: "uuid-1", diff --git a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql index fff0086c4..c2b4b8846 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql @@ -21,7 +21,7 @@ WITH durations AS ( )) AS BIGINT) ) AS duration FROM openstack_migrations AS migrations - LEFT JOIN openstack_servers AS servers ON servers.id = migrations.instance_uuid + LEFT JOIN openstack_servers_v2 AS servers ON servers.id = migrations.instance_uuid LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name ) SELECT diff --git a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql index daaa0a470..1fad31536 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql @@ -13,7 +13,7 @@ running_servers AS ( EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration, COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name, false::BOOLEAN AS deleted - FROM openstack_servers servers + FROM openstack_servers_v2 servers LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name WHERE servers.created IS NOT NULL ) diff --git a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql index e2c6ad4b2..8ab0a2c70 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql @@ -3,5 +3,5 @@ SELECT DISTINCT m.hostsystem AS vrops_hostsystem, s.os_ext_srv_attr_host AS nova_compute_host FROM vrops_vm_metrics m -LEFT JOIN openstack_servers s ON m.instance_uuid = s.id +LEFT JOIN openstack_servers_v2 s ON m.instance_uuid = s.id WHERE s.os_ext_srv_attr_host IS NOT NULL; diff --git a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql index 334668b22..0b0067790 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql @@ -19,7 +19,7 @@ host_cpu_usage AS ( s.tenant_id, h.service_host, AVG(p.avg_cpu) AS avg_cpu_of_project - FROM openstack_servers s + FROM openstack_servers_v2 s JOIN vrops_vm_metrics m ON s.id = m.instance_uuid JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 9c238aeee..d38c6e1d8 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -445,7 +445,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con // Query servers from the database cache. var servers []nova.Server _, err := r.DB.Select(&servers, - "SELECT * FROM openstack_servers WHERE tenant_id = $1", + "SELECT * FROM "+nova.Server{}.TableName()+" WHERE tenant_id = $1", projectID) if err != nil { return nil, fmt.Errorf("failed to query servers from database: %w", err) diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go index 9b5880be5..90824fb6e 100644 --- a/tools/visualize-reservations/main.go +++ b/tools/visualize-reservations/main.go @@ -52,6 +52,7 @@ import ( "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" _ "github.com/lib/pq" corev1 "k8s.io/api/core/v1" @@ -1761,9 +1762,9 @@ func connectToPostgres( // Query servers with host information serverMap = make(map[string]serverInfo) - rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM openstack_servers") + rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName()) if err != nil { - fmt.Fprintf(os.Stderr, "Warning: Could not query openstack_servers: %v\n", err) + fmt.Fprintf(os.Stderr, "Warning: Could not query "+nova.Server{}.TableName()+": %v\n", err) } else { defer rows.Close() for rows.Next() { From aeb3fb30eba86812270c852b1a85161ee138c7bd Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 13:35:31 +0200 Subject: [PATCH 04/11] Rename vm_state_kpi -> vm_faults_kpi --- helm/bundles/cortex-nova/templates/kpis.yaml | 11 +- .../kpis/plugins/compute/vm_state.go | 83 +++--- .../kpis/plugins/compute/vm_state_test.go | 240 ------------------ internal/knowledge/kpis/supported_kpis.go | 2 +- 4 files changed, 58 insertions(+), 278 deletions(-) delete mode 100644 internal/knowledge/kpis/plugins/compute/vm_state_test.go diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index 62ff7f499..bc5666926 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -111,18 +111,19 @@ spec: apiVersion: cortex.cloud/v1alpha1 kind: KPI metadata: - name: vm-state + name: vm-faults spec: schedulingDomain: nova - impl: vm_state_kpi + impl: vm_faults_kpi dependencies: datasources: - name: nova-servers - name: nova-flavors description: | - This kpi monitors the current state of vms, i.e. how many vms are running, - stopped, paused, etc. It also exposes additional labels such as the vm's - hypervisor type which can be used to define alerts on non-running vms. + This kpi tracks vm faults in the datacenter. It exposes helpful information + about the faults, such as the availability zone, hypervisor type, vm state, + and error info if available. This can be used to identify issues in the + datacenter and to monitor the overall health of the vms. --- apiVersion: cortex.cloud/v1alpha1 kind: KPI diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_state.go index 53e3d2fc4..30a0832b3 100644 --- a/internal/knowledge/kpis/plugins/compute/vm_state.go +++ b/internal/knowledge/kpis/plugins/compute/vm_state.go @@ -4,6 +4,9 @@ package compute import ( + "errors" + "strconv" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" @@ -13,49 +16,50 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -var vmStateKPIlogger = ctrl.Log.WithName("vm-state-kpi") +var vmFaultsKPIlogger = ctrl.Log.WithName("vm-faults-kpi") -// This kpi monitors the current state of vms, i.e. how many vms are running, -// stopped, paused, etc. It also exposes additional labels such as the vm's -// hypervisor type which can be used to define alerts on non-running vms. -type VMStateKPI struct { - // Common base for all KPIs that provides standard functionality. - plugins.BaseKPI[struct{}] // No options passed through yaml config +// This kpi tracks vm faults in the datacenter. It exposes helpful information +// about the faults, such as the availability zone, hypervisor type, vm state, +// and error info if available. This can be used to identify issues in the +// datacenter and to monitor the overall health of the vms. +type VMFaultsKPI struct { + plugins.BaseKPI[struct{} /* No opts */] - // Current state of the VM, e.g. running, stopped, paused, etc. - vmStateDesc *prometheus.Desc + // vmFaultsDesc describes the prometheus metric for vm faults. + vmFaultsDesc *prometheus.Desc } // GetName returns a unique name for this kpi that is used for registration // and configuration. -func (VMStateKPI) GetName() string { return "vm_state_kpi" } +func (VMFaultsKPI) GetName() string { return "vm_faults_kpi" } // Init initializes the kpi, e.g. by creating the necessary Prometheus // descriptors. The base kpi is also initialized with the provided database, // client and options. -func (k *VMStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { +func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { if err := k.BaseKPI.Init(db, client, opts); err != nil { return err } - k.vmStateDesc = prometheus.NewDesc("cortex_vm_state", - "Current state of the VM, e.g. running, stopped, paused, etc.", - []string{"az", "hvtype", "state"}, nil, + k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults", + "Number of vm faults in the datacenter", + []string{"az", "hvtype", "state", "faultcode", "faultmessage"}, nil, ) return nil } // Describe sends the descriptor of this kpi to the provided channel. This is // used by Prometheus to know which metrics this kpi exposes. -func (k *VMStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmStateDesc } +func (k *VMFaultsKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmFaultsDesc } // Collect collects the current state of vms from the database and sends it as // Prometheus metrics to the provided channel. -func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) { - vmStateKPIlogger.Info("collecting vm state kpi") +func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) { + vmFaultsKPIlogger.Info("collecting metrics") // This can happen when no datasource is provided that connects to a database. if k.DB == nil { - vmStateKPIlogger.Error(nil, "no database connection, cannot collect vm state kpi") + err := errors.New("no database connection") + vmFaultsKPIlogger.Error(err, "cannot collect metric") return } @@ -63,19 +67,19 @@ func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) { var servers []nova.Server nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName()) if err != nil { - vmStateKPIlogger.Error(err, "failed to query servers from database") + vmFaultsKPIlogger.Error(err, "failed to query servers from database") return } - vmStateKPIlogger.Info("queried servers from database", "nServers", nServers) + vmFaultsKPIlogger.Info("queried servers from database", "nServers", nServers) // Get all flavors from the database to map them to the vms. var flavors []nova.Flavor nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()) if err != nil { - vmStateKPIlogger.Error(err, "failed to query flavors from database") + vmFaultsKPIlogger.Error(err, "failed to query flavors from database") return } - vmStateKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors) + vmFaultsKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors) flavorsByName := make(map[string]nova.Flavor, len(flavors)) for _, flavor := range flavors { @@ -83,37 +87,52 @@ func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) { } type labels struct { - az string - hvtype string - state string + az string + hvtype string + state string + errcode string + errmessage string } counts := make(map[labels]float64) // For each vm, get its hypervisor type and count up. + // Note: this will also expose vms that are NOT in an error state, + // but this can be useful to compare it to the number of faulty vms. for _, server := range servers { flavor, ok := flavorsByName[server.FlavorName] if !ok { - vmStateKPIlogger.Error(nil, "flavor not found for server", "server", + vmFaultsKPIlogger.Info("warning: flavor not found for server", "server", server.ID, "flavor", server.FlavorName) continue } hypervisorType, err := flavor.GetHypervisorType() if err != nil { - vmStateKPIlogger.Error(err, "failed to get hypervisor type for server", + vmFaultsKPIlogger.Error(err, "failed to get hypervisor type for server", "server", server.ID, "flavor", flavor.Name) continue } + var errcode uint = 0 + if server.FaultCode != nil { + errcode = *server.FaultCode + } + errmsg := "n/a" + if server.FaultMessage != nil { + errmsg = *server.FaultMessage + } key := labels{ - az: server.OSEXTAvailabilityZone, - hvtype: string(hypervisorType), - state: server.Status, + az: server.OSEXTAvailabilityZone, + hvtype: string(hypervisorType), + state: server.Status, + errcode: strconv.FormatUint(uint64(errcode), 10), + errmessage: errmsg, } counts[key]++ } // Emit metrics to prometheus. for key, count := range counts { - ch <- prometheus.MustNewConstMetric(k.vmStateDesc, prometheus.GaugeValue, count, - key.az, key.hvtype, key.state) + ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count, + key.az, key.hvtype, key.state, key.errcode, key.errmessage) } + vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts)) } diff --git a/internal/knowledge/kpis/plugins/compute/vm_state_test.go b/internal/knowledge/kpis/plugins/compute/vm_state_test.go deleted file mode 100644 index 57196e1de..000000000 --- a/internal/knowledge/kpis/plugins/compute/vm_state_test.go +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package compute - -import ( - "testing" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" - "github.com/cobaltcore-dev/cortex/internal/knowledge/db" - testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" - "github.com/cobaltcore-dev/cortex/pkg/conf" - "github.com/prometheus/client_golang/prometheus" - prometheusgo "github.com/prometheus/client_model/go" -) - -func TestVMStateKPI_Init(t *testing.T) { - dbEnv := testlibDB.SetupDBEnv(t) - testDB := db.DB{DbMap: dbEnv.DbMap} - defer dbEnv.Close() - kpi := &VMStateKPI{} - if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } -} - -func TestVMStateKPI_Collect(t *testing.T) { - dbEnv := testlibDB.SetupDBEnv(t) - testDB := db.DB{DbMap: dbEnv.DbMap} - defer dbEnv.Close() - if err := testDB.CreateTable( - testDB.AddTable(nova.Server{}), - testDB.AddTable(nova.Flavor{}), - ); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - mockData := []any{ - // Servers in different AZs, states, and with different flavors - &nova.Server{ - ID: "server-1", - FlavorName: "m1.small", - OSEXTAvailabilityZone: "az1", - Status: "ACTIVE", - }, - &nova.Server{ - ID: "server-2", - FlavorName: "m1.small", - OSEXTAvailabilityZone: "az1", - Status: "ACTIVE", - }, - &nova.Server{ - ID: "server-3", - FlavorName: "m1.small", - OSEXTAvailabilityZone: "az1", - Status: "STOPPED", - }, - &nova.Server{ - ID: "server-4", - FlavorName: "m1.vmware", - OSEXTAvailabilityZone: "az2", - Status: "ACTIVE", - }, - &nova.Server{ - ID: "server-5", - FlavorName: "m1.generic", - OSEXTAvailabilityZone: "az1", - Status: "PAUSED", - }, - // Flavors with different hypervisor types - &nova.Flavor{ - ID: "flavor-1", - Name: "m1.small", - ExtraSpecs: `{"capabilities:hypervisor_type": "QEMU"}`, - }, - &nova.Flavor{ - ID: "flavor-2", - Name: "m1.vmware", - ExtraSpecs: `{"capabilities:hypervisor_type": "VMware vCenter Server"}`, - }, - &nova.Flavor{ - ID: "flavor-3", - Name: "m1.generic", - ExtraSpecs: `{}`, - }, - } - - if err := testDB.Insert(mockData...); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &VMStateKPI{} - if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - type vmStateMetric struct { - az string - hvtype string - state string - count float64 - } - - metrics := make(map[string]vmStateMetric) - for metric := range ch { - var m prometheusgo.Metric - if err := metric.Write(&m); err != nil { - t.Fatalf("failed to write metric: %v", err) - } - labels := make(map[string]string) - for _, label := range m.Label { - labels[label.GetName()] = label.GetValue() - } - key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] - metrics[key] = vmStateMetric{ - az: labels["az"], - hvtype: labels["hvtype"], - state: labels["state"], - count: m.GetGauge().GetValue(), - } - } - - expectedMetrics := map[string]vmStateMetric{ - "az1|QEMU|ACTIVE": { - az: "az1", - hvtype: "QEMU", - state: "ACTIVE", - count: 2, - }, - "az1|QEMU|STOPPED": { - az: "az1", - hvtype: "QEMU", - state: "STOPPED", - count: 1, - }, - "az2|VMware vCenter Server|ACTIVE": { - az: "az2", - hvtype: "VMware vCenter Server", - state: "ACTIVE", - count: 1, - }, - "az1|Unspecified|PAUSED": { - az: "az1", - hvtype: "Unspecified", - state: "PAUSED", - count: 1, - }, - } - - if len(expectedMetrics) != len(metrics) { - t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics)) - } - - for key, expected := range expectedMetrics { - actual, ok := metrics[key] - if !ok { - t.Errorf("expected metric %q not found", key) - continue - } - if expected != actual { - t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) - } - } -} - -func TestVMStateKPI_Collect_MissingFlavor(t *testing.T) { - dbEnv := testlibDB.SetupDBEnv(t) - testDB := db.DB{DbMap: dbEnv.DbMap} - defer dbEnv.Close() - if err := testDB.CreateTable( - testDB.AddTable(nova.Server{}), - testDB.AddTable(nova.Flavor{}), - ); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - mockData := []any{ - &nova.Server{ - ID: "server-1", - FlavorName: "m1.existing", - OSEXTAvailabilityZone: "az1", - Status: "ACTIVE", - }, - &nova.Server{ - ID: "server-2", - FlavorName: "m1.missing", - OSEXTAvailabilityZone: "az1", - Status: "ACTIVE", - }, - &nova.Flavor{ - ID: "flavor-1", - Name: "m1.existing", - ExtraSpecs: `{}`, - }, - } - - if err := testDB.Insert(mockData...); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - kpi := &VMStateKPI{} - if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) - close(ch) - - var count int - for range ch { - count++ - } - if count != 1 { - t.Errorf("expected 1 metric (missing flavor should be skipped), got %d", count) - } -} - -func TestVMStateKPI_Collect_NoDB(t *testing.T) { - kpi := &VMStateKPI{} - if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { - t.Fatalf("expected no error, got %v", err) - } - - ch := make(chan prometheus.Metric, 100) - kpi.Collect(ch) // Should not panic - close(ch) - - var count int - for range ch { - count++ - } - if count != 0 { - t.Errorf("expected 0 metrics when no DB, got %d", count) - } -} diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index b469790cc..2623ff8bd 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -21,7 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{ "vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{}, "vm_life_span_kpi": &compute.VMLifeSpanKPI{}, "vm_commitments_kpi": &compute.VMCommitmentsKPI{}, - "vm_state_kpi": &compute.VMStateKPI{}, + "vm_faults_kpi": &compute.VMFaultsKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{}, From 9e290d185dd844977099223dabdd0c183f57939b Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 13:47:59 +0200 Subject: [PATCH 05/11] Rename file, add faulty-vm label, and add alert --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 16 ++++++++++++++++ .../compute/{vm_state.go => vm_faults.go} | 12 ++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) rename internal/knowledge/kpis/plugins/compute/{vm_state.go => vm_faults.go} (91%) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 2449fa390..c8fc74b8e 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -592,3 +592,19 @@ groups: corruption, bugs in reservation creation, or external modifications. Reservations are automatically repaired, but the root cause should be investigated if this alert persists. + + - alert: CortexNovaDoesntFindValidHosts + expr: cortex_vm_faults{faultmessage=~".+No valid host was found.+"} > 0 + labels: + context: scheduling + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova scheduling cannot find valid hosts" + description: > + Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling + failed to find a valid host. This may indicate capacity issues, + misconfigured filters, or resource constraints in the datacenter. + Investigate the affected VMs and hypervisor availability. diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go similarity index 91% rename from internal/knowledge/kpis/plugins/compute/vm_state.go rename to internal/knowledge/kpis/plugins/compute/vm_faults.go index 30a0832b3..7d69f709f 100644 --- a/internal/knowledge/kpis/plugins/compute/vm_state.go +++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go @@ -42,7 +42,7 @@ func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) e } k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults", "Number of vm faults in the datacenter", - []string{"az", "hvtype", "state", "faultcode", "faultmessage"}, nil, + []string{"az", "hvtype", "state", "fault-code", "fault-message", "faulty-vm"}, nil, ) return nil } @@ -92,6 +92,7 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) { state string errcode string errmessage string + faultyVM string } counts := make(map[labels]float64) @@ -119,12 +120,19 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) { if server.FaultMessage != nil { errmsg = *server.FaultMessage } + // Only provide the server ID for faulty VMs, to avoid cardinality + // explosion in the metric. + faultyVM := "no" + if server.FaultCode != nil || server.FaultMessage != nil { + faultyVM = server.ID + } key := labels{ az: server.OSEXTAvailabilityZone, hvtype: string(hypervisorType), state: server.Status, errcode: strconv.FormatUint(uint64(errcode), 10), errmessage: errmsg, + faultyVM: faultyVM, } counts[key]++ } @@ -132,7 +140,7 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) { // Emit metrics to prometheus. for key, count := range counts { ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count, - key.az, key.hvtype, key.state, key.errcode, key.errmessage) + key.az, key.hvtype, key.state, key.errcode, key.errmessage, key.faultyVM) } vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts)) } From cf2a9e564d3ea4094ae30c09ff808ba450b90e21 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 13:49:37 +0200 Subject: [PATCH 06/11] Fix linting issue --- tools/visualize-reservations/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go index 90824fb6e..c99ff2eb1 100644 --- a/tools/visualize-reservations/main.go +++ b/tools/visualize-reservations/main.go @@ -1762,6 +1762,7 @@ func connectToPostgres( // Query servers with host information serverMap = make(map[string]serverInfo) + //nolint:gosec // This query is not using any user input, so it's not vulnerable to SQL injection rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName()) if err != nil { fmt.Fprintf(os.Stderr, "Warning: Could not query "+nova.Server{}.TableName()+": %v\n", err) From 50a38166b4ba1a306122c60384d5b223a16e9472 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 13:57:58 +0200 Subject: [PATCH 07/11] Unit tests --- .../kpis/plugins/compute/vm_faults_test.go | 408 ++++++++++++++++++ 1 file changed, 408 insertions(+) create mode 100644 internal/knowledge/kpis/plugins/compute/vm_faults_test.go diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go new file mode 100644 index 000000000..a5f63b42c --- /dev/null +++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go @@ -0,0 +1,408 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/pkg/conf" + testlib "github.com/cobaltcore-dev/cortex/pkg/testing" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" +) + +func TestVMFaultsKPI_GetName(t *testing.T) { + kpi := VMFaultsKPI{} + if kpi.GetName() != "vm_faults_kpi" { + t.Errorf("expected 'vm_faults_kpi', got %q", kpi.GetName()) + } +} + +func TestVMFaultsKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + if kpi.vmFaultsDesc == nil { + t.Error("vmFaultsDesc should be initialized") + } +} + +func TestVMFaultsKPI_Describe(t *testing.T) { + kpi := &VMFaultsKPI{} + if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan *prometheus.Desc, 1) + kpi.Describe(ch) + close(ch) + + desc := <-ch + if desc == nil { + t.Error("expected descriptor to be sent to channel") + } +} + +func TestVMFaultsKPI_Collect_NoDB(t *testing.T) { + kpi := &VMFaultsKPI{} + if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Collect should not panic when no database is provided + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + if count != 0 { + t.Errorf("expected 0 metrics when no DB, got %d", count) + } +} + +func TestVMFaultsKPI_Collect(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert mock flavors with different hypervisor types + flavors := []any{ + &nova.Flavor{ + ID: "flavor-qemu", + Name: "qemu-small", + VCPUs: 2, + RAM: 4096, + ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`, + }, + &nova.Flavor{ + ID: "flavor-vmware", + Name: "vmware-medium", + VCPUs: 4, + RAM: 8192, + ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter Server"}`, + }, + &nova.Flavor{ + ID: "flavor-unspecified", + Name: "generic-large", + VCPUs: 8, + RAM: 16384, + ExtraSpecs: `{}`, + }, + } + if err := testDB.Insert(flavors...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert mock servers + servers := []any{ + // Normal server without fault + &nova.Server{ + ID: "server-1", + Name: "normal-vm", + Status: "ACTIVE", + FlavorName: "qemu-small", + OSEXTAvailabilityZone: "az1", + }, + // Server with fault code and message + &nova.Server{ + ID: "server-2", + Name: "faulty-vm", + Status: "ERROR", + FlavorName: "qemu-small", + OSEXTAvailabilityZone: "az1", + FaultCode: testlib.Ptr(uint(500)), + FaultMessage: testlib.Ptr("Internal error"), + }, + // Another faulty server in different AZ + &nova.Server{ + ID: "server-3", + Name: "another-faulty", + Status: "ERROR", + FlavorName: "vmware-medium", + OSEXTAvailabilityZone: "az2", + FaultCode: testlib.Ptr(uint(400)), + FaultMessage: testlib.Ptr("Bad request"), + }, + // Server with only fault message (no code) + &nova.Server{ + ID: "server-4", + Name: "partial-fault", + Status: "BUILD", + FlavorName: "generic-large", + OSEXTAvailabilityZone: "az1", + FaultMessage: testlib.Ptr("Some warning"), + }, + // Server with flavor that doesn't exist (should be skipped) + &nova.Server{ + ID: "server-5", + Name: "orphan-vm", + Status: "ACTIVE", + FlavorName: "nonexistent-flavor", + OSEXTAvailabilityZone: "az1", + }, + } + if err := testDB.Insert(servers...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + type vmFaultsMetric struct { + az string + hvtype string + state string + faultCode string + faultMessage string + faultyVM string + value float64 + } + + metrics := make(map[string]vmFaultsMetric) + for metric := range ch { + var m prometheusgo.Metric + if err := metric.Write(&m); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + + labels := make(map[string]string) + for _, label := range m.Label { + labels[label.GetName()] = label.GetValue() + } + + key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" + + labels["fault-code"] + "|" + labels["faulty-vm"] + + metrics[key] = vmFaultsMetric{ + az: labels["az"], + hvtype: labels["hvtype"], + state: labels["state"], + faultCode: labels["fault-code"], + faultMessage: labels["fault-message"], + faultyVM: labels["faulty-vm"], + value: m.GetGauge().GetValue(), + } + } + + expectedMetrics := map[string]vmFaultsMetric{ + // Normal VM without fault + "az1|QEMU|ACTIVE|0|no": { + az: "az1", + hvtype: "QEMU", + state: "ACTIVE", + faultCode: "0", + faultMessage: "n/a", + faultyVM: "no", + value: 1, + }, + // Faulty VM with code 500 + "az1|QEMU|ERROR|500|server-2": { + az: "az1", + hvtype: "QEMU", + state: "ERROR", + faultCode: "500", + faultMessage: "Internal error", + faultyVM: "server-2", + value: 1, + }, + // Faulty VM with code 400 in az2 + "az2|VMware vCenter Server|ERROR|400|server-3": { + az: "az2", + hvtype: "VMware vCenter Server", + state: "ERROR", + faultCode: "400", + faultMessage: "Bad request", + faultyVM: "server-3", + value: 1, + }, + // Server with only fault message (code=0 but has message) + "az1|Unspecified|BUILD|0|server-4": { + az: "az1", + hvtype: "Unspecified", + state: "BUILD", + faultCode: "0", + faultMessage: "Some warning", + faultyVM: "server-4", + value: 1, + }, + } + + if len(expectedMetrics) != len(metrics) { + t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics)) + t.Logf("actual metrics: %+v", metrics) + } + + for key, expected := range expectedMetrics { + actual, ok := metrics[key] + if !ok { + t.Errorf("expected metric %q not found", key) + continue + } + + if !reflect.DeepEqual(expected, actual) { + t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) + } + } +} + +func TestVMFaultsKPI_Collect_InvalidExtraSpecs(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert flavor with invalid extra specs JSON + flavors := []any{ + &nova.Flavor{ + ID: "flavor-bad", + Name: "bad-flavor", + VCPUs: 2, + RAM: 4096, + ExtraSpecs: `invalid-json`, + }, + } + if err := testDB.Insert(flavors...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + servers := []any{ + &nova.Server{ + ID: "server-bad", + Name: "bad-vm", + Status: "ACTIVE", + FlavorName: "bad-flavor", + OSEXTAvailabilityZone: "az1", + }, + } + if err := testDB.Insert(servers...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Should not panic, but should skip the server with invalid flavor + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + // Should have 0 metrics since the server's flavor has invalid extra specs + if count != 0 { + t.Errorf("expected 0 metrics, got %d", count) + } +} + +func TestVMFaultsKPI_Collect_MultipleSameLabels(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + flavors := []any{ + &nova.Flavor{ + ID: "flavor-1", + Name: "small", + VCPUs: 2, + RAM: 4096, + ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`, + }, + } + if err := testDB.Insert(flavors...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert multiple servers that should aggregate to same metric + servers := []any{ + &nova.Server{ + ID: "server-1", + Name: "vm-1", + Status: "ACTIVE", + FlavorName: "small", + OSEXTAvailabilityZone: "az1", + }, + &nova.Server{ + ID: "server-2", + Name: "vm-2", + Status: "ACTIVE", + FlavorName: "small", + OSEXTAvailabilityZone: "az1", + }, + &nova.Server{ + ID: "server-3", + Name: "vm-3", + Status: "ACTIVE", + FlavorName: "small", + OSEXTAvailabilityZone: "az1", + }, + } + if err := testDB.Insert(servers...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + var value float64 + for metric := range ch { + var m prometheusgo.Metric + if err := metric.Write(&m); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + value = m.GetGauge().GetValue() + } + + // All 3 VMs should be counted together since they have the same labels + if value != 3 { + t.Errorf("expected metric value 3, got %f", value) + } +} From aa05ab53d7521b8e1cf0c5444bec8a7f03b7eb80 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 14:01:50 +0200 Subject: [PATCH 08/11] Limit alert to kvm hypervisors --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index c8fc74b8e..e702bc508 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -593,8 +593,8 @@ groups: Reservations are automatically repaired, but the root cause should be investigated if this alert persists. - - alert: CortexNovaDoesntFindValidHosts - expr: cortex_vm_faults{faultmessage=~".+No valid host was found.+"} > 0 + - alert: CortexNovaDoesntFindValidKVMHosts + expr: cortex_vm_faults{hvtype=~"CH|QEMU",faultmessage=~".+No valid host was found.+"} > 0 labels: context: scheduling dashboard: cortex/cortex @@ -602,9 +602,9 @@ groups: severity: warning support_group: workload-management annotations: - summary: "Nova scheduling cannot find valid hosts" + summary: "Nova scheduling cannot find valid KVM hosts" description: > Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid host. This may indicate capacity issues, - misconfigured filters, or resource constraints in the datacenter. - Investigate the affected VMs and hypervisor availability. + failed to find a valid `{{$labels.hvtype}}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. From 7cce0da334cbf38e1548088c6292123be7a34ed6 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 14:34:29 +0200 Subject: [PATCH 09/11] Refine metric labels and add dashboard panel --- .../kpis/plugins/compute/vm_faults.go | 7 +- .../kpis/plugins/compute/vm_faults_test.go | 10 +- .../dashboards/cortex-status.json | 147 +++++++++++++++--- 3 files changed, 138 insertions(+), 26 deletions(-) diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go index 7d69f709f..fec71247c 100644 --- a/internal/knowledge/kpis/plugins/compute/vm_faults.go +++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go @@ -6,6 +6,7 @@ package compute import ( "errors" "strconv" + "strings" "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" @@ -42,7 +43,7 @@ func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) e } k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults", "Number of vm faults in the datacenter", - []string{"az", "hvtype", "state", "fault-code", "fault-message", "faulty-vm"}, nil, + []string{"az", "hvtype", "state", "faultcode", "faultmsg", "faultyvm"}, nil, ) return nil } @@ -119,6 +120,10 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) { errmsg := "n/a" if server.FaultMessage != nil { errmsg = *server.FaultMessage + // Sometimes the VM ID may appear in the error message, which can + // lead to high cardinality in the metric. To avoid this, we replace + // the VM ID with a placeholder. + errmsg = strings.ReplaceAll(errmsg, server.ID, "") } // Only provide the server ID for faulty VMs, to avoid cardinality // explosion in the metric. diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go index a5f63b42c..a5b248b55 100644 --- a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go +++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go @@ -126,7 +126,7 @@ func TestVMFaultsKPI_Collect(t *testing.T) { // Server with fault code and message &nova.Server{ ID: "server-2", - Name: "faulty-vm", + Name: "faultyvm", Status: "ERROR", FlavorName: "qemu-small", OSEXTAvailabilityZone: "az1", @@ -197,15 +197,15 @@ func TestVMFaultsKPI_Collect(t *testing.T) { } key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" + - labels["fault-code"] + "|" + labels["faulty-vm"] + labels["faultcode"] + "|" + labels["faultyvm"] metrics[key] = vmFaultsMetric{ az: labels["az"], hvtype: labels["hvtype"], state: labels["state"], - faultCode: labels["fault-code"], - faultMessage: labels["fault-message"], - faultyVM: labels["faulty-vm"], + faultCode: labels["faultcode"], + faultMessage: labels["faultmsg"], + faultyVM: labels["faultyvm"], value: m.GetGauge().GetValue(), } } diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json index f83e2926b..d05157d8d 100644 --- a/tools/plutono/provisioning/dashboards/cortex-status.json +++ b/tools/plutono/provisioning/dashboards/cortex-status.json @@ -16,7 +16,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 3, + "id": 1, "links": [], "panels": [ { @@ -557,6 +557,7 @@ "dashLength": 10, "dashes": false, "datasource": "prometheus-openstack", + "description": "", "fieldConfig": { "defaults": { "unit": "short" @@ -567,11 +568,117 @@ "fillGradient": 0, "gridPos": { "h": 12, - "w": 24, + "w": 12, "x": 0, "y": 31 }, "hiddenSeries": false, + "id": 58, + "interval": null, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.37", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultcode!=\"0\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{state}} {{faultmsg}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nova: faults in vm scheduling lifecycle", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:234", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:235", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus-openstack", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 43 + }, + "hiddenSeries": false, "id": 39, "legend": { "avg": false, @@ -669,7 +776,7 @@ "h": 11, "w": 6, "x": 0, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 31, @@ -766,7 +873,7 @@ "h": 11, "w": 6, "x": 6, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 33, @@ -878,7 +985,7 @@ "h": 11, "w": 6, "x": 12, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 35, @@ -990,7 +1097,7 @@ "h": 11, "w": 6, "x": 18, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 37, @@ -1100,7 +1207,7 @@ "h": 12, "w": 12, "x": 0, - "y": 54 + "y": 66 }, "hiddenSeries": false, "id": 27, @@ -1208,7 +1315,7 @@ "h": 12, "w": 12, "x": 12, - "y": 54 + "y": 66 }, "hiddenSeries": false, "id": 29, @@ -1296,7 +1403,7 @@ "h": 1, "w": 24, "x": 0, - "y": 66 + "y": 78 }, "id": 5, "panels": [], @@ -1321,7 +1428,7 @@ "h": 11, "w": 12, "x": 0, - "y": 67 + "y": 79 }, "hiddenSeries": false, "id": 2, @@ -1441,7 +1548,7 @@ "h": 11, "w": 12, "x": 12, - "y": 67 + "y": 79 }, "hiddenSeries": false, "id": 3, @@ -1580,7 +1687,7 @@ "h": 12, "w": 24, "x": 0, - "y": 78 + "y": 90 }, "id": 50, "options": { @@ -1621,7 +1728,7 @@ "h": 1, "w": 24, "x": 0, - "y": 90 + "y": 102 }, "id": 25, "panels": [], @@ -1644,7 +1751,7 @@ "h": 14, "w": 12, "x": 0, - "y": 91 + "y": 103 }, "hiddenSeries": false, "id": 21, @@ -1746,7 +1853,7 @@ "h": 14, "w": 12, "x": 12, - "y": 91 + "y": 103 }, "hiddenSeries": false, "id": 23, @@ -1839,7 +1946,7 @@ "h": 1, "w": 24, "x": 0, - "y": 105 + "y": 117 }, "id": 19, "panels": [], @@ -1862,7 +1969,7 @@ "h": 13, "w": 12, "x": 0, - "y": 106 + "y": 118 }, "hiddenSeries": false, "id": 17, @@ -1960,7 +2067,7 @@ "h": 13, "w": 12, "x": 12, - "y": 106 + "y": 118 }, "hiddenSeries": false, "id": 15, @@ -2057,7 +2164,7 @@ "h": 12, "w": 12, "x": 0, - "y": 119 + "y": 131 }, "hiddenSeries": false, "id": 11, @@ -2155,7 +2262,7 @@ "h": 12, "w": 12, "x": 12, - "y": 119 + "y": 131 }, "hiddenSeries": false, "id": 13, From 1c1c7c83f2461bc9691cf2a6037383838684a5ad Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 14:51:22 +0200 Subject: [PATCH 10/11] PR feedback --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 2 +- .../datasources/plugins/openstack/nova/nova_types.go | 4 +++- tools/plutono/provisioning/dashboards/cortex-status.json | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index e702bc508..a92881603 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -594,7 +594,7 @@ groups: investigated if this alert persists. - alert: CortexNovaDoesntFindValidKVMHosts - expr: cortex_vm_faults{hvtype=~"CH|QEMU",faultmessage=~".+No valid host was found.+"} > 0 + expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".+No valid host was found.+"}) > 0 labels: context: scheduling dashboard: cortex/cortex diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go index 70e4fb02e..1be2b7a29 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go @@ -358,7 +358,9 @@ const ( // extra specs. func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) { var extraSpecs map[string]string - if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil { + if f.ExtraSpecs == "" { + extraSpecs = map[string]string{} + } else if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil { return "", err // Return an error if the extra specs cannot be parsed. } hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"] diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json index d05157d8d..37f4b2479 100644 --- a/tools/plutono/provisioning/dashboards/cortex-status.json +++ b/tools/plutono/provisioning/dashboards/cortex-status.json @@ -606,7 +606,7 @@ "targets": [ { "exemplar": false, - "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultcode!=\"0\"})", + "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultyvm!=\"no\"})", "format": "time_series", "instant": false, "interval": "", From c1a2a7319f738d368b659df5e145421b4c24d909 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 30 Mar 2026 15:30:06 +0200 Subject: [PATCH 11/11] PR feedback --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index a92881603..e3271f119 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -594,7 +594,8 @@ groups: investigated if this alert persists. - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".+No valid host was found.+"}) > 0 + expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 + for: 5m labels: context: scheduling dashboard: cortex/cortex