From f4acf63054eb7b45bb6dc904729946fa8952f04b Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 11:46:55 +0200
Subject: [PATCH 01/11] Add vm state kpi
---
.../plugins/openstack/nova/nova_types.go | 46 ++++
.../kpis/plugins/compute/vm_state.go | 119 +++++++++
.../kpis/plugins/compute/vm_state_test.go | 240 ++++++++++++++++++
3 files changed, 405 insertions(+)
create mode 100644 internal/knowledge/kpis/plugins/compute/vm_state.go
create mode 100644 internal/knowledge/kpis/plugins/compute/vm_state_test.go
diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
index 322b05d69..2633c76fc 100644
--- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
+++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
@@ -285,6 +285,52 @@ type Flavor struct {
ExtraSpecs string `json:"extra_specs" db:"extra_specs"`
}
+// FlavorHypervisorType is a string type representing the specific
+// values the hypervisor type contained in flavor extra specs may have.
+type FlavorHypervisorType string
+
+const (
+ // FlavorHypervisorTypeQEMU maps flavors to QEMU/KVM hypervisors.
+ FlavorHypervisorTypeQEMU FlavorHypervisorType = "QEMU"
+ // FlavorHypervisorTypeCH maps flavors to Cloud-Hypervisor/KVM hypervisors.
+ FlavorHypervisorTypeCH FlavorHypervisorType = "CH"
+ // FlavorHypervisorTypeVMware maps flavors to VMware hypervisors.
+ FlavorHypervisorTypeVMware FlavorHypervisorType = "VMware vCenter Server"
+ // FlavorHypervisorTypeIronic maps flavors to Ironic baremetal instances.
+ FlavorHypervisorTypeIronic FlavorHypervisorType = "Ironic"
+ // FlavorHypervisorTypeOther is a flavor for which the hypervisor type
+ // is set in the extra specs but has an unknown value.
+ FlavorHypervisorTypeOther FlavorHypervisorType = "Other"
+ // FlavorHypervisorTypeUnspecified is a flavor for which the hypervisor type
+ // is not set in the extra specs.
+ FlavorHypervisorTypeUnspecified FlavorHypervisorType = "Unspecified"
+)
+
+// GetHypervisorType returns the hypervisor type of the flavor based on its
+// extra specs.
+func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) {
+ var extraSpecs map[string]string
+ if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
+ return "", err // Return an error if the extra specs cannot be parsed.
+ }
+ hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"]
+ if !ok {
+ return FlavorHypervisorTypeUnspecified, nil
+ }
+ switch hypervisorType {
+ case string(FlavorHypervisorTypeQEMU):
+ return FlavorHypervisorTypeQEMU, nil
+ case string(FlavorHypervisorTypeCH):
+ return FlavorHypervisorTypeCH, nil
+ case string(FlavorHypervisorTypeVMware):
+ return FlavorHypervisorTypeVMware, nil
+ case string(FlavorHypervisorTypeIronic):
+ return FlavorHypervisorTypeIronic, nil
+ default:
+ return FlavorHypervisorTypeOther, nil
+ }
+}
+
// Custom unmarshaler for OpenStackFlavor to handle nested JSON.
func (f *Flavor) UnmarshalJSON(data []byte) error {
type Alias Flavor
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_state.go
new file mode 100644
index 000000000..53e3d2fc4
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/compute/vm_state.go
@@ -0,0 +1,119 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package compute
+
+import (
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+var vmStateKPIlogger = ctrl.Log.WithName("vm-state-kpi")
+
+// This kpi monitors the current state of vms, i.e. how many vms are running,
+// stopped, paused, etc. It also exposes additional labels such as the vm's
+// hypervisor type which can be used to define alerts on non-running vms.
+type VMStateKPI struct {
+ // Common base for all KPIs that provides standard functionality.
+ plugins.BaseKPI[struct{}] // No options passed through yaml config
+
+ // Current state of the VM, e.g. running, stopped, paused, etc.
+ vmStateDesc *prometheus.Desc
+}
+
+// GetName returns a unique name for this kpi that is used for registration
+// and configuration.
+func (VMStateKPI) GetName() string { return "vm_state_kpi" }
+
+// Init initializes the kpi, e.g. by creating the necessary Prometheus
+// descriptors. The base kpi is also initialized with the provided database,
+// client and options.
+func (k *VMStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
+ if err := k.BaseKPI.Init(db, client, opts); err != nil {
+ return err
+ }
+ k.vmStateDesc = prometheus.NewDesc("cortex_vm_state",
+ "Current state of the VM, e.g. running, stopped, paused, etc.",
+ []string{"az", "hvtype", "state"}, nil,
+ )
+ return nil
+}
+
+// Describe sends the descriptor of this kpi to the provided channel. This is
+// used by Prometheus to know which metrics this kpi exposes.
+func (k *VMStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmStateDesc }
+
+// Collect collects the current state of vms from the database and sends it as
+// Prometheus metrics to the provided channel.
+func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
+ vmStateKPIlogger.Info("collecting vm state kpi")
+
+ // This can happen when no datasource is provided that connects to a database.
+ if k.DB == nil {
+ vmStateKPIlogger.Error(nil, "no database connection, cannot collect vm state kpi")
+ return
+ }
+
+ // Get all vms with their current state from the database.
+ var servers []nova.Server
+ nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName())
+ if err != nil {
+ vmStateKPIlogger.Error(err, "failed to query servers from database")
+ return
+ }
+ vmStateKPIlogger.Info("queried servers from database", "nServers", nServers)
+
+ // Get all flavors from the database to map them to the vms.
+ var flavors []nova.Flavor
+ nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName())
+ if err != nil {
+ vmStateKPIlogger.Error(err, "failed to query flavors from database")
+ return
+ }
+ vmStateKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors)
+
+ flavorsByName := make(map[string]nova.Flavor, len(flavors))
+ for _, flavor := range flavors {
+ flavorsByName[flavor.Name] = flavor
+ }
+
+ type labels struct {
+ az string
+ hvtype string
+ state string
+ }
+ counts := make(map[labels]float64)
+
+ // For each vm, get its hypervisor type and count up.
+ for _, server := range servers {
+ flavor, ok := flavorsByName[server.FlavorName]
+ if !ok {
+ vmStateKPIlogger.Error(nil, "flavor not found for server", "server",
+ server.ID, "flavor", server.FlavorName)
+ continue
+ }
+ hypervisorType, err := flavor.GetHypervisorType()
+ if err != nil {
+ vmStateKPIlogger.Error(err, "failed to get hypervisor type for server",
+ "server", server.ID, "flavor", flavor.Name)
+ continue
+ }
+ key := labels{
+ az: server.OSEXTAvailabilityZone,
+ hvtype: string(hypervisorType),
+ state: server.Status,
+ }
+ counts[key]++
+ }
+
+ // Emit metrics to prometheus.
+ for key, count := range counts {
+ ch <- prometheus.MustNewConstMetric(k.vmStateDesc, prometheus.GaugeValue, count,
+ key.az, key.hvtype, key.state)
+ }
+}
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state_test.go b/internal/knowledge/kpis/plugins/compute/vm_state_test.go
new file mode 100644
index 000000000..57196e1de
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/compute/vm_state_test.go
@@ -0,0 +1,240 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package compute
+
+import (
+ "testing"
+
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/prometheus/client_golang/prometheus"
+ prometheusgo "github.com/prometheus/client_model/go"
+)
+
+func TestVMStateKPI_Init(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+ kpi := &VMStateKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+}
+
+func TestVMStateKPI_Collect(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ mockData := []any{
+ // Servers in different AZs, states, and with different flavors
+ &nova.Server{
+ ID: "server-1",
+ FlavorName: "m1.small",
+ OSEXTAvailabilityZone: "az1",
+ Status: "ACTIVE",
+ },
+ &nova.Server{
+ ID: "server-2",
+ FlavorName: "m1.small",
+ OSEXTAvailabilityZone: "az1",
+ Status: "ACTIVE",
+ },
+ &nova.Server{
+ ID: "server-3",
+ FlavorName: "m1.small",
+ OSEXTAvailabilityZone: "az1",
+ Status: "STOPPED",
+ },
+ &nova.Server{
+ ID: "server-4",
+ FlavorName: "m1.vmware",
+ OSEXTAvailabilityZone: "az2",
+ Status: "ACTIVE",
+ },
+ &nova.Server{
+ ID: "server-5",
+ FlavorName: "m1.generic",
+ OSEXTAvailabilityZone: "az1",
+ Status: "PAUSED",
+ },
+ // Flavors with different hypervisor types
+ &nova.Flavor{
+ ID: "flavor-1",
+ Name: "m1.small",
+ ExtraSpecs: `{"capabilities:hypervisor_type": "QEMU"}`,
+ },
+ &nova.Flavor{
+ ID: "flavor-2",
+ Name: "m1.vmware",
+ ExtraSpecs: `{"capabilities:hypervisor_type": "VMware vCenter Server"}`,
+ },
+ &nova.Flavor{
+ ID: "flavor-3",
+ Name: "m1.generic",
+ ExtraSpecs: `{}`,
+ },
+ }
+
+ if err := testDB.Insert(mockData...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ kpi := &VMStateKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ type vmStateMetric struct {
+ az string
+ hvtype string
+ state string
+ count float64
+ }
+
+ metrics := make(map[string]vmStateMetric)
+ for metric := range ch {
+ var m prometheusgo.Metric
+ if err := metric.Write(&m); err != nil {
+ t.Fatalf("failed to write metric: %v", err)
+ }
+ labels := make(map[string]string)
+ for _, label := range m.Label {
+ labels[label.GetName()] = label.GetValue()
+ }
+ key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"]
+ metrics[key] = vmStateMetric{
+ az: labels["az"],
+ hvtype: labels["hvtype"],
+ state: labels["state"],
+ count: m.GetGauge().GetValue(),
+ }
+ }
+
+ expectedMetrics := map[string]vmStateMetric{
+ "az1|QEMU|ACTIVE": {
+ az: "az1",
+ hvtype: "QEMU",
+ state: "ACTIVE",
+ count: 2,
+ },
+ "az1|QEMU|STOPPED": {
+ az: "az1",
+ hvtype: "QEMU",
+ state: "STOPPED",
+ count: 1,
+ },
+ "az2|VMware vCenter Server|ACTIVE": {
+ az: "az2",
+ hvtype: "VMware vCenter Server",
+ state: "ACTIVE",
+ count: 1,
+ },
+ "az1|Unspecified|PAUSED": {
+ az: "az1",
+ hvtype: "Unspecified",
+ state: "PAUSED",
+ count: 1,
+ },
+ }
+
+ if len(expectedMetrics) != len(metrics) {
+ t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics))
+ }
+
+ for key, expected := range expectedMetrics {
+ actual, ok := metrics[key]
+ if !ok {
+ t.Errorf("expected metric %q not found", key)
+ continue
+ }
+ if expected != actual {
+ t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
+ }
+ }
+}
+
+func TestVMStateKPI_Collect_MissingFlavor(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ mockData := []any{
+ &nova.Server{
+ ID: "server-1",
+ FlavorName: "m1.existing",
+ OSEXTAvailabilityZone: "az1",
+ Status: "ACTIVE",
+ },
+ &nova.Server{
+ ID: "server-2",
+ FlavorName: "m1.missing",
+ OSEXTAvailabilityZone: "az1",
+ Status: "ACTIVE",
+ },
+ &nova.Flavor{
+ ID: "flavor-1",
+ Name: "m1.existing",
+ ExtraSpecs: `{}`,
+ },
+ }
+
+ if err := testDB.Insert(mockData...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ kpi := &VMStateKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ var count int
+ for range ch {
+ count++
+ }
+ if count != 1 {
+ t.Errorf("expected 1 metric (missing flavor should be skipped), got %d", count)
+ }
+}
+
+func TestVMStateKPI_Collect_NoDB(t *testing.T) {
+ kpi := &VMStateKPI{}
+ if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch) // Should not panic
+ close(ch)
+
+ var count int
+ for range ch {
+ count++
+ }
+ if count != 0 {
+ t.Errorf("expected 0 metrics when no DB, got %d", count)
+ }
+}
From 4ba03bb75444fb1ef67eb56b4b3a525a87268dbb Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 13:16:49 +0200
Subject: [PATCH 02/11] Add kpi to nova bundle
---
helm/bundles/cortex-nova/templates/kpis.yaml | 16 ++++++++++++++++
internal/knowledge/kpis/supported_kpis.go | 1 +
2 files changed, 17 insertions(+)
diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index af01c10c5..62ff7f499 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -110,6 +110,22 @@ spec:
---
apiVersion: cortex.cloud/v1alpha1
kind: KPI
+metadata:
+ name: vm-state
+spec:
+ schedulingDomain: nova
+ impl: vm_state_kpi
+ dependencies:
+ datasources:
+ - name: nova-servers
+ - name: nova-flavors
+ description: |
+ This kpi monitors the current state of vms, i.e. how many vms are running,
+ stopped, paused, etc. It also exposes additional labels such as the vm's
+ hypervisor type which can be used to define alerts on non-running vms.
+---
+apiVersion: cortex.cloud/v1alpha1
+kind: KPI
metadata:
name: cortex-nova-datasource-state
spec:
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index 274c5ace5..b469790cc 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -21,6 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{
"vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{},
"vm_life_span_kpi": &compute.VMLifeSpanKPI{},
"vm_commitments_kpi": &compute.VMCommitmentsKPI{},
+ "vm_state_kpi": &compute.VMStateKPI{},
"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
From b613bdc7dbd692442945202c90a36901bad89304 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 13:17:00 +0200
Subject: [PATCH 03/11] Support syncing server faults from nova
---
.../plugins/openstack/nova/nova_types.go | 74 +++++++++++++++----
.../compute/libvirt_domain_cpu_steal_pct.sql | 2 +-
.../libvirt_domain_cpu_steal_pct_test.go | 2 +-
.../plugins/compute/vm_host_residency.sql | 2 +-
.../plugins/compute/vm_life_span.sql | 2 +-
.../compute/vrops_hostsystem_resolver.sql | 2 +-
.../compute/vrops_project_noisiness.sql | 2 +-
.../reservations/commitments/controller.go | 2 +-
tools/visualize-reservations/main.go | 5 +-
9 files changed, 71 insertions(+), 22 deletions(-)
diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
index 2633c76fc..70e4fb02e 100644
--- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
+++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
@@ -108,9 +108,24 @@ type Server struct {
OSEXTSTSVmState string `json:"OS-EXT-STS:vm_state" db:"os_ext_sts_vm_state"`
OSEXTSTSPowerState int `json:"OS-EXT-STS:power_state" db:"os_ext_sts_power_state"`
- // From nested JSON
+ // From nested server.flavor JSON
FlavorName string `json:"-" db:"flavor_name"`
+ // From nested server.fault JSON
+
+ // The error response code.
+ FaultCode *uint `json:"-" db:"fault_code"`
+ // The date and time when the exception was raised. The date and time stamp
+ // format is ISO 8601 (CCYY-MM-DDThh:mm:ss±hh:mm). For example,
+ // 2015-08-27T09:49:58-05:00. The ±hh:mm value if included, is the time zone
+ // as an offset from UTC. In the previous example, the offset value is -05:00.
+ FaultCreated *string `json:"-" db:"fault_created"`
+ // The error message.
+ FaultMessage *string `json:"-" db:"fault_message"`
+ // The stack trace. It is available if the response code is not 500 or you
+ // have the administrator privilege.
+ FaultDetails *string `json:"-" db:"fault_details"`
+
// Note: there are some more fields that are omitted. To include them again, add
// custom unmarshalers and marshalers for the struct below.
}
@@ -119,7 +134,8 @@ type Server struct {
func (s *Server) UnmarshalJSON(data []byte) error {
type Alias Server
aux := &struct {
- Flavor json.RawMessage `json:"flavor"`
+ Flavor json.RawMessage `json:"flavor"`
+ Fault *json.RawMessage `json:"fault,omitempty"`
*Alias
}{
Alias: (*Alias)(s),
@@ -135,31 +151,63 @@ func (s *Server) UnmarshalJSON(data []byte) error {
return err
}
s.FlavorName = flavor.Name
+ var fault struct {
+ Code uint `json:"code"`
+ Created string `json:"created"`
+ Message string `json:"message"`
+ Details *string `json:"details,omitempty"`
+ }
+ if aux.Fault != nil {
+ if err := json.Unmarshal(*aux.Fault, &fault); err != nil {
+ return err
+ }
+ s.FaultCode = &fault.Code
+ s.FaultCreated = &fault.Created
+ s.FaultMessage = &fault.Message
+ s.FaultDetails = fault.Details
+ }
return nil
}
// Custom marshaler for OpenStackServer to handle nested JSON.
func (s *Server) MarshalJSON() ([]byte, error) {
type Alias Server
+ type flavor struct {
+ // Starting in microversion 2.47, "id" was removed...
+ Name string `json:"original_name"`
+ }
+ flavorVal := flavor{
+ Name: s.FlavorName,
+ }
+ type fault struct {
+ Code uint `json:"code"`
+ Created string `json:"created"`
+ Message string `json:"message"`
+ Details *string `json:"details,omitempty"`
+ }
+ var faultVal *fault
+ if s.FaultCode != nil && s.FaultCreated != nil && s.FaultMessage != nil {
+ faultVal = &fault{
+ Code: *s.FaultCode,
+ Created: *s.FaultCreated,
+ Message: *s.FaultMessage,
+ Details: s.FaultDetails,
+ }
+ }
aux := &struct {
- Flavor struct {
- // Starting in microversion 2.47, "id" was removed...
- Name string `json:"original_name"`
- } `json:"flavor"`
+ Flavor flavor `json:"flavor"`
+ Fault *fault `json:"fault,omitempty"`
*Alias
}{
- Alias: (*Alias)(s),
- Flavor: struct {
- Name string `json:"original_name"`
- }{
- Name: s.FlavorName,
- },
+ Alias: (*Alias)(s),
+ Flavor: flavorVal,
+ Fault: faultVal,
}
return json.Marshal(aux)
}
// Table in which the openstack model is stored.
-func (Server) TableName() string { return "openstack_servers" }
+func (Server) TableName() string { return "openstack_servers_v2" }
// Index for the openstack model.
func (Server) Indexes() map[string][]string { return nil }
diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql
index ea2b9c97a..ab3c7b8a7 100644
--- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql
+++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql
@@ -3,6 +3,6 @@ SELECT
os.os_ext_srv_attr_host AS host,
MAX(value) AS max_steal_time_pct
FROM kvm_libvirt_domain_metrics kvm
-JOIN openstack_servers os ON os.os_ext_srv_attr_instance_name = kvm.domain
+JOIN openstack_servers_v2 os ON os.os_ext_srv_attr_instance_name = kvm.domain
WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL
GROUP BY os.os_ext_srv_attr_host, os.id;
\ No newline at end of file
diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go
index b9f84b188..bc28218b5 100644
--- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go
+++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go
@@ -56,7 +56,7 @@ func TestLibvirtDomainCPUStealPctExtractor_Extract(t *testing.T) {
t.Fatalf("expected no error, got %v", err)
}
- // Insert mock data into the openstack_servers table
+ // Insert mock data into the openstack servers table
servers := []any{
&nova.Server{
ID: "uuid-1",
diff --git a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql
index fff0086c4..c2b4b8846 100644
--- a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql
+++ b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql
@@ -21,7 +21,7 @@ WITH durations AS (
)) AS BIGINT)
) AS duration
FROM openstack_migrations AS migrations
- LEFT JOIN openstack_servers AS servers ON servers.id = migrations.instance_uuid
+ LEFT JOIN openstack_servers_v2 AS servers ON servers.id = migrations.instance_uuid
LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name
)
SELECT
diff --git a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql
index daaa0a470..1fad31536 100644
--- a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql
+++ b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql
@@ -13,7 +13,7 @@ running_servers AS (
EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration,
COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name,
false::BOOLEAN AS deleted
- FROM openstack_servers servers
+ FROM openstack_servers_v2 servers
LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name
WHERE servers.created IS NOT NULL
)
diff --git a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql
index e2c6ad4b2..8ab0a2c70 100644
--- a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql
+++ b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql
@@ -3,5 +3,5 @@ SELECT DISTINCT
m.hostsystem AS vrops_hostsystem,
s.os_ext_srv_attr_host AS nova_compute_host
FROM vrops_vm_metrics m
-LEFT JOIN openstack_servers s ON m.instance_uuid = s.id
+LEFT JOIN openstack_servers_v2 s ON m.instance_uuid = s.id
WHERE s.os_ext_srv_attr_host IS NOT NULL;
diff --git a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql
index 334668b22..0b0067790 100644
--- a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql
+++ b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql
@@ -19,7 +19,7 @@ host_cpu_usage AS (
s.tenant_id,
h.service_host,
AVG(p.avg_cpu) AS avg_cpu_of_project
- FROM openstack_servers s
+ FROM openstack_servers_v2 s
JOIN vrops_vm_metrics m ON s.id = m.instance_uuid
JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id
JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname
diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go
index 9c238aeee..d38c6e1d8 100644
--- a/internal/scheduling/reservations/commitments/controller.go
+++ b/internal/scheduling/reservations/commitments/controller.go
@@ -445,7 +445,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con
// Query servers from the database cache.
var servers []nova.Server
_, err := r.DB.Select(&servers,
- "SELECT * FROM openstack_servers WHERE tenant_id = $1",
+ "SELECT * FROM "+nova.Server{}.TableName()+" WHERE tenant_id = $1",
projectID)
if err != nil {
return nil, fmt.Errorf("failed to query servers from database: %w", err)
diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go
index 9b5880be5..90824fb6e 100644
--- a/tools/visualize-reservations/main.go
+++ b/tools/visualize-reservations/main.go
@@ -52,6 +52,7 @@ import (
"time"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
_ "github.com/lib/pq"
corev1 "k8s.io/api/core/v1"
@@ -1761,9 +1762,9 @@ func connectToPostgres(
// Query servers with host information
serverMap = make(map[string]serverInfo)
- rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM openstack_servers")
+ rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName())
if err != nil {
- fmt.Fprintf(os.Stderr, "Warning: Could not query openstack_servers: %v\n", err)
+ fmt.Fprintf(os.Stderr, "Warning: Could not query "+nova.Server{}.TableName()+": %v\n", err)
} else {
defer rows.Close()
for rows.Next() {
From aeb3fb30eba86812270c852b1a85161ee138c7bd Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 13:35:31 +0200
Subject: [PATCH 04/11] Rename vm_state_kpi -> vm_faults_kpi
---
helm/bundles/cortex-nova/templates/kpis.yaml | 11 +-
.../kpis/plugins/compute/vm_state.go | 83 +++---
.../kpis/plugins/compute/vm_state_test.go | 240 ------------------
internal/knowledge/kpis/supported_kpis.go | 2 +-
4 files changed, 58 insertions(+), 278 deletions(-)
delete mode 100644 internal/knowledge/kpis/plugins/compute/vm_state_test.go
diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index 62ff7f499..bc5666926 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -111,18 +111,19 @@ spec:
apiVersion: cortex.cloud/v1alpha1
kind: KPI
metadata:
- name: vm-state
+ name: vm-faults
spec:
schedulingDomain: nova
- impl: vm_state_kpi
+ impl: vm_faults_kpi
dependencies:
datasources:
- name: nova-servers
- name: nova-flavors
description: |
- This kpi monitors the current state of vms, i.e. how many vms are running,
- stopped, paused, etc. It also exposes additional labels such as the vm's
- hypervisor type which can be used to define alerts on non-running vms.
+ This kpi tracks vm faults in the datacenter. It exposes helpful information
+ about the faults, such as the availability zone, hypervisor type, vm state,
+ and error info if available. This can be used to identify issues in the
+ datacenter and to monitor the overall health of the vms.
---
apiVersion: cortex.cloud/v1alpha1
kind: KPI
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_state.go
index 53e3d2fc4..30a0832b3 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_state.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_state.go
@@ -4,6 +4,9 @@
package compute
import (
+ "errors"
+ "strconv"
+
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
@@ -13,49 +16,50 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)
-var vmStateKPIlogger = ctrl.Log.WithName("vm-state-kpi")
+var vmFaultsKPIlogger = ctrl.Log.WithName("vm-faults-kpi")
-// This kpi monitors the current state of vms, i.e. how many vms are running,
-// stopped, paused, etc. It also exposes additional labels such as the vm's
-// hypervisor type which can be used to define alerts on non-running vms.
-type VMStateKPI struct {
- // Common base for all KPIs that provides standard functionality.
- plugins.BaseKPI[struct{}] // No options passed through yaml config
+// This kpi tracks vm faults in the datacenter. It exposes helpful information
+// about the faults, such as the availability zone, hypervisor type, vm state,
+// and error info if available. This can be used to identify issues in the
+// datacenter and to monitor the overall health of the vms.
+type VMFaultsKPI struct {
+ plugins.BaseKPI[struct{} /* No opts */]
- // Current state of the VM, e.g. running, stopped, paused, etc.
- vmStateDesc *prometheus.Desc
+ // vmFaultsDesc describes the prometheus metric for vm faults.
+ vmFaultsDesc *prometheus.Desc
}
// GetName returns a unique name for this kpi that is used for registration
// and configuration.
-func (VMStateKPI) GetName() string { return "vm_state_kpi" }
+func (VMFaultsKPI) GetName() string { return "vm_faults_kpi" }
// Init initializes the kpi, e.g. by creating the necessary Prometheus
// descriptors. The base kpi is also initialized with the provided database,
// client and options.
-func (k *VMStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
+func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
if err := k.BaseKPI.Init(db, client, opts); err != nil {
return err
}
- k.vmStateDesc = prometheus.NewDesc("cortex_vm_state",
- "Current state of the VM, e.g. running, stopped, paused, etc.",
- []string{"az", "hvtype", "state"}, nil,
+ k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults",
+ "Number of vm faults in the datacenter",
+ []string{"az", "hvtype", "state", "faultcode", "faultmessage"}, nil,
)
return nil
}
// Describe sends the descriptor of this kpi to the provided channel. This is
// used by Prometheus to know which metrics this kpi exposes.
-func (k *VMStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmStateDesc }
+func (k *VMFaultsKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmFaultsDesc }
// Collect collects the current state of vms from the database and sends it as
// Prometheus metrics to the provided channel.
-func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
- vmStateKPIlogger.Info("collecting vm state kpi")
+func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
+ vmFaultsKPIlogger.Info("collecting metrics")
// This can happen when no datasource is provided that connects to a database.
if k.DB == nil {
- vmStateKPIlogger.Error(nil, "no database connection, cannot collect vm state kpi")
+ err := errors.New("no database connection")
+ vmFaultsKPIlogger.Error(err, "cannot collect metric")
return
}
@@ -63,19 +67,19 @@ func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
var servers []nova.Server
nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName())
if err != nil {
- vmStateKPIlogger.Error(err, "failed to query servers from database")
+ vmFaultsKPIlogger.Error(err, "failed to query servers from database")
return
}
- vmStateKPIlogger.Info("queried servers from database", "nServers", nServers)
+ vmFaultsKPIlogger.Info("queried servers from database", "nServers", nServers)
// Get all flavors from the database to map them to the vms.
var flavors []nova.Flavor
nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName())
if err != nil {
- vmStateKPIlogger.Error(err, "failed to query flavors from database")
+ vmFaultsKPIlogger.Error(err, "failed to query flavors from database")
return
}
- vmStateKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors)
+ vmFaultsKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors)
flavorsByName := make(map[string]nova.Flavor, len(flavors))
for _, flavor := range flavors {
@@ -83,37 +87,52 @@ func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
}
type labels struct {
- az string
- hvtype string
- state string
+ az string
+ hvtype string
+ state string
+ errcode string
+ errmessage string
}
counts := make(map[labels]float64)
// For each vm, get its hypervisor type and count up.
+ // Note: this will also expose VMs that are NOT in an error state,
+ // but this can be useful for comparison against the number of faulty VMs.
for _, server := range servers {
flavor, ok := flavorsByName[server.FlavorName]
if !ok {
- vmStateKPIlogger.Error(nil, "flavor not found for server", "server",
+ vmFaultsKPIlogger.Info("warning: flavor not found for server", "server",
server.ID, "flavor", server.FlavorName)
continue
}
hypervisorType, err := flavor.GetHypervisorType()
if err != nil {
- vmStateKPIlogger.Error(err, "failed to get hypervisor type for server",
+ vmFaultsKPIlogger.Error(err, "failed to get hypervisor type for server",
"server", server.ID, "flavor", flavor.Name)
continue
}
+ var errcode uint = 0
+ if server.FaultCode != nil {
+ errcode = *server.FaultCode
+ }
+ errmsg := "n/a"
+ if server.FaultMessage != nil {
+ errmsg = *server.FaultMessage
+ }
key := labels{
- az: server.OSEXTAvailabilityZone,
- hvtype: string(hypervisorType),
- state: server.Status,
+ az: server.OSEXTAvailabilityZone,
+ hvtype: string(hypervisorType),
+ state: server.Status,
+ errcode: strconv.FormatUint(uint64(errcode), 10),
+ errmessage: errmsg,
}
counts[key]++
}
// Emit metrics to prometheus.
for key, count := range counts {
- ch <- prometheus.MustNewConstMetric(k.vmStateDesc, prometheus.GaugeValue, count,
- key.az, key.hvtype, key.state)
+ ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count,
+ key.az, key.hvtype, key.state, key.errcode, key.errmessage)
}
+ vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts))
}
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state_test.go b/internal/knowledge/kpis/plugins/compute/vm_state_test.go
deleted file mode 100644
index 57196e1de..000000000
--- a/internal/knowledge/kpis/plugins/compute/vm_state_test.go
+++ /dev/null
@@ -1,240 +0,0 @@
-// Copyright SAP SE
-// SPDX-License-Identifier: Apache-2.0
-
-package compute
-
-import (
- "testing"
-
- "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
- "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
- testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
- "github.com/cobaltcore-dev/cortex/pkg/conf"
- "github.com/prometheus/client_golang/prometheus"
- prometheusgo "github.com/prometheus/client_model/go"
-)
-
-func TestVMStateKPI_Init(t *testing.T) {
- dbEnv := testlibDB.SetupDBEnv(t)
- testDB := db.DB{DbMap: dbEnv.DbMap}
- defer dbEnv.Close()
- kpi := &VMStateKPI{}
- if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-}
-
-func TestVMStateKPI_Collect(t *testing.T) {
- dbEnv := testlibDB.SetupDBEnv(t)
- testDB := db.DB{DbMap: dbEnv.DbMap}
- defer dbEnv.Close()
- if err := testDB.CreateTable(
- testDB.AddTable(nova.Server{}),
- testDB.AddTable(nova.Flavor{}),
- ); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- mockData := []any{
- // Servers in different AZs, states, and with different flavors
- &nova.Server{
- ID: "server-1",
- FlavorName: "m1.small",
- OSEXTAvailabilityZone: "az1",
- Status: "ACTIVE",
- },
- &nova.Server{
- ID: "server-2",
- FlavorName: "m1.small",
- OSEXTAvailabilityZone: "az1",
- Status: "ACTIVE",
- },
- &nova.Server{
- ID: "server-3",
- FlavorName: "m1.small",
- OSEXTAvailabilityZone: "az1",
- Status: "STOPPED",
- },
- &nova.Server{
- ID: "server-4",
- FlavorName: "m1.vmware",
- OSEXTAvailabilityZone: "az2",
- Status: "ACTIVE",
- },
- &nova.Server{
- ID: "server-5",
- FlavorName: "m1.generic",
- OSEXTAvailabilityZone: "az1",
- Status: "PAUSED",
- },
- // Flavors with different hypervisor types
- &nova.Flavor{
- ID: "flavor-1",
- Name: "m1.small",
- ExtraSpecs: `{"capabilities:hypervisor_type": "QEMU"}`,
- },
- &nova.Flavor{
- ID: "flavor-2",
- Name: "m1.vmware",
- ExtraSpecs: `{"capabilities:hypervisor_type": "VMware vCenter Server"}`,
- },
- &nova.Flavor{
- ID: "flavor-3",
- Name: "m1.generic",
- ExtraSpecs: `{}`,
- },
- }
-
- if err := testDB.Insert(mockData...); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- kpi := &VMStateKPI{}
- if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- ch := make(chan prometheus.Metric, 100)
- kpi.Collect(ch)
- close(ch)
-
- type vmStateMetric struct {
- az string
- hvtype string
- state string
- count float64
- }
-
- metrics := make(map[string]vmStateMetric)
- for metric := range ch {
- var m prometheusgo.Metric
- if err := metric.Write(&m); err != nil {
- t.Fatalf("failed to write metric: %v", err)
- }
- labels := make(map[string]string)
- for _, label := range m.Label {
- labels[label.GetName()] = label.GetValue()
- }
- key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"]
- metrics[key] = vmStateMetric{
- az: labels["az"],
- hvtype: labels["hvtype"],
- state: labels["state"],
- count: m.GetGauge().GetValue(),
- }
- }
-
- expectedMetrics := map[string]vmStateMetric{
- "az1|QEMU|ACTIVE": {
- az: "az1",
- hvtype: "QEMU",
- state: "ACTIVE",
- count: 2,
- },
- "az1|QEMU|STOPPED": {
- az: "az1",
- hvtype: "QEMU",
- state: "STOPPED",
- count: 1,
- },
- "az2|VMware vCenter Server|ACTIVE": {
- az: "az2",
- hvtype: "VMware vCenter Server",
- state: "ACTIVE",
- count: 1,
- },
- "az1|Unspecified|PAUSED": {
- az: "az1",
- hvtype: "Unspecified",
- state: "PAUSED",
- count: 1,
- },
- }
-
- if len(expectedMetrics) != len(metrics) {
- t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics))
- }
-
- for key, expected := range expectedMetrics {
- actual, ok := metrics[key]
- if !ok {
- t.Errorf("expected metric %q not found", key)
- continue
- }
- if expected != actual {
- t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
- }
- }
-}
-
-func TestVMStateKPI_Collect_MissingFlavor(t *testing.T) {
- dbEnv := testlibDB.SetupDBEnv(t)
- testDB := db.DB{DbMap: dbEnv.DbMap}
- defer dbEnv.Close()
- if err := testDB.CreateTable(
- testDB.AddTable(nova.Server{}),
- testDB.AddTable(nova.Flavor{}),
- ); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- mockData := []any{
- &nova.Server{
- ID: "server-1",
- FlavorName: "m1.existing",
- OSEXTAvailabilityZone: "az1",
- Status: "ACTIVE",
- },
- &nova.Server{
- ID: "server-2",
- FlavorName: "m1.missing",
- OSEXTAvailabilityZone: "az1",
- Status: "ACTIVE",
- },
- &nova.Flavor{
- ID: "flavor-1",
- Name: "m1.existing",
- ExtraSpecs: `{}`,
- },
- }
-
- if err := testDB.Insert(mockData...); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- kpi := &VMStateKPI{}
- if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- ch := make(chan prometheus.Metric, 100)
- kpi.Collect(ch)
- close(ch)
-
- var count int
- for range ch {
- count++
- }
- if count != 1 {
- t.Errorf("expected 1 metric (missing flavor should be skipped), got %d", count)
- }
-}
-
-func TestVMStateKPI_Collect_NoDB(t *testing.T) {
- kpi := &VMStateKPI{}
- if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
- t.Fatalf("expected no error, got %v", err)
- }
-
- ch := make(chan prometheus.Metric, 100)
- kpi.Collect(ch) // Should not panic
- close(ch)
-
- var count int
- for range ch {
- count++
- }
- if count != 0 {
- t.Errorf("expected 0 metrics when no DB, got %d", count)
- }
-}
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index b469790cc..2623ff8bd 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -21,7 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{
"vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{},
"vm_life_span_kpi": &compute.VMLifeSpanKPI{},
"vm_commitments_kpi": &compute.VMCommitmentsKPI{},
- "vm_state_kpi": &compute.VMStateKPI{},
+ "vm_faults_kpi": &compute.VMFaultsKPI{},
"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
From 9e290d185dd844977099223dabdd0c183f57939b Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 13:47:59 +0200
Subject: [PATCH 05/11] Rename file, add faulty-vm label, and add alert
---
helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 16 ++++++++++++++++
.../compute/{vm_state.go => vm_faults.go} | 12 ++++++++++--
2 files changed, 26 insertions(+), 2 deletions(-)
rename internal/knowledge/kpis/plugins/compute/{vm_state.go => vm_faults.go} (91%)
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 2449fa390..c8fc74b8e 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -592,3 +592,19 @@ groups:
corruption, bugs in reservation creation, or external modifications.
Reservations are automatically repaired, but the root cause should be
investigated if this alert persists.
+
+ - alert: CortexNovaDoesntFindValidHosts
+ expr: cortex_vm_faults{faultmessage=~".+No valid host was found.+"} > 0
+ labels:
+ context: scheduling
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Nova scheduling cannot find valid hosts"
+ description: >
+ Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling
+ failed to find a valid host. This may indicate capacity issues,
+ misconfigured filters, or resource constraints in the datacenter.
+ Investigate the affected VMs and hypervisor availability.
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go
similarity index 91%
rename from internal/knowledge/kpis/plugins/compute/vm_state.go
rename to internal/knowledge/kpis/plugins/compute/vm_faults.go
index 30a0832b3..7d69f709f 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_state.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go
@@ -42,7 +42,7 @@ func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) e
}
k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults",
"Number of vm faults in the datacenter",
- []string{"az", "hvtype", "state", "faultcode", "faultmessage"}, nil,
+ []string{"az", "hvtype", "state", "fault-code", "fault-message", "faulty-vm"}, nil,
)
return nil
}
@@ -92,6 +92,7 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
state string
errcode string
errmessage string
+ faultyVM string
}
counts := make(map[labels]float64)
@@ -119,12 +120,19 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
if server.FaultMessage != nil {
errmsg = *server.FaultMessage
}
+ // Only provide the server ID for faulty VMs, to avoid cardinality
+ // explosion in the metric.
+ faultyVM := "no"
+ if server.FaultCode != nil || server.FaultMessage != nil {
+ faultyVM = server.ID
+ }
key := labels{
az: server.OSEXTAvailabilityZone,
hvtype: string(hypervisorType),
state: server.Status,
errcode: strconv.FormatUint(uint64(errcode), 10),
errmessage: errmsg,
+ faultyVM: faultyVM,
}
counts[key]++
}
@@ -132,7 +140,7 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
// Emit metrics to prometheus.
for key, count := range counts {
ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count,
- key.az, key.hvtype, key.state, key.errcode, key.errmessage)
+ key.az, key.hvtype, key.state, key.errcode, key.errmessage, key.faultyVM)
}
vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts))
}
From cf2a9e564d3ea4094ae30c09ff808ba450b90e21 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 13:49:37 +0200
Subject: [PATCH 06/11] Fix linting issue
---
tools/visualize-reservations/main.go | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go
index 90824fb6e..c99ff2eb1 100644
--- a/tools/visualize-reservations/main.go
+++ b/tools/visualize-reservations/main.go
@@ -1762,6 +1762,7 @@ func connectToPostgres(
// Query servers with host information
serverMap = make(map[string]serverInfo)
+ //nolint:gosec // This query is not using any user input, so it's not vulnerable to SQL injection
rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName())
if err != nil {
fmt.Fprintf(os.Stderr, "Warning: Could not query "+nova.Server{}.TableName()+": %v\n", err)
From 50a38166b4ba1a306122c60384d5b223a16e9472 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 13:57:58 +0200
Subject: [PATCH 07/11] Unit tests
---
.../kpis/plugins/compute/vm_faults_test.go | 408 ++++++++++++++++++
1 file changed, 408 insertions(+)
create mode 100644 internal/knowledge/kpis/plugins/compute/vm_faults_test.go
diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
new file mode 100644
index 000000000..a5f63b42c
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
@@ -0,0 +1,408 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package compute
+
+import (
+ "reflect"
+ "testing"
+
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+ "github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+ testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ testlib "github.com/cobaltcore-dev/cortex/pkg/testing"
+ "github.com/prometheus/client_golang/prometheus"
+ prometheusgo "github.com/prometheus/client_model/go"
+)
+
+func TestVMFaultsKPI_GetName(t *testing.T) {
+ kpi := VMFaultsKPI{}
+ if kpi.GetName() != "vm_faults_kpi" {
+ t.Errorf("expected 'vm_faults_kpi', got %q", kpi.GetName())
+ }
+}
+
+func TestVMFaultsKPI_Init(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ kpi := &VMFaultsKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ if kpi.vmFaultsDesc == nil {
+ t.Error("vmFaultsDesc should be initialized")
+ }
+}
+
+func TestVMFaultsKPI_Describe(t *testing.T) {
+ kpi := &VMFaultsKPI{}
+ if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ ch := make(chan *prometheus.Desc, 1)
+ kpi.Describe(ch)
+ close(ch)
+
+ desc := <-ch
+ if desc == nil {
+ t.Error("expected descriptor to be sent to channel")
+ }
+}
+
+func TestVMFaultsKPI_Collect_NoDB(t *testing.T) {
+ kpi := &VMFaultsKPI{}
+ if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ // Collect should not panic when no database is provided
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ count := 0
+ for range ch {
+ count++
+ }
+ if count != 0 {
+ t.Errorf("expected 0 metrics when no DB, got %d", count)
+ }
+}
+
+func TestVMFaultsKPI_Collect(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ // Insert mock flavors with different hypervisor types
+ flavors := []any{
+ &nova.Flavor{
+ ID: "flavor-qemu",
+ Name: "qemu-small",
+ VCPUs: 2,
+ RAM: 4096,
+ ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`,
+ },
+ &nova.Flavor{
+ ID: "flavor-vmware",
+ Name: "vmware-medium",
+ VCPUs: 4,
+ RAM: 8192,
+ ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter Server"}`,
+ },
+ &nova.Flavor{
+ ID: "flavor-unspecified",
+ Name: "generic-large",
+ VCPUs: 8,
+ RAM: 16384,
+ ExtraSpecs: `{}`,
+ },
+ }
+ if err := testDB.Insert(flavors...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ // Insert mock servers
+ servers := []any{
+ // Normal server without fault
+ &nova.Server{
+ ID: "server-1",
+ Name: "normal-vm",
+ Status: "ACTIVE",
+ FlavorName: "qemu-small",
+ OSEXTAvailabilityZone: "az1",
+ },
+ // Server with fault code and message
+ &nova.Server{
+ ID: "server-2",
+ Name: "faulty-vm",
+ Status: "ERROR",
+ FlavorName: "qemu-small",
+ OSEXTAvailabilityZone: "az1",
+ FaultCode: testlib.Ptr(uint(500)),
+ FaultMessage: testlib.Ptr("Internal error"),
+ },
+ // Another faulty server in different AZ
+ &nova.Server{
+ ID: "server-3",
+ Name: "another-faulty",
+ Status: "ERROR",
+ FlavorName: "vmware-medium",
+ OSEXTAvailabilityZone: "az2",
+ FaultCode: testlib.Ptr(uint(400)),
+ FaultMessage: testlib.Ptr("Bad request"),
+ },
+ // Server with only fault message (no code)
+ &nova.Server{
+ ID: "server-4",
+ Name: "partial-fault",
+ Status: "BUILD",
+ FlavorName: "generic-large",
+ OSEXTAvailabilityZone: "az1",
+ FaultMessage: testlib.Ptr("Some warning"),
+ },
+ // Server with flavor that doesn't exist (should be skipped)
+ &nova.Server{
+ ID: "server-5",
+ Name: "orphan-vm",
+ Status: "ACTIVE",
+ FlavorName: "nonexistent-flavor",
+ OSEXTAvailabilityZone: "az1",
+ },
+ }
+ if err := testDB.Insert(servers...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ kpi := &VMFaultsKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ type vmFaultsMetric struct {
+ az string
+ hvtype string
+ state string
+ faultCode string
+ faultMessage string
+ faultyVM string
+ value float64
+ }
+
+ metrics := make(map[string]vmFaultsMetric)
+ for metric := range ch {
+ var m prometheusgo.Metric
+ if err := metric.Write(&m); err != nil {
+ t.Fatalf("failed to write metric: %v", err)
+ }
+
+ labels := make(map[string]string)
+ for _, label := range m.Label {
+ labels[label.GetName()] = label.GetValue()
+ }
+
+ key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" +
+ labels["fault-code"] + "|" + labels["faulty-vm"]
+
+ metrics[key] = vmFaultsMetric{
+ az: labels["az"],
+ hvtype: labels["hvtype"],
+ state: labels["state"],
+ faultCode: labels["fault-code"],
+ faultMessage: labels["fault-message"],
+ faultyVM: labels["faulty-vm"],
+ value: m.GetGauge().GetValue(),
+ }
+ }
+
+ expectedMetrics := map[string]vmFaultsMetric{
+ // Normal VM without fault
+ "az1|QEMU|ACTIVE|0|no": {
+ az: "az1",
+ hvtype: "QEMU",
+ state: "ACTIVE",
+ faultCode: "0",
+ faultMessage: "n/a",
+ faultyVM: "no",
+ value: 1,
+ },
+ // Faulty VM with code 500
+ "az1|QEMU|ERROR|500|server-2": {
+ az: "az1",
+ hvtype: "QEMU",
+ state: "ERROR",
+ faultCode: "500",
+ faultMessage: "Internal error",
+ faultyVM: "server-2",
+ value: 1,
+ },
+ // Faulty VM with code 400 in az2
+ "az2|VMware vCenter Server|ERROR|400|server-3": {
+ az: "az2",
+ hvtype: "VMware vCenter Server",
+ state: "ERROR",
+ faultCode: "400",
+ faultMessage: "Bad request",
+ faultyVM: "server-3",
+ value: 1,
+ },
+ // Server with only fault message (code=0 but has message)
+ "az1|Unspecified|BUILD|0|server-4": {
+ az: "az1",
+ hvtype: "Unspecified",
+ state: "BUILD",
+ faultCode: "0",
+ faultMessage: "Some warning",
+ faultyVM: "server-4",
+ value: 1,
+ },
+ }
+
+ if len(expectedMetrics) != len(metrics) {
+ t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics))
+ t.Logf("actual metrics: %+v", metrics)
+ }
+
+ for key, expected := range expectedMetrics {
+ actual, ok := metrics[key]
+ if !ok {
+ t.Errorf("expected metric %q not found", key)
+ continue
+ }
+
+ if !reflect.DeepEqual(expected, actual) {
+ t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
+ }
+ }
+}
+
+func TestVMFaultsKPI_Collect_InvalidExtraSpecs(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ // Insert flavor with invalid extra specs JSON
+ flavors := []any{
+ &nova.Flavor{
+ ID: "flavor-bad",
+ Name: "bad-flavor",
+ VCPUs: 2,
+ RAM: 4096,
+ ExtraSpecs: `invalid-json`,
+ },
+ }
+ if err := testDB.Insert(flavors...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ servers := []any{
+ &nova.Server{
+ ID: "server-bad",
+ Name: "bad-vm",
+ Status: "ACTIVE",
+ FlavorName: "bad-flavor",
+ OSEXTAvailabilityZone: "az1",
+ },
+ }
+ if err := testDB.Insert(servers...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ kpi := &VMFaultsKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ // Should not panic, but should skip the server with invalid flavor
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ count := 0
+ for range ch {
+ count++
+ }
+ // Should have 0 metrics since the server's flavor has invalid extra specs
+ if count != 0 {
+ t.Errorf("expected 0 metrics, got %d", count)
+ }
+}
+
+func TestVMFaultsKPI_Collect_MultipleSameLabels(t *testing.T) {
+ dbEnv := testlibDB.SetupDBEnv(t)
+ testDB := db.DB{DbMap: dbEnv.DbMap}
+ defer dbEnv.Close()
+
+ if err := testDB.CreateTable(
+ testDB.AddTable(nova.Server{}),
+ testDB.AddTable(nova.Flavor{}),
+ ); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ flavors := []any{
+ &nova.Flavor{
+ ID: "flavor-1",
+ Name: "small",
+ VCPUs: 2,
+ RAM: 4096,
+ ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`,
+ },
+ }
+ if err := testDB.Insert(flavors...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ // Insert multiple servers that should aggregate to same metric
+ servers := []any{
+ &nova.Server{
+ ID: "server-1",
+ Name: "vm-1",
+ Status: "ACTIVE",
+ FlavorName: "small",
+ OSEXTAvailabilityZone: "az1",
+ },
+ &nova.Server{
+ ID: "server-2",
+ Name: "vm-2",
+ Status: "ACTIVE",
+ FlavorName: "small",
+ OSEXTAvailabilityZone: "az1",
+ },
+ &nova.Server{
+ ID: "server-3",
+ Name: "vm-3",
+ Status: "ACTIVE",
+ FlavorName: "small",
+ OSEXTAvailabilityZone: "az1",
+ },
+ }
+ if err := testDB.Insert(servers...); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ kpi := &VMFaultsKPI{}
+ if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+
+ ch := make(chan prometheus.Metric, 100)
+ kpi.Collect(ch)
+ close(ch)
+
+ var value float64
+ for metric := range ch {
+ var m prometheusgo.Metric
+ if err := metric.Write(&m); err != nil {
+ t.Fatalf("failed to write metric: %v", err)
+ }
+ value = m.GetGauge().GetValue()
+ }
+
+ // All 3 VMs should be counted together since they have the same labels
+ if value != 3 {
+ t.Errorf("expected metric value 3, got %f", value)
+ }
+}
From aa05ab53d7521b8e1cf0c5444bec8a7f03b7eb80 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 14:01:50 +0200
Subject: [PATCH 08/11] Limit alert to kvm hypervisors
---
helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index c8fc74b8e..e702bc508 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -593,8 +593,8 @@ groups:
Reservations are automatically repaired, but the root cause should be
investigated if this alert persists.
- - alert: CortexNovaDoesntFindValidHosts
- expr: cortex_vm_faults{faultmessage=~".+No valid host was found.+"} > 0
+ - alert: CortexNovaDoesntFindValidKVMHosts
+ expr: cortex_vm_faults{hvtype=~"CH|QEMU",faultmessage=~".+No valid host was found.+"} > 0
labels:
context: scheduling
dashboard: cortex/cortex
@@ -602,9 +602,9 @@ groups:
severity: warning
support_group: workload-management
annotations:
- summary: "Nova scheduling cannot find valid hosts"
+ summary: "Nova scheduling cannot find valid KVM hosts"
description: >
Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling
- failed to find a valid host. This may indicate capacity issues,
- misconfigured filters, or resource constraints in the datacenter.
- Investigate the affected VMs and hypervisor availability.
+ failed to find a valid `{{$labels.hvtype}}` host. This may indicate
+ capacity issues, misconfigured filters, or resource constraints in the
+ datacenter. Investigate the affected VMs and hypervisor availability.
From 7cce0da334cbf38e1548088c6292123be7a34ed6 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 14:34:29 +0200
Subject: [PATCH 09/11] Refine metric labels and add dashboard panel
---
.../kpis/plugins/compute/vm_faults.go | 7 +-
.../kpis/plugins/compute/vm_faults_test.go | 10 +-
.../dashboards/cortex-status.json | 147 +++++++++++++++---
3 files changed, 138 insertions(+), 26 deletions(-)
diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go
index 7d69f709f..fec71247c 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_faults.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go
@@ -6,6 +6,7 @@ package compute
import (
"errors"
"strconv"
+ "strings"
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
@@ -42,7 +43,7 @@ func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) e
}
k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults",
"Number of vm faults in the datacenter",
- []string{"az", "hvtype", "state", "fault-code", "fault-message", "faulty-vm"}, nil,
+ []string{"az", "hvtype", "state", "faultcode", "faultmsg", "faultyvm"}, nil,
)
return nil
}
@@ -119,6 +120,10 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
errmsg := "n/a"
if server.FaultMessage != nil {
errmsg = *server.FaultMessage
+ // Sometimes the VM ID may appear in the error message, which can
+ // lead to high cardinality in the metric. To avoid this, we strip
+ // the VM ID out of the message entirely.
+ errmsg = strings.ReplaceAll(errmsg, server.ID, "")
}
// Only provide the server ID for faulty VMs, to avoid cardinality
// explosion in the metric.
diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
index a5f63b42c..a5b248b55 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
@@ -126,7 +126,7 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
// Server with fault code and message
&nova.Server{
ID: "server-2",
- Name: "faulty-vm",
+ Name: "faultyvm",
Status: "ERROR",
FlavorName: "qemu-small",
OSEXTAvailabilityZone: "az1",
@@ -197,15 +197,15 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
}
key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" +
- labels["fault-code"] + "|" + labels["faulty-vm"]
+ labels["faultcode"] + "|" + labels["faultyvm"]
metrics[key] = vmFaultsMetric{
az: labels["az"],
hvtype: labels["hvtype"],
state: labels["state"],
- faultCode: labels["fault-code"],
- faultMessage: labels["fault-message"],
- faultyVM: labels["faulty-vm"],
+ faultCode: labels["faultcode"],
+ faultMessage: labels["faultmsg"],
+ faultyVM: labels["faultyvm"],
value: m.GetGauge().GetValue(),
}
}
diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json
index f83e2926b..d05157d8d 100644
--- a/tools/plutono/provisioning/dashboards/cortex-status.json
+++ b/tools/plutono/provisioning/dashboards/cortex-status.json
@@ -16,7 +16,7 @@
"editable": true,
"gnetId": null,
"graphTooltip": 0,
- "id": 3,
+ "id": 1,
"links": [],
"panels": [
{
@@ -557,6 +557,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "prometheus-openstack",
+ "description": "",
"fieldConfig": {
"defaults": {
"unit": "short"
@@ -567,11 +568,117 @@
"fillGradient": 0,
"gridPos": {
"h": 12,
- "w": 24,
+ "w": 12,
"x": 0,
"y": 31
},
"hiddenSeries": false,
+ "id": 58,
+ "interval": null,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": true,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.5.37",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "exemplar": false,
+ "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultcode!=\"0\"})",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{state}} {{faultmsg}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Nova: faults in vm scheduling lifecycle",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:234",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:235",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus-openstack",
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 24,
+ "x": 0,
+ "y": 43
+ },
+ "hiddenSeries": false,
"id": 39,
"legend": {
"avg": false,
@@ -669,7 +776,7 @@
"h": 11,
"w": 6,
"x": 0,
- "y": 43
+ "y": 55
},
"hiddenSeries": false,
"id": 31,
@@ -766,7 +873,7 @@
"h": 11,
"w": 6,
"x": 6,
- "y": 43
+ "y": 55
},
"hiddenSeries": false,
"id": 33,
@@ -878,7 +985,7 @@
"h": 11,
"w": 6,
"x": 12,
- "y": 43
+ "y": 55
},
"hiddenSeries": false,
"id": 35,
@@ -990,7 +1097,7 @@
"h": 11,
"w": 6,
"x": 18,
- "y": 43
+ "y": 55
},
"hiddenSeries": false,
"id": 37,
@@ -1100,7 +1207,7 @@
"h": 12,
"w": 12,
"x": 0,
- "y": 54
+ "y": 66
},
"hiddenSeries": false,
"id": 27,
@@ -1208,7 +1315,7 @@
"h": 12,
"w": 12,
"x": 12,
- "y": 54
+ "y": 66
},
"hiddenSeries": false,
"id": 29,
@@ -1296,7 +1403,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 66
+ "y": 78
},
"id": 5,
"panels": [],
@@ -1321,7 +1428,7 @@
"h": 11,
"w": 12,
"x": 0,
- "y": 67
+ "y": 79
},
"hiddenSeries": false,
"id": 2,
@@ -1441,7 +1548,7 @@
"h": 11,
"w": 12,
"x": 12,
- "y": 67
+ "y": 79
},
"hiddenSeries": false,
"id": 3,
@@ -1580,7 +1687,7 @@
"h": 12,
"w": 24,
"x": 0,
- "y": 78
+ "y": 90
},
"id": 50,
"options": {
@@ -1621,7 +1728,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 90
+ "y": 102
},
"id": 25,
"panels": [],
@@ -1644,7 +1751,7 @@
"h": 14,
"w": 12,
"x": 0,
- "y": 91
+ "y": 103
},
"hiddenSeries": false,
"id": 21,
@@ -1746,7 +1853,7 @@
"h": 14,
"w": 12,
"x": 12,
- "y": 91
+ "y": 103
},
"hiddenSeries": false,
"id": 23,
@@ -1839,7 +1946,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 105
+ "y": 117
},
"id": 19,
"panels": [],
@@ -1862,7 +1969,7 @@
"h": 13,
"w": 12,
"x": 0,
- "y": 106
+ "y": 118
},
"hiddenSeries": false,
"id": 17,
@@ -1960,7 +2067,7 @@
"h": 13,
"w": 12,
"x": 12,
- "y": 106
+ "y": 118
},
"hiddenSeries": false,
"id": 15,
@@ -2057,7 +2164,7 @@
"h": 12,
"w": 12,
"x": 0,
- "y": 119
+ "y": 131
},
"hiddenSeries": false,
"id": 11,
@@ -2155,7 +2262,7 @@
"h": 12,
"w": 12,
"x": 12,
- "y": 119
+ "y": 131
},
"hiddenSeries": false,
"id": 13,
From 1c1c7c83f2461bc9691cf2a6037383838684a5ad Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 14:51:22 +0200
Subject: [PATCH 10/11] PR feedback
---
helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 2 +-
.../datasources/plugins/openstack/nova/nova_types.go | 4 +++-
tools/plutono/provisioning/dashboards/cortex-status.json | 2 +-
3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index e702bc508..a92881603 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -594,7 +594,7 @@ groups:
investigated if this alert persists.
- alert: CortexNovaDoesntFindValidKVMHosts
- expr: cortex_vm_faults{hvtype=~"CH|QEMU",faultmessage=~".+No valid host was found.+"} > 0
+ expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".+No valid host was found.+"}) > 0
labels:
context: scheduling
dashboard: cortex/cortex
diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
index 70e4fb02e..1be2b7a29 100644
--- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
+++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
@@ -358,7 +358,9 @@ const (
// extra specs.
func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) {
var extraSpecs map[string]string
- if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
+ if f.ExtraSpecs == "" {
+ extraSpecs = map[string]string{}
+ } else if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
return "", err // Return an error if the extra specs cannot be parsed.
}
hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"]
diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json
index d05157d8d..37f4b2479 100644
--- a/tools/plutono/provisioning/dashboards/cortex-status.json
+++ b/tools/plutono/provisioning/dashboards/cortex-status.json
@@ -606,7 +606,7 @@
"targets": [
{
"exemplar": false,
- "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultcode!=\"0\"})",
+ "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultyvm!=\"no\"})",
"format": "time_series",
"instant": false,
"interval": "",
From c1a2a7319f738d368b659df5e145421b4c24d909 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Mon, 30 Mar 2026 15:30:06 +0200
Subject: [PATCH 11/11] PR feedback
---
helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index a92881603..e3271f119 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -594,7 +594,8 @@ groups:
investigated if this alert persists.
- alert: CortexNovaDoesntFindValidKVMHosts
- expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".+No valid host was found.+"}) > 0
+ expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
+ for: 5m
labels:
context: scheduling
dashboard: cortex/cortex