From f4acf63054eb7b45bb6dc904729946fa8952f04b Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 11:46:55 +0200
Subject: [PATCH 01/11] Add vm state kpi

---
 .../plugins/openstack/nova/nova_types.go      |  46 ++++
 .../kpis/plugins/compute/vm_state.go          | 119 +++++++++
 .../kpis/plugins/compute/vm_state_test.go     | 240 ++++++++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 internal/knowledge/kpis/plugins/compute/vm_state.go
 create mode 100644 internal/knowledge/kpis/plugins/compute/vm_state_test.go

diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
index 322b05d69..2633c76fc 100644
--- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
+++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
@@ -285,6 +285,52 @@ type Flavor struct {
 	ExtraSpecs string `json:"extra_specs" db:"extra_specs"`
 }
 
+// FlavorHypervisorType is a defined string type that enumerates the values
+// the hypervisor type contained in flavor extra specs may have.
+type FlavorHypervisorType string
+
+const (
+	// FlavorHypervisorTypeQEMU maps flavors to QEMU/KVM hypervisors.
+	FlavorHypervisorTypeQEMU FlavorHypervisorType = "QEMU"
+	// FlavorHypervisorTypeCH maps flavors to Cloud-Hypervisor/KVM hypervisors.
+	FlavorHypervisorTypeCH FlavorHypervisorType = "CH"
+	// FlavorHypervisorTypeVMware maps flavors to VMware hypervisors.
+	FlavorHypervisorTypeVMware FlavorHypervisorType = "VMware vCenter Server"
+	// FlavorHypervisorTypeIronic maps flavors to Ironic baremetal instances.
+	FlavorHypervisorTypeIronic FlavorHypervisorType = "Ironic"
+	// FlavorHypervisorTypeOther marks a flavor whose hypervisor type is set
+	// in the extra specs but matches none of the known values above.
+	FlavorHypervisorTypeOther FlavorHypervisorType = "Other"
+	// FlavorHypervisorTypeUnspecified marks a flavor whose extra specs do
+	// not set a hypervisor type at all.
+	FlavorHypervisorTypeUnspecified FlavorHypervisorType = "Unspecified"
+)
+
+// GetHypervisorType returns the hypervisor type named by the
+// "capabilities:hypervisor_type" key of the flavor's JSON extra specs.
+// Empty extra specs or a missing key yield FlavorHypervisorTypeUnspecified,
+// an unknown value FlavorHypervisorTypeOther, unparsable specs an error.
+func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) {
+	// A zero-value flavor carries no extra specs; that is not a parse error.
+	if f.ExtraSpecs == "" {
+		return FlavorHypervisorTypeUnspecified, nil
+	}
+	var extraSpecs map[string]string
+	if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
+		return "", err
+	}
+	value, ok := extraSpecs["capabilities:hypervisor_type"]
+	if !ok {
+		return FlavorHypervisorTypeUnspecified, nil
+	}
+	switch t := FlavorHypervisorType(value); t {
+	case FlavorHypervisorTypeQEMU, FlavorHypervisorTypeCH,
+		FlavorHypervisorTypeVMware, FlavorHypervisorTypeIronic:
+		return t, nil
+	}
+	return FlavorHypervisorTypeOther, nil
+}
+
 // Custom unmarshaler for OpenStackFlavor to handle nested JSON.
 func (f *Flavor) UnmarshalJSON(data []byte) error {
 	type Alias Flavor
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_state.go
new file mode 100644
index 000000000..53e3d2fc4
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/compute/vm_state.go
@@ -0,0 +1,119 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package compute
+
+import (
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
+	"github.com/cobaltcore-dev/cortex/pkg/conf"
+	"github.com/prometheus/client_golang/prometheus"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+var vmStateKPIlogger = ctrl.Log.WithName("vm-state-kpi")
+
+// This kpi monitors the current state of vms, i.e. how many vms are running,
+// stopped, paused, etc. It also exposes additional labels such as the vm's
+// hypervisor type which can be used to define alerts on non-running vms.
+type VMStateKPI struct {
+	// Common base for all KPIs that provides standard functionality.
+	plugins.BaseKPI[struct{}] // No options passed through yaml config
+
+	// Current state of the VM, e.g. running, stopped, paused, etc.
+	vmStateDesc *prometheus.Desc
+}
+
+// GetName returns a unique name for this kpi that is used for registration
+// and configuration.
+func (VMStateKPI) GetName() string { return "vm_state_kpi" }
+
+// Init initializes the kpi, e.g. by creating the necessary Prometheus
+// descriptors. The base kpi is also initialized with the provided database,
+// client and options.
+func (k *VMStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
+	if err := k.BaseKPI.Init(db, client, opts); err != nil {
+		return err
+	}
+	k.vmStateDesc = prometheus.NewDesc("cortex_vm_state",
+		"Current state of the VM, e.g. running, stopped, paused, etc.",
+		[]string{"az", "hvtype", "state"}, nil,
+	)
+	return nil
+}
+
+// Describe sends the descriptor of this kpi to the provided channel. This is
+// used by Prometheus to know which metrics this kpi exposes.
+func (k *VMStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmStateDesc }
+
+// Collect collects the current state of vms from the database and sends it as
+// Prometheus metrics to the provided channel.
+func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
+	vmStateKPIlogger.Info("collecting vm state kpi")
+
+	// This can happen when no datasource is provided that connects to a database.
+	if k.DB == nil {
+		vmStateKPIlogger.Error(nil, "no database connection, cannot collect vm state kpi")
+		return
+	}
+
+	// Get all vms with their current state from the database.
+	var servers []nova.Server
+	nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName())
+	if err != nil {
+		vmStateKPIlogger.Error(err, "failed to query servers from database")
+		return
+	}
+	vmStateKPIlogger.Info("queried servers from database", "nServers", nServers)
+
+	// Get all flavors from the database to map them to the vms.
+	var flavors []nova.Flavor
+	nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName())
+	if err != nil {
+		vmStateKPIlogger.Error(err, "failed to query flavors from database")
+		return
+	}
+	vmStateKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors)
+
+	flavorsByName := make(map[string]nova.Flavor, len(flavors))
+	for _, flavor := range flavors {
+		flavorsByName[flavor.Name] = flavor
+	}
+
+	type labels struct {
+		az     string
+		hvtype string
+		state  string
+	}
+	counts := make(map[labels]float64)
+
+	// For each vm, get its hypervisor type and count up.
+	for _, server := range servers {
+		flavor, ok := flavorsByName[server.FlavorName]
+		if !ok {
+			vmStateKPIlogger.Error(nil, "flavor not found for server", "server",
+				server.ID, "flavor", server.FlavorName)
+			continue
+		}
+		hypervisorType, err := flavor.GetHypervisorType()
+		if err != nil {
+			vmStateKPIlogger.Error(err, "failed to get hypervisor type for server",
+				"server", server.ID, "flavor", flavor.Name)
+			continue
+		}
+		key := labels{
+			az:     server.OSEXTAvailabilityZone,
+			hvtype: string(hypervisorType),
+			state:  server.Status,
+		}
+		counts[key]++
+	}
+
+	// Emit metrics to prometheus.
+	for key, count := range counts {
+		ch <- prometheus.MustNewConstMetric(k.vmStateDesc, prometheus.GaugeValue, count,
+			key.az, key.hvtype, key.state)
+	}
+}
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state_test.go b/internal/knowledge/kpis/plugins/compute/vm_state_test.go
new file mode 100644
index 000000000..57196e1de
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/compute/vm_state_test.go
@@ -0,0 +1,240 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package compute
+
+import (
+	"testing"
+
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+	testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+	"github.com/cobaltcore-dev/cortex/pkg/conf"
+	"github.com/prometheus/client_golang/prometheus"
+	prometheusgo "github.com/prometheus/client_model/go"
+)
+
+func TestVMStateKPI_Init(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+	kpi := &VMStateKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+}
+
+func TestVMStateKPI_Collect(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+	if err := testDB.CreateTable(
+		testDB.AddTable(nova.Server{}),
+		testDB.AddTable(nova.Flavor{}),
+	); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	mockData := []any{
+		// Servers in different AZs, states, and with different flavors
+		&nova.Server{
+			ID:                    "server-1",
+			FlavorName:            "m1.small",
+			OSEXTAvailabilityZone: "az1",
+			Status:                "ACTIVE",
+		},
+		&nova.Server{
+			ID:                    "server-2",
+			FlavorName:            "m1.small",
+			OSEXTAvailabilityZone: "az1",
+			Status:                "ACTIVE",
+		},
+		&nova.Server{
+			ID:                    "server-3",
+			FlavorName:            "m1.small",
+			OSEXTAvailabilityZone: "az1",
+			Status:                "STOPPED",
+		},
+		&nova.Server{
+			ID:                    "server-4",
+			FlavorName:            "m1.vmware",
+			OSEXTAvailabilityZone: "az2",
+			Status:                "ACTIVE",
+		},
+		&nova.Server{
+			ID:                    "server-5",
+			FlavorName:            "m1.generic",
+			OSEXTAvailabilityZone: "az1",
+			Status:                "PAUSED",
+		},
+		// Flavors with different hypervisor types
+		&nova.Flavor{
+			ID:         "flavor-1",
+			Name:       "m1.small",
+			ExtraSpecs: `{"capabilities:hypervisor_type": "QEMU"}`,
+		},
+		&nova.Flavor{
+			ID:         "flavor-2",
+			Name:       "m1.vmware",
+			ExtraSpecs: `{"capabilities:hypervisor_type": "VMware vCenter Server"}`,
+		},
+		&nova.Flavor{
+			ID:         "flavor-3",
+			Name:       "m1.generic",
+			ExtraSpecs: `{}`,
+		},
+	}
+
+	if err := testDB.Insert(mockData...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	kpi := &VMStateKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch)
+	close(ch)
+
+	type vmStateMetric struct {
+		az     string
+		hvtype string
+		state  string
+		count  float64
+	}
+
+	metrics := make(map[string]vmStateMetric)
+	for metric := range ch {
+		var m prometheusgo.Metric
+		if err := metric.Write(&m); err != nil {
+			t.Fatalf("failed to write metric: %v", err)
+		}
+		labels := make(map[string]string)
+		for _, label := range m.Label {
+			labels[label.GetName()] = label.GetValue()
+		}
+		key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"]
+		metrics[key] = vmStateMetric{
+			az:     labels["az"],
+			hvtype: labels["hvtype"],
+			state:  labels["state"],
+			count:  m.GetGauge().GetValue(),
+		}
+	}
+
+	expectedMetrics := map[string]vmStateMetric{
+		"az1|QEMU|ACTIVE": {
+			az:     "az1",
+			hvtype: "QEMU",
+			state:  "ACTIVE",
+			count:  2,
+		},
+		"az1|QEMU|STOPPED": {
+			az:     "az1",
+			hvtype: "QEMU",
+			state:  "STOPPED",
+			count:  1,
+		},
+		"az2|VMware vCenter Server|ACTIVE": {
+			az:     "az2",
+			hvtype: "VMware vCenter Server",
+			state:  "ACTIVE",
+			count:  1,
+		},
+		"az1|Unspecified|PAUSED": {
+			az:     "az1",
+			hvtype: "Unspecified",
+			state:  "PAUSED",
+			count:  1,
+		},
+	}
+
+	if len(expectedMetrics) != len(metrics) {
+		t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics))
+	}
+
+	for key, expected := range expectedMetrics {
+		actual, ok := metrics[key]
+		if !ok {
+			t.Errorf("expected metric %q not found", key)
+			continue
+		}
+		if expected != actual {
+			t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
+		}
+	}
+}
+
+func TestVMStateKPI_Collect_MissingFlavor(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+	if err := testDB.CreateTable(
+		testDB.AddTable(nova.Server{}),
+		testDB.AddTable(nova.Flavor{}),
+	); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	mockData := []any{
+		&nova.Server{
+			ID:                    "server-1",
+			FlavorName:            "m1.existing",
+			OSEXTAvailabilityZone: "az1",
+			Status:                "ACTIVE",
+		},
+		&nova.Server{
+			ID:                    "server-2",
+			FlavorName:            "m1.missing",
+			OSEXTAvailabilityZone: "az1",
+			Status:                "ACTIVE",
+		},
+		&nova.Flavor{
+			ID:         "flavor-1",
+			Name:       "m1.existing",
+			ExtraSpecs: `{}`,
+		},
+	}
+
+	if err := testDB.Insert(mockData...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	kpi := &VMStateKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch)
+	close(ch)
+
+	var count int
+	for range ch {
+		count++
+	}
+	if count != 1 {
+		t.Errorf("expected 1 metric (missing flavor should be skipped), got %d", count)
+	}
+}
+
+func TestVMStateKPI_Collect_NoDB(t *testing.T) {
+	kpi := &VMStateKPI{}
+	if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch) // Should not panic
+	close(ch)
+
+	var count int
+	for range ch {
+		count++
+	}
+	if count != 0 {
+		t.Errorf("expected 0 metrics when no DB, got %d", count)
+	}
+}

From 4ba03bb75444fb1ef67eb56b4b3a525a87268dbb Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 13:16:49 +0200
Subject: [PATCH 02/11] Add kpi to nova bundle

---
 helm/bundles/cortex-nova/templates/kpis.yaml | 16 ++++++++++++++++
 internal/knowledge/kpis/supported_kpis.go    |  1 +
 2 files changed, 17 insertions(+)

diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index af01c10c5..62ff7f499 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -110,6 +110,22 @@ spec:
 ---
 apiVersion: cortex.cloud/v1alpha1
 kind: KPI
+metadata:
+  name: vm-state
+spec:
+  schedulingDomain: nova
+  impl: vm_state_kpi
+  dependencies:
+    datasources:
+      - name: nova-servers
+      - name: nova-flavors
+  description: |
+    This kpi monitors the current state of vms, i.e. how many vms are running,
+    stopped, paused, etc. It also exposes additional labels such as the vm's
+    hypervisor type which can be used to define alerts on non-running vms.
+---
+apiVersion: cortex.cloud/v1alpha1
+kind: KPI
 metadata:
   name: cortex-nova-datasource-state
 spec:
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index 274c5ace5..b469790cc 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -21,6 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{
 	"vm_migration_statistics_kpi":  &compute.VMMigrationStatisticsKPI{},
 	"vm_life_span_kpi":             &compute.VMLifeSpanKPI{},
 	"vm_commitments_kpi":           &compute.VMCommitmentsKPI{},
+	"vm_state_kpi":                 &compute.VMStateKPI{},
 
 	"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
 

From b613bdc7dbd692442945202c90a36901bad89304 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 13:17:00 +0200
Subject: [PATCH 03/11] Support syncing server faults from nova

---
 .../plugins/openstack/nova/nova_types.go      | 74 +++++++++++++++----
 .../compute/libvirt_domain_cpu_steal_pct.sql  |  2 +-
 .../libvirt_domain_cpu_steal_pct_test.go      |  2 +-
 .../plugins/compute/vm_host_residency.sql     |  2 +-
 .../plugins/compute/vm_life_span.sql          |  2 +-
 .../compute/vrops_hostsystem_resolver.sql     |  2 +-
 .../compute/vrops_project_noisiness.sql       |  2 +-
 .../reservations/commitments/controller.go    |  2 +-
 tools/visualize-reservations/main.go          |  5 +-
 9 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
index 2633c76fc..70e4fb02e 100644
--- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
+++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
@@ -108,9 +108,24 @@ type Server struct {
 	OSEXTSTSVmState                string  `json:"OS-EXT-STS:vm_state" db:"os_ext_sts_vm_state"`
 	OSEXTSTSPowerState             int     `json:"OS-EXT-STS:power_state" db:"os_ext_sts_power_state"`
 
-	// From nested JSON
+	// From nested server.flavor JSON
 	FlavorName string `json:"-" db:"flavor_name"`
 
+	// From nested server.fault JSON
+
+	// The error response code.
+	FaultCode *uint `json:"-" db:"fault_code"`
+	// The date and time when the exception was raised. The date and time stamp
+	// format is ISO 8601 (CCYY-MM-DDThh:mm:ss±hh:mm). For example,
+	// 2015-08-27T09:49:58-05:00. The ±hh:mm value if included, is the time zone
+	// as an offset from UTC. In the previous example, the offset value is -05:00.
+	FaultCreated *string `json:"-" db:"fault_created"`
+	// The error message.
+	FaultMessage *string `json:"-" db:"fault_message"`
+	// The stack trace. It is available if the response code is not 500 or you
+	// have the administrator privilege.
+	FaultDetails *string `json:"-" db:"fault_details"`
+
 	// Note: there are some more fields that are omitted. To include them again, add
 	// custom unmarshalers and marshalers for the struct below.
 }
@@ -119,7 +134,8 @@ type Server struct {
 func (s *Server) UnmarshalJSON(data []byte) error {
 	type Alias Server
 	aux := &struct {
-		Flavor json.RawMessage `json:"flavor"`
+		Flavor json.RawMessage  `json:"flavor"`
+		Fault  *json.RawMessage `json:"fault,omitempty"`
 		*Alias
 	}{
 		Alias: (*Alias)(s),
@@ -135,31 +151,63 @@ func (s *Server) UnmarshalJSON(data []byte) error {
 		return err
 	}
 	s.FlavorName = flavor.Name
+	var fault struct {
+		Code    uint    `json:"code"`
+		Created string  `json:"created"`
+		Message string  `json:"message"`
+		Details *string `json:"details,omitempty"`
+	}
+	if aux.Fault != nil {
+		if err := json.Unmarshal(*aux.Fault, &fault); err != nil {
+			return err
+		}
+		s.FaultCode = &fault.Code
+		s.FaultCreated = &fault.Created
+		s.FaultMessage = &fault.Message
+		s.FaultDetails = fault.Details
+	}
 	return nil
 }
 
 // Custom marshaler for OpenStackServer to handle nested JSON.
 func (s *Server) MarshalJSON() ([]byte, error) {
 	type Alias Server
+	type flavor struct {
+		// Starting in microversion 2.47, "id" was removed...
+		Name string `json:"original_name"`
+	}
+	flavorVal := flavor{
+		Name: s.FlavorName,
+	}
+	type fault struct {
+		Code    uint    `json:"code"`
+		Created string  `json:"created"`
+		Message string  `json:"message"`
+		Details *string `json:"details,omitempty"`
+	}
+	var faultVal *fault
+	if s.FaultCode != nil && s.FaultCreated != nil && s.FaultMessage != nil {
+		faultVal = &fault{
+			Code:    *s.FaultCode,
+			Created: *s.FaultCreated,
+			Message: *s.FaultMessage,
+			Details: s.FaultDetails,
+		}
+	}
 	aux := &struct {
-		Flavor struct {
-			// Starting in microversion 2.47, "id" was removed...
-			Name string `json:"original_name"`
-		} `json:"flavor"`
+		Flavor flavor `json:"flavor"`
+		Fault  *fault `json:"fault,omitempty"`
 		*Alias
 	}{
-		Alias: (*Alias)(s),
-		Flavor: struct {
-			Name string `json:"original_name"`
-		}{
-			Name: s.FlavorName,
-		},
+		Alias:  (*Alias)(s),
+		Flavor: flavorVal,
+		Fault:  faultVal,
 	}
 	return json.Marshal(aux)
 }
 
 // Table in which the openstack model is stored.
-func (Server) TableName() string { return "openstack_servers" }
+func (Server) TableName() string { return "openstack_servers_v2" }
 
 // Index for the openstack model.
 func (Server) Indexes() map[string][]string { return nil }
diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql
index ea2b9c97a..ab3c7b8a7 100644
--- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql
+++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql
@@ -3,6 +3,6 @@ SELECT
     os.os_ext_srv_attr_host AS host,
     MAX(value) AS max_steal_time_pct
 FROM kvm_libvirt_domain_metrics kvm
-JOIN openstack_servers os ON os.os_ext_srv_attr_instance_name = kvm.domain
+JOIN openstack_servers_v2 os ON os.os_ext_srv_attr_instance_name = kvm.domain
 WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL
 GROUP BY os.os_ext_srv_attr_host, os.id;
\ No newline at end of file
diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go
index b9f84b188..bc28218b5 100644
--- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go
+++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go
@@ -56,7 +56,7 @@ func TestLibvirtDomainCPUStealPctExtractor_Extract(t *testing.T) {
 		t.Fatalf("expected no error, got %v", err)
 	}
 
-	// Insert mock data into the openstack_servers table
+	// Insert mock data into the openstack servers table
 	servers := []any{
 		&nova.Server{
 			ID:                       "uuid-1",
diff --git a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql
index fff0086c4..c2b4b8846 100644
--- a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql
+++ b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql
@@ -21,7 +21,7 @@ WITH durations AS (
             )) AS BIGINT)
         ) AS duration
     FROM openstack_migrations AS migrations
-    LEFT JOIN openstack_servers AS servers ON servers.id = migrations.instance_uuid
+    LEFT JOIN openstack_servers_v2 AS servers ON servers.id = migrations.instance_uuid
     LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name
 )
 SELECT
diff --git a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql
index daaa0a470..1fad31536 100644
--- a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql
+++ b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql
@@ -13,7 +13,7 @@ running_servers AS (
         EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration,
         COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name,
         false::BOOLEAN AS deleted
-    FROM openstack_servers servers
+    FROM openstack_servers_v2 servers
     LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name
     WHERE servers.created IS NOT NULL
 )
diff --git a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql
index e2c6ad4b2..8ab0a2c70 100644
--- a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql
+++ b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql
@@ -3,5 +3,5 @@ SELECT DISTINCT
     m.hostsystem AS vrops_hostsystem,
     s.os_ext_srv_attr_host AS nova_compute_host
 FROM vrops_vm_metrics m
-LEFT JOIN openstack_servers s ON m.instance_uuid = s.id
+LEFT JOIN openstack_servers_v2 s ON m.instance_uuid = s.id
 WHERE s.os_ext_srv_attr_host IS NOT NULL;
diff --git a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql
index 334668b22..0b0067790 100644
--- a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql
+++ b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql
@@ -19,7 +19,7 @@ host_cpu_usage AS (
         s.tenant_id,
         h.service_host,
         AVG(p.avg_cpu) AS avg_cpu_of_project
-    FROM openstack_servers s
+    FROM openstack_servers_v2 s
     JOIN vrops_vm_metrics m ON s.id = m.instance_uuid
     JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id
     JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname
diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go
index 9c238aeee..d38c6e1d8 100644
--- a/internal/scheduling/reservations/commitments/controller.go
+++ b/internal/scheduling/reservations/commitments/controller.go
@@ -445,7 +445,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con
 	// Query servers from the database cache.
 	var servers []nova.Server
 	_, err := r.DB.Select(&servers,
-		"SELECT * FROM openstack_servers WHERE tenant_id = $1",
+		"SELECT * FROM "+nova.Server{}.TableName()+" WHERE tenant_id = $1",
 		projectID)
 	if err != nil {
 		return nil, fmt.Errorf("failed to query servers from database: %w", err)
diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go
index 9b5880be5..90824fb6e 100644
--- a/tools/visualize-reservations/main.go
+++ b/tools/visualize-reservations/main.go
@@ -52,6 +52,7 @@ import (
 	"time"
 
 	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
 	hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
 	_ "github.com/lib/pq"
 	corev1 "k8s.io/api/core/v1"
@@ -1761,9 +1762,9 @@ func connectToPostgres(
 
 	// Query servers with host information
 	serverMap = make(map[string]serverInfo)
-	rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM openstack_servers")
+	rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName())
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "Warning: Could not query openstack_servers: %v\n", err)
+		fmt.Fprintf(os.Stderr, "Warning: Could not query %s: %v\n", nova.Server{}.TableName(), err)
 	} else {
 		defer rows.Close()
 		for rows.Next() {

From aeb3fb30eba86812270c852b1a85161ee138c7bd Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 13:35:31 +0200
Subject: [PATCH 04/11] Rename vm_state_kpi -> vm_faults_kpi

---
 helm/bundles/cortex-nova/templates/kpis.yaml  |  11 +-
 .../kpis/plugins/compute/vm_state.go          |  83 +++---
 .../kpis/plugins/compute/vm_state_test.go     | 240 ------------------
 internal/knowledge/kpis/supported_kpis.go     |   2 +-
 4 files changed, 58 insertions(+), 278 deletions(-)
 delete mode 100644 internal/knowledge/kpis/plugins/compute/vm_state_test.go

diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml
index 62ff7f499..bc5666926 100644
--- a/helm/bundles/cortex-nova/templates/kpis.yaml
+++ b/helm/bundles/cortex-nova/templates/kpis.yaml
@@ -111,18 +111,19 @@ spec:
 apiVersion: cortex.cloud/v1alpha1
 kind: KPI
 metadata:
-  name: vm-state
+  name: vm-faults
 spec:
   schedulingDomain: nova
-  impl: vm_state_kpi
+  impl: vm_faults_kpi
   dependencies:
     datasources:
       - name: nova-servers
       - name: nova-flavors
   description: |
-    This kpi monitors the current state of vms, i.e. how many vms are running,
-    stopped, paused, etc. It also exposes additional labels such as the vm's
-    hypervisor type which can be used to define alerts on non-running vms.
+    This kpi tracks vm faults in the datacenter. It exposes helpful information
+    about the faults, such as the availability zone, hypervisor type, vm state,
+    and error info if available. This can be used to identify issues in the
+    datacenter and to monitor the overall health of the vms.
 ---
 apiVersion: cortex.cloud/v1alpha1
 kind: KPI
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_state.go
index 53e3d2fc4..30a0832b3 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_state.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_state.go
@@ -4,6 +4,9 @@
 package compute
 
 import (
+	"errors"
+	"strconv"
+
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
@@ -13,49 +16,50 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-var vmStateKPIlogger = ctrl.Log.WithName("vm-state-kpi")
+var vmFaultsKPIlogger = ctrl.Log.WithName("vm-faults-kpi")
 
-// This kpi monitors the current state of vms, i.e. how many vms are running,
-// stopped, paused, etc. It also exposes additional labels such as the vm's
-// hypervisor type which can be used to define alerts on non-running vms.
-type VMStateKPI struct {
-	// Common base for all KPIs that provides standard functionality.
-	plugins.BaseKPI[struct{}] // No options passed through yaml config
+// This kpi tracks vm faults in the datacenter. It exposes helpful information
+// about the faults, such as the availability zone, hypervisor type, vm state,
+// and error info if available. This can be used to identify issues in the
+// datacenter and to monitor the overall health of the vms.
+type VMFaultsKPI struct {
+	plugins.BaseKPI[struct{} /* No opts */]
 
-	// Current state of the VM, e.g. running, stopped, paused, etc.
-	vmStateDesc *prometheus.Desc
+	// vmFaultsDesc describes the prometheus metric for vm faults.
+	vmFaultsDesc *prometheus.Desc
 }
 
 // GetName returns a unique name for this kpi that is used for registration
 // and configuration.
-func (VMStateKPI) GetName() string { return "vm_state_kpi" }
+func (VMFaultsKPI) GetName() string { return "vm_faults_kpi" }
 
 // Init initializes the kpi, e.g. by creating the necessary Prometheus
 // descriptors. The base kpi is also initialized with the provided database,
 // client and options.
-func (k *VMStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
+func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
 	if err := k.BaseKPI.Init(db, client, opts); err != nil {
 		return err
 	}
-	k.vmStateDesc = prometheus.NewDesc("cortex_vm_state",
-		"Current state of the VM, e.g. running, stopped, paused, etc.",
-		[]string{"az", "hvtype", "state"}, nil,
+	k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults",
+		"Number of vm faults in the datacenter",
+		[]string{"az", "hvtype", "state", "faultcode", "faultmessage"}, nil,
 	)
 	return nil
 }
 
 // Describe sends the descriptor of this kpi to the provided channel. This is
 // used by Prometheus to know which metrics this kpi exposes.
-func (k *VMStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmStateDesc }
+func (k *VMFaultsKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmFaultsDesc }
 
 // Collect collects the current state of vms from the database and sends it as
 // Prometheus metrics to the provided channel.
-func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
-	vmStateKPIlogger.Info("collecting vm state kpi")
+func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
+	vmFaultsKPIlogger.Info("collecting metrics")
 
 	// This can happen when no datasource is provided that connects to a database.
 	if k.DB == nil {
-		vmStateKPIlogger.Error(nil, "no database connection, cannot collect vm state kpi")
+		err := errors.New("no database connection")
+		vmFaultsKPIlogger.Error(err, "cannot collect metric")
 		return
 	}
 
@@ -63,19 +67,19 @@ func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
 	var servers []nova.Server
 	nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName())
 	if err != nil {
-		vmStateKPIlogger.Error(err, "failed to query servers from database")
+		vmFaultsKPIlogger.Error(err, "failed to query servers from database")
 		return
 	}
-	vmStateKPIlogger.Info("queried servers from database", "nServers", nServers)
+	vmFaultsKPIlogger.Info("queried servers from database", "nServers", nServers)
 
 	// Get all flavors from the database to map them to the vms.
 	var flavors []nova.Flavor
 	nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName())
 	if err != nil {
-		vmStateKPIlogger.Error(err, "failed to query flavors from database")
+		vmFaultsKPIlogger.Error(err, "failed to query flavors from database")
 		return
 	}
-	vmStateKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors)
+	vmFaultsKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors)
 
 	flavorsByName := make(map[string]nova.Flavor, len(flavors))
 	for _, flavor := range flavors {
@@ -83,37 +87,52 @@ func (k *VMStateKPI) Collect(ch chan<- prometheus.Metric) {
 	}
 
 	type labels struct {
-		az     string
-		hvtype string
-		state  string
+		az         string
+		hvtype     string
+		state      string
+		errcode    string
+		errmessage string
 	}
 	counts := make(map[labels]float64)
 
 	// For each vm, get its hypervisor type and count up.
+	// Note: this will also expose vms that are NOT in an error state,
+	// but this can be useful to compare it to the number of faulty vms.
 	for _, server := range servers {
 		flavor, ok := flavorsByName[server.FlavorName]
 		if !ok {
-			vmStateKPIlogger.Error(nil, "flavor not found for server", "server",
+			vmFaultsKPIlogger.Info("warning: flavor not found for server", "server",
 				server.ID, "flavor", server.FlavorName)
 			continue
 		}
 		hypervisorType, err := flavor.GetHypervisorType()
 		if err != nil {
-			vmStateKPIlogger.Error(err, "failed to get hypervisor type for server",
+			vmFaultsKPIlogger.Error(err, "failed to get hypervisor type for server",
 				"server", server.ID, "flavor", flavor.Name)
 			continue
 		}
+		var errcode uint = 0
+		if server.FaultCode != nil {
+			errcode = *server.FaultCode
+		}
+		errmsg := "n/a"
+		if server.FaultMessage != nil {
+			errmsg = *server.FaultMessage
+		}
 		key := labels{
-			az:     server.OSEXTAvailabilityZone,
-			hvtype: string(hypervisorType),
-			state:  server.Status,
+			az:         server.OSEXTAvailabilityZone,
+			hvtype:     string(hypervisorType),
+			state:      server.Status,
+			errcode:    strconv.FormatUint(uint64(errcode), 10),
+			errmessage: errmsg,
 		}
 		counts[key]++
 	}
 
 	// Emit metrics to prometheus.
 	for key, count := range counts {
-		ch <- prometheus.MustNewConstMetric(k.vmStateDesc, prometheus.GaugeValue, count,
-			key.az, key.hvtype, key.state)
+		ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count,
+			key.az, key.hvtype, key.state, key.errcode, key.errmessage)
 	}
+	vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts))
 }
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state_test.go b/internal/knowledge/kpis/plugins/compute/vm_state_test.go
deleted file mode 100644
index 57196e1de..000000000
--- a/internal/knowledge/kpis/plugins/compute/vm_state_test.go
+++ /dev/null
@@ -1,240 +0,0 @@
-// Copyright SAP SE
-// SPDX-License-Identifier: Apache-2.0
-
-package compute
-
-import (
-	"testing"
-
-	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
-	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
-	testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
-	"github.com/cobaltcore-dev/cortex/pkg/conf"
-	"github.com/prometheus/client_golang/prometheus"
-	prometheusgo "github.com/prometheus/client_model/go"
-)
-
-func TestVMStateKPI_Init(t *testing.T) {
-	dbEnv := testlibDB.SetupDBEnv(t)
-	testDB := db.DB{DbMap: dbEnv.DbMap}
-	defer dbEnv.Close()
-	kpi := &VMStateKPI{}
-	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-}
-
-func TestVMStateKPI_Collect(t *testing.T) {
-	dbEnv := testlibDB.SetupDBEnv(t)
-	testDB := db.DB{DbMap: dbEnv.DbMap}
-	defer dbEnv.Close()
-	if err := testDB.CreateTable(
-		testDB.AddTable(nova.Server{}),
-		testDB.AddTable(nova.Flavor{}),
-	); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	mockData := []any{
-		// Servers in different AZs, states, and with different flavors
-		&nova.Server{
-			ID:                    "server-1",
-			FlavorName:            "m1.small",
-			OSEXTAvailabilityZone: "az1",
-			Status:                "ACTIVE",
-		},
-		&nova.Server{
-			ID:                    "server-2",
-			FlavorName:            "m1.small",
-			OSEXTAvailabilityZone: "az1",
-			Status:                "ACTIVE",
-		},
-		&nova.Server{
-			ID:                    "server-3",
-			FlavorName:            "m1.small",
-			OSEXTAvailabilityZone: "az1",
-			Status:                "STOPPED",
-		},
-		&nova.Server{
-			ID:                    "server-4",
-			FlavorName:            "m1.vmware",
-			OSEXTAvailabilityZone: "az2",
-			Status:                "ACTIVE",
-		},
-		&nova.Server{
-			ID:                    "server-5",
-			FlavorName:            "m1.generic",
-			OSEXTAvailabilityZone: "az1",
-			Status:                "PAUSED",
-		},
-		// Flavors with different hypervisor types
-		&nova.Flavor{
-			ID:         "flavor-1",
-			Name:       "m1.small",
-			ExtraSpecs: `{"capabilities:hypervisor_type": "QEMU"}`,
-		},
-		&nova.Flavor{
-			ID:         "flavor-2",
-			Name:       "m1.vmware",
-			ExtraSpecs: `{"capabilities:hypervisor_type": "VMware vCenter Server"}`,
-		},
-		&nova.Flavor{
-			ID:         "flavor-3",
-			Name:       "m1.generic",
-			ExtraSpecs: `{}`,
-		},
-	}
-
-	if err := testDB.Insert(mockData...); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	kpi := &VMStateKPI{}
-	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	ch := make(chan prometheus.Metric, 100)
-	kpi.Collect(ch)
-	close(ch)
-
-	type vmStateMetric struct {
-		az     string
-		hvtype string
-		state  string
-		count  float64
-	}
-
-	metrics := make(map[string]vmStateMetric)
-	for metric := range ch {
-		var m prometheusgo.Metric
-		if err := metric.Write(&m); err != nil {
-			t.Fatalf("failed to write metric: %v", err)
-		}
-		labels := make(map[string]string)
-		for _, label := range m.Label {
-			labels[label.GetName()] = label.GetValue()
-		}
-		key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"]
-		metrics[key] = vmStateMetric{
-			az:     labels["az"],
-			hvtype: labels["hvtype"],
-			state:  labels["state"],
-			count:  m.GetGauge().GetValue(),
-		}
-	}
-
-	expectedMetrics := map[string]vmStateMetric{
-		"az1|QEMU|ACTIVE": {
-			az:     "az1",
-			hvtype: "QEMU",
-			state:  "ACTIVE",
-			count:  2,
-		},
-		"az1|QEMU|STOPPED": {
-			az:     "az1",
-			hvtype: "QEMU",
-			state:  "STOPPED",
-			count:  1,
-		},
-		"az2|VMware vCenter Server|ACTIVE": {
-			az:     "az2",
-			hvtype: "VMware vCenter Server",
-			state:  "ACTIVE",
-			count:  1,
-		},
-		"az1|Unspecified|PAUSED": {
-			az:     "az1",
-			hvtype: "Unspecified",
-			state:  "PAUSED",
-			count:  1,
-		},
-	}
-
-	if len(expectedMetrics) != len(metrics) {
-		t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics))
-	}
-
-	for key, expected := range expectedMetrics {
-		actual, ok := metrics[key]
-		if !ok {
-			t.Errorf("expected metric %q not found", key)
-			continue
-		}
-		if expected != actual {
-			t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
-		}
-	}
-}
-
-func TestVMStateKPI_Collect_MissingFlavor(t *testing.T) {
-	dbEnv := testlibDB.SetupDBEnv(t)
-	testDB := db.DB{DbMap: dbEnv.DbMap}
-	defer dbEnv.Close()
-	if err := testDB.CreateTable(
-		testDB.AddTable(nova.Server{}),
-		testDB.AddTable(nova.Flavor{}),
-	); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	mockData := []any{
-		&nova.Server{
-			ID:                    "server-1",
-			FlavorName:            "m1.existing",
-			OSEXTAvailabilityZone: "az1",
-			Status:                "ACTIVE",
-		},
-		&nova.Server{
-			ID:                    "server-2",
-			FlavorName:            "m1.missing",
-			OSEXTAvailabilityZone: "az1",
-			Status:                "ACTIVE",
-		},
-		&nova.Flavor{
-			ID:         "flavor-1",
-			Name:       "m1.existing",
-			ExtraSpecs: `{}`,
-		},
-	}
-
-	if err := testDB.Insert(mockData...); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	kpi := &VMStateKPI{}
-	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	ch := make(chan prometheus.Metric, 100)
-	kpi.Collect(ch)
-	close(ch)
-
-	var count int
-	for range ch {
-		count++
-	}
-	if count != 1 {
-		t.Errorf("expected 1 metric (missing flavor should be skipped), got %d", count)
-	}
-}
-
-func TestVMStateKPI_Collect_NoDB(t *testing.T) {
-	kpi := &VMStateKPI{}
-	if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
-		t.Fatalf("expected no error, got %v", err)
-	}
-
-	ch := make(chan prometheus.Metric, 100)
-	kpi.Collect(ch) // Should not panic
-	close(ch)
-
-	var count int
-	for range ch {
-		count++
-	}
-	if count != 0 {
-		t.Errorf("expected 0 metrics when no DB, got %d", count)
-	}
-}
diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go
index b469790cc..2623ff8bd 100644
--- a/internal/knowledge/kpis/supported_kpis.go
+++ b/internal/knowledge/kpis/supported_kpis.go
@@ -21,7 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{
 	"vm_migration_statistics_kpi":  &compute.VMMigrationStatisticsKPI{},
 	"vm_life_span_kpi":             &compute.VMLifeSpanKPI{},
 	"vm_commitments_kpi":           &compute.VMCommitmentsKPI{},
-	"vm_state_kpi":                 &compute.VMStateKPI{},
+	"vm_faults_kpi":                &compute.VMFaultsKPI{},
 
 	"netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{},
 

From 9e290d185dd844977099223dabdd0c183f57939b Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 13:47:59 +0200
Subject: [PATCH 05/11] Rename file, add faulty-vm label, and add alert

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 16 ++++++++++++++++
 .../compute/{vm_state.go => vm_faults.go}        | 12 ++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)
 rename internal/knowledge/kpis/plugins/compute/{vm_state.go => vm_faults.go} (91%)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index 2449fa390..c8fc74b8e 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -592,3 +592,19 @@ groups:
         corruption, bugs in reservation creation, or external modifications.
         Reservations are automatically repaired, but the root cause should be
         investigated if this alert persists.
+
+  - alert: CortexNovaDoesntFindValidHosts
+    expr: cortex_vm_faults{faultmessage=~".+No valid host was found.+"} > 0
+    labels:
+      context: scheduling
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Nova scheduling cannot find valid hosts"
+      description: >
+        Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling
+        failed to find a valid host. This may indicate capacity issues,
+        misconfigured filters, or resource constraints in the datacenter.
+        Investigate the affected VMs and hypervisor availability.
diff --git a/internal/knowledge/kpis/plugins/compute/vm_state.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go
similarity index 91%
rename from internal/knowledge/kpis/plugins/compute/vm_state.go
rename to internal/knowledge/kpis/plugins/compute/vm_faults.go
index 30a0832b3..7d69f709f 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_state.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go
@@ -42,7 +42,7 @@ func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) e
 	}
 	k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults",
 		"Number of vm faults in the datacenter",
-		[]string{"az", "hvtype", "state", "faultcode", "faultmessage"}, nil,
+		[]string{"az", "hvtype", "state", "fault-code", "fault-message", "faulty-vm"}, nil,
 	)
 	return nil
 }
@@ -92,6 +92,7 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
 		state      string
 		errcode    string
 		errmessage string
+		faultyVM   string
 	}
 	counts := make(map[labels]float64)
 
@@ -119,12 +120,19 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
 		if server.FaultMessage != nil {
 			errmsg = *server.FaultMessage
 		}
+		// Only provide the server ID for faulty VMs, to avoid cardinality
+		// explosion in the metric.
+		faultyVM := "no"
+		if server.FaultCode != nil || server.FaultMessage != nil {
+			faultyVM = server.ID
+		}
 		key := labels{
 			az:         server.OSEXTAvailabilityZone,
 			hvtype:     string(hypervisorType),
 			state:      server.Status,
 			errcode:    strconv.FormatUint(uint64(errcode), 10),
 			errmessage: errmsg,
+			faultyVM:   faultyVM,
 		}
 		counts[key]++
 	}
@@ -132,7 +140,7 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
 	// Emit metrics to prometheus.
 	for key, count := range counts {
 		ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count,
-			key.az, key.hvtype, key.state, key.errcode, key.errmessage)
+			key.az, key.hvtype, key.state, key.errcode, key.errmessage, key.faultyVM)
 	}
 	vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts))
 }

From cf2a9e564d3ea4094ae30c09ff808ba450b90e21 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 13:49:37 +0200
Subject: [PATCH 06/11] Fix linting issue

---
 tools/visualize-reservations/main.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go
index 90824fb6e..c99ff2eb1 100644
--- a/tools/visualize-reservations/main.go
+++ b/tools/visualize-reservations/main.go
@@ -1762,6 +1762,7 @@ func connectToPostgres(
 
 	// Query servers with host information
 	serverMap = make(map[string]serverInfo)
+	//nolint:gosec // This query is not using any user input, so it's not vulnerable to SQL injection
 	rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName())
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "Warning: Could not query "+nova.Server{}.TableName()+": %v\n", err)

From 50a38166b4ba1a306122c60384d5b223a16e9472 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 13:57:58 +0200
Subject: [PATCH 07/11] Unit tests

---
 .../kpis/plugins/compute/vm_faults_test.go    | 408 ++++++++++++++++++
 1 file changed, 408 insertions(+)
 create mode 100644 internal/knowledge/kpis/plugins/compute/vm_faults_test.go

diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
new file mode 100644
index 000000000..a5f63b42c
--- /dev/null
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
@@ -0,0 +1,408 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package compute
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
+	testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing"
+	"github.com/cobaltcore-dev/cortex/pkg/conf"
+	testlib "github.com/cobaltcore-dev/cortex/pkg/testing"
+	"github.com/prometheus/client_golang/prometheus"
+	prometheusgo "github.com/prometheus/client_model/go"
+)
+
+func TestVMFaultsKPI_GetName(t *testing.T) {
+	kpi := VMFaultsKPI{}
+	if kpi.GetName() != "vm_faults_kpi" {
+		t.Errorf("expected 'vm_faults_kpi', got %q", kpi.GetName())
+	}
+}
+
+func TestVMFaultsKPI_Init(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+
+	kpi := &VMFaultsKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+	if kpi.vmFaultsDesc == nil {
+		t.Error("vmFaultsDesc should be initialized")
+	}
+}
+
+func TestVMFaultsKPI_Describe(t *testing.T) {
+	kpi := &VMFaultsKPI{}
+	if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	ch := make(chan *prometheus.Desc, 1)
+	kpi.Describe(ch)
+	close(ch)
+
+	desc := <-ch
+	if desc == nil {
+		t.Error("expected descriptor to be sent to channel")
+	}
+}
+
+func TestVMFaultsKPI_Collect_NoDB(t *testing.T) {
+	kpi := &VMFaultsKPI{}
+	if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	// Collect should not panic when no database is provided
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch)
+	close(ch)
+
+	count := 0
+	for range ch {
+		count++
+	}
+	if count != 0 {
+		t.Errorf("expected 0 metrics when no DB, got %d", count)
+	}
+}
+
+func TestVMFaultsKPI_Collect(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+
+	if err := testDB.CreateTable(
+		testDB.AddTable(nova.Server{}),
+		testDB.AddTable(nova.Flavor{}),
+	); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	// Insert mock flavors with different hypervisor types
+	flavors := []any{
+		&nova.Flavor{
+			ID:         "flavor-qemu",
+			Name:       "qemu-small",
+			VCPUs:      2,
+			RAM:        4096,
+			ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`,
+		},
+		&nova.Flavor{
+			ID:         "flavor-vmware",
+			Name:       "vmware-medium",
+			VCPUs:      4,
+			RAM:        8192,
+			ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter Server"}`,
+		},
+		&nova.Flavor{
+			ID:         "flavor-unspecified",
+			Name:       "generic-large",
+			VCPUs:      8,
+			RAM:        16384,
+			ExtraSpecs: `{}`,
+		},
+	}
+	if err := testDB.Insert(flavors...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	// Insert mock servers
+	servers := []any{
+		// Normal server without fault
+		&nova.Server{
+			ID:                    "server-1",
+			Name:                  "normal-vm",
+			Status:                "ACTIVE",
+			FlavorName:            "qemu-small",
+			OSEXTAvailabilityZone: "az1",
+		},
+		// Server with fault code and message
+		&nova.Server{
+			ID:                    "server-2",
+			Name:                  "faulty-vm",
+			Status:                "ERROR",
+			FlavorName:            "qemu-small",
+			OSEXTAvailabilityZone: "az1",
+			FaultCode:             testlib.Ptr(uint(500)),
+			FaultMessage:          testlib.Ptr("Internal error"),
+		},
+		// Another faulty server in different AZ
+		&nova.Server{
+			ID:                    "server-3",
+			Name:                  "another-faulty",
+			Status:                "ERROR",
+			FlavorName:            "vmware-medium",
+			OSEXTAvailabilityZone: "az2",
+			FaultCode:             testlib.Ptr(uint(400)),
+			FaultMessage:          testlib.Ptr("Bad request"),
+		},
+		// Server with only fault message (no code)
+		&nova.Server{
+			ID:                    "server-4",
+			Name:                  "partial-fault",
+			Status:                "BUILD",
+			FlavorName:            "generic-large",
+			OSEXTAvailabilityZone: "az1",
+			FaultMessage:          testlib.Ptr("Some warning"),
+		},
+		// Server with flavor that doesn't exist (should be skipped)
+		&nova.Server{
+			ID:                    "server-5",
+			Name:                  "orphan-vm",
+			Status:                "ACTIVE",
+			FlavorName:            "nonexistent-flavor",
+			OSEXTAvailabilityZone: "az1",
+		},
+	}
+	if err := testDB.Insert(servers...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	kpi := &VMFaultsKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch)
+	close(ch)
+
+	type vmFaultsMetric struct {
+		az           string
+		hvtype       string
+		state        string
+		faultCode    string
+		faultMessage string
+		faultyVM     string
+		value        float64
+	}
+
+	metrics := make(map[string]vmFaultsMetric)
+	for metric := range ch {
+		var m prometheusgo.Metric
+		if err := metric.Write(&m); err != nil {
+			t.Fatalf("failed to write metric: %v", err)
+		}
+
+		labels := make(map[string]string)
+		for _, label := range m.Label {
+			labels[label.GetName()] = label.GetValue()
+		}
+
+		key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" +
+			labels["fault-code"] + "|" + labels["faulty-vm"]
+
+		metrics[key] = vmFaultsMetric{
+			az:           labels["az"],
+			hvtype:       labels["hvtype"],
+			state:        labels["state"],
+			faultCode:    labels["fault-code"],
+			faultMessage: labels["fault-message"],
+			faultyVM:     labels["faulty-vm"],
+			value:        m.GetGauge().GetValue(),
+		}
+	}
+
+	expectedMetrics := map[string]vmFaultsMetric{
+		// Normal VM without fault
+		"az1|QEMU|ACTIVE|0|no": {
+			az:           "az1",
+			hvtype:       "QEMU",
+			state:        "ACTIVE",
+			faultCode:    "0",
+			faultMessage: "n/a",
+			faultyVM:     "no",
+			value:        1,
+		},
+		// Faulty VM with code 500
+		"az1|QEMU|ERROR|500|server-2": {
+			az:           "az1",
+			hvtype:       "QEMU",
+			state:        "ERROR",
+			faultCode:    "500",
+			faultMessage: "Internal error",
+			faultyVM:     "server-2",
+			value:        1,
+		},
+		// Faulty VM with code 400 in az2
+		"az2|VMware vCenter Server|ERROR|400|server-3": {
+			az:           "az2",
+			hvtype:       "VMware vCenter Server",
+			state:        "ERROR",
+			faultCode:    "400",
+			faultMessage: "Bad request",
+			faultyVM:     "server-3",
+			value:        1,
+		},
+		// Server with only fault message (code=0 but has message)
+		"az1|Unspecified|BUILD|0|server-4": {
+			az:           "az1",
+			hvtype:       "Unspecified",
+			state:        "BUILD",
+			faultCode:    "0",
+			faultMessage: "Some warning",
+			faultyVM:     "server-4",
+			value:        1,
+		},
+	}
+
+	if len(expectedMetrics) != len(metrics) {
+		t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics))
+		t.Logf("actual metrics: %+v", metrics)
+	}
+
+	for key, expected := range expectedMetrics {
+		actual, ok := metrics[key]
+		if !ok {
+			t.Errorf("expected metric %q not found", key)
+			continue
+		}
+
+		if !reflect.DeepEqual(expected, actual) {
+			t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual)
+		}
+	}
+}
+
+func TestVMFaultsKPI_Collect_InvalidExtraSpecs(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+
+	if err := testDB.CreateTable(
+		testDB.AddTable(nova.Server{}),
+		testDB.AddTable(nova.Flavor{}),
+	); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	// Insert flavor with invalid extra specs JSON
+	flavors := []any{
+		&nova.Flavor{
+			ID:         "flavor-bad",
+			Name:       "bad-flavor",
+			VCPUs:      2,
+			RAM:        4096,
+			ExtraSpecs: `invalid-json`,
+		},
+	}
+	if err := testDB.Insert(flavors...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	servers := []any{
+		&nova.Server{
+			ID:                    "server-bad",
+			Name:                  "bad-vm",
+			Status:                "ACTIVE",
+			FlavorName:            "bad-flavor",
+			OSEXTAvailabilityZone: "az1",
+		},
+	}
+	if err := testDB.Insert(servers...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	kpi := &VMFaultsKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	// Should not panic, but should skip the server with invalid flavor
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch)
+	close(ch)
+
+	count := 0
+	for range ch {
+		count++
+	}
+	// Should have 0 metrics since the server's flavor has invalid extra specs
+	if count != 0 {
+		t.Errorf("expected 0 metrics, got %d", count)
+	}
+}
+
+func TestVMFaultsKPI_Collect_MultipleSameLabels(t *testing.T) {
+	dbEnv := testlibDB.SetupDBEnv(t)
+	testDB := db.DB{DbMap: dbEnv.DbMap}
+	defer dbEnv.Close()
+
+	if err := testDB.CreateTable(
+		testDB.AddTable(nova.Server{}),
+		testDB.AddTable(nova.Flavor{}),
+	); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	flavors := []any{
+		&nova.Flavor{
+			ID:         "flavor-1",
+			Name:       "small",
+			VCPUs:      2,
+			RAM:        4096,
+			ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`,
+		},
+	}
+	if err := testDB.Insert(flavors...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	// Insert multiple servers that should aggregate to same metric
+	servers := []any{
+		&nova.Server{
+			ID:                    "server-1",
+			Name:                  "vm-1",
+			Status:                "ACTIVE",
+			FlavorName:            "small",
+			OSEXTAvailabilityZone: "az1",
+		},
+		&nova.Server{
+			ID:                    "server-2",
+			Name:                  "vm-2",
+			Status:                "ACTIVE",
+			FlavorName:            "small",
+			OSEXTAvailabilityZone: "az1",
+		},
+		&nova.Server{
+			ID:                    "server-3",
+			Name:                  "vm-3",
+			Status:                "ACTIVE",
+			FlavorName:            "small",
+			OSEXTAvailabilityZone: "az1",
+		},
+	}
+	if err := testDB.Insert(servers...); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	kpi := &VMFaultsKPI{}
+	if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil {
+		t.Fatalf("expected no error, got %v", err)
+	}
+
+	ch := make(chan prometheus.Metric, 100)
+	kpi.Collect(ch)
+	close(ch)
+
+	var value float64
+	for metric := range ch {
+		var m prometheusgo.Metric
+		if err := metric.Write(&m); err != nil {
+			t.Fatalf("failed to write metric: %v", err)
+		}
+		value = m.GetGauge().GetValue()
+	}
+
+	// All 3 VMs should be counted together since they have the same labels
+	if value != 3 {
+		t.Errorf("expected metric value 3, got %f", value)
+	}
+}

From aa05ab53d7521b8e1cf0c5444bec8a7f03b7eb80 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 14:01:50 +0200
Subject: [PATCH 08/11] Limit alert to kvm hypervisors

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index c8fc74b8e..e702bc508 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -593,8 +593,8 @@ groups:
         Reservations are automatically repaired, but the root cause should be
         investigated if this alert persists.
 
-  - alert: CortexNovaDoesntFindValidHosts
-    expr: cortex_vm_faults{faultmessage=~".+No valid host was found.+"} > 0
+  - alert: CortexNovaDoesntFindValidKVMHosts
+    expr: cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"} > 0
     labels:
       context: scheduling
       dashboard: cortex/cortex
@@ -602,9 +602,9 @@ groups:
       severity: warning
       support_group: workload-management
     annotations:
-      summary: "Nova scheduling cannot find valid hosts"
+      summary: "Nova scheduling cannot find valid KVM hosts"
       description: >
         Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling
-        failed to find a valid host. This may indicate capacity issues,
-        misconfigured filters, or resource constraints in the datacenter.
-        Investigate the affected VMs and hypervisor availability.
+        failed to find a valid `{{$labels.hvtype}}` host. This may indicate
+        capacity issues, misconfigured filters, or resource constraints in the
+        datacenter. Investigate the affected VMs and hypervisor availability.

From 7cce0da334cbf38e1548088c6292123be7a34ed6 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 14:34:29 +0200
Subject: [PATCH 09/11] Refine metric labels and add dashboard panel

---
 .../kpis/plugins/compute/vm_faults.go         |   7 +-
 .../kpis/plugins/compute/vm_faults_test.go    |  10 +-
 .../dashboards/cortex-status.json             | 147 +++++++++++++++---
 3 files changed, 138 insertions(+), 26 deletions(-)

diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go
index 7d69f709f..fec71247c 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_faults.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go
@@ -6,6 +6,7 @@ package compute
 import (
 	"errors"
 	"strconv"
+	"strings"
 
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
 	"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
@@ -42,7 +43,7 @@ func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) e
 	}
 	k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults",
 		"Number of vm faults in the datacenter",
-		[]string{"az", "hvtype", "state", "fault-code", "fault-message", "faulty-vm"}, nil,
+		[]string{"az", "hvtype", "state", "faultcode", "faultmsg", "faultyvm"}, nil,
 	)
 	return nil
 }
@@ -119,6 +120,10 @@ func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) {
 		errmsg := "n/a"
 		if server.FaultMessage != nil {
 			errmsg = *server.FaultMessage
+			// Sometimes the VM ID may appear in the error message, which can
+			// lead to high cardinality in the metric. To avoid this, we replace
+			// the VM ID with a placeholder.
+			errmsg = strings.ReplaceAll(errmsg, server.ID, "<vm_id>")
 		}
 		// Only provide the server ID for faulty VMs, to avoid cardinality
 		// explosion in the metric.
diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
index a5f63b42c..a5b248b55 100644
--- a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
+++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go
@@ -126,7 +126,7 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
 		// Server with fault code and message
 		&nova.Server{
 			ID:                    "server-2",
-			Name:                  "faulty-vm",
+			Name:                  "faultyvm",
 			Status:                "ERROR",
 			FlavorName:            "qemu-small",
 			OSEXTAvailabilityZone: "az1",
@@ -197,15 +197,15 @@ func TestVMFaultsKPI_Collect(t *testing.T) {
 		}
 
 		key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" +
-			labels["fault-code"] + "|" + labels["faulty-vm"]
+			labels["faultcode"] + "|" + labels["faultyvm"]
 
 		metrics[key] = vmFaultsMetric{
 			az:           labels["az"],
 			hvtype:       labels["hvtype"],
 			state:        labels["state"],
-			faultCode:    labels["fault-code"],
-			faultMessage: labels["fault-message"],
-			faultyVM:     labels["faulty-vm"],
+			faultCode:    labels["faultcode"],
+			faultMessage: labels["faultmsg"],
+			faultyVM:     labels["faultyvm"],
 			value:        m.GetGauge().GetValue(),
 		}
 	}
diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json
index f83e2926b..d05157d8d 100644
--- a/tools/plutono/provisioning/dashboards/cortex-status.json
+++ b/tools/plutono/provisioning/dashboards/cortex-status.json
@@ -16,7 +16,7 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
-  "id": 3,
+  "id": 1,
   "links": [],
   "panels": [
     {
@@ -557,6 +557,7 @@
       "dashLength": 10,
       "dashes": false,
       "datasource": "prometheus-openstack",
+      "description": "",
       "fieldConfig": {
         "defaults": {
           "unit": "short"
@@ -567,11 +568,117 @@
       "fillGradient": 0,
       "gridPos": {
         "h": 12,
-        "w": 24,
+        "w": 12,
         "x": 0,
         "y": 31
       },
       "hiddenSeries": false,
+      "id": 58,
+      "interval": null,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "hideEmpty": false,
+        "hideZero": true,
+        "max": false,
+        "min": false,
+        "rightSide": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.5.37",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultcode!=\"0\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{state}} {{faultmsg}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Nova: faults in vm scheduling lifecycle",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:234",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "$$hashKey": "object:235",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "prometheus-openstack",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 12,
+        "w": 24,
+        "x": 0,
+        "y": 43
+      },
+      "hiddenSeries": false,
       "id": 39,
       "legend": {
         "avg": false,
@@ -669,7 +776,7 @@
         "h": 11,
         "w": 6,
         "x": 0,
-        "y": 43
+        "y": 55
       },
       "hiddenSeries": false,
       "id": 31,
@@ -766,7 +873,7 @@
         "h": 11,
         "w": 6,
         "x": 6,
-        "y": 43
+        "y": 55
       },
       "hiddenSeries": false,
       "id": 33,
@@ -878,7 +985,7 @@
         "h": 11,
         "w": 6,
         "x": 12,
-        "y": 43
+        "y": 55
       },
       "hiddenSeries": false,
       "id": 35,
@@ -990,7 +1097,7 @@
         "h": 11,
         "w": 6,
         "x": 18,
-        "y": 43
+        "y": 55
       },
       "hiddenSeries": false,
       "id": 37,
@@ -1100,7 +1207,7 @@
         "h": 12,
         "w": 12,
         "x": 0,
-        "y": 54
+        "y": 66
       },
       "hiddenSeries": false,
       "id": 27,
@@ -1208,7 +1315,7 @@
         "h": 12,
         "w": 12,
         "x": 12,
-        "y": 54
+        "y": 66
       },
       "hiddenSeries": false,
       "id": 29,
@@ -1296,7 +1403,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 66
+        "y": 78
       },
       "id": 5,
       "panels": [],
@@ -1321,7 +1428,7 @@
         "h": 11,
         "w": 12,
         "x": 0,
-        "y": 67
+        "y": 79
       },
       "hiddenSeries": false,
       "id": 2,
@@ -1441,7 +1548,7 @@
         "h": 11,
         "w": 12,
         "x": 12,
-        "y": 67
+        "y": 79
       },
       "hiddenSeries": false,
       "id": 3,
@@ -1580,7 +1687,7 @@
         "h": 12,
         "w": 24,
         "x": 0,
-        "y": 78
+        "y": 90
       },
       "id": 50,
       "options": {
@@ -1621,7 +1728,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 90
+        "y": 102
       },
       "id": 25,
       "panels": [],
@@ -1644,7 +1751,7 @@
         "h": 14,
         "w": 12,
         "x": 0,
-        "y": 91
+        "y": 103
       },
       "hiddenSeries": false,
       "id": 21,
@@ -1746,7 +1853,7 @@
         "h": 14,
         "w": 12,
         "x": 12,
-        "y": 91
+        "y": 103
       },
       "hiddenSeries": false,
       "id": 23,
@@ -1839,7 +1946,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 105
+        "y": 117
       },
       "id": 19,
       "panels": [],
@@ -1862,7 +1969,7 @@
         "h": 13,
         "w": 12,
         "x": 0,
-        "y": 106
+        "y": 118
       },
       "hiddenSeries": false,
       "id": 17,
@@ -1960,7 +2067,7 @@
         "h": 13,
         "w": 12,
         "x": 12,
-        "y": 106
+        "y": 118
       },
       "hiddenSeries": false,
       "id": 15,
@@ -2057,7 +2164,7 @@
         "h": 12,
         "w": 12,
         "x": 0,
-        "y": 119
+        "y": 131
       },
       "hiddenSeries": false,
       "id": 11,
@@ -2155,7 +2262,7 @@
         "h": 12,
         "w": 12,
         "x": 12,
-        "y": 119
+        "y": 131
       },
       "hiddenSeries": false,
       "id": 13,

From 1c1c7c83f2461bc9691cf2a6037383838684a5ad Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 14:51:22 +0200
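Subject: [PATCH 10/11] PR feedback: alert, extra specs, dashboard filter

- Aggregate the CortexNovaDoesntFindValidKVMHosts alert by (az, hvtype)
  and match on the faultmsg label.
- Treat empty flavor extra specs as an empty map instead of returning a
  JSON parse error.
- Filter the VM faults panel on faultyvm!="no" instead of faultcode!="0".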

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml              | 2 +-
 .../datasources/plugins/openstack/nova/nova_types.go          | 4 +++-
 tools/plutono/provisioning/dashboards/cortex-status.json      | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index e702bc508..a92881603 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -594,7 +594,7 @@ groups:
         investigated if this alert persists.
 
   - alert: CortexNovaDoesntFindValidKVMHosts
-    expr: cortex_vm_faults{hvtype=~"CH|QEMU",faultmessage=~".+No valid host was found.+"} > 0
+    expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".+No valid host was found.+"}) > 0
     labels:
       context: scheduling
       dashboard: cortex/cortex
diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
index 70e4fb02e..1be2b7a29 100644
--- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
+++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go
@@ -358,7 +358,9 @@ const (
 // extra specs.
 func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) {
 	var extraSpecs map[string]string
-	if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
+	if f.ExtraSpecs == "" {
+		extraSpecs = map[string]string{}
+	} else if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil {
 		return "", err // Return an error if the extra specs cannot be parsed.
 	}
 	hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"]
diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json
index d05157d8d..37f4b2479 100644
--- a/tools/plutono/provisioning/dashboards/cortex-status.json
+++ b/tools/plutono/provisioning/dashboards/cortex-status.json
@@ -606,7 +606,7 @@
       "targets": [
         {
           "exemplar": false,
-          "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultcode!=\"0\"})",
+          "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultyvm!=\"no\"})",
           "format": "time_series",
           "instant": false,
           "interval": "",

From c1a2a7319f738d368b659df5e145421b4c24d909 Mon Sep 17 00:00:00 2001
From: Philipp Matthes <p.matthes@sap.com>
Date: Mon, 30 Mar 2026 15:30:06 +0200
Subject: [PATCH 11/11] PR feedback: relax alert regex, add 5m hold

---
 helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
index a92881603..e3271f119 100644
--- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
+++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -594,7 +594,8 @@ groups:
         investigated if this alert persists.
 
   - alert: CortexNovaDoesntFindValidKVMHosts
-    expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".+No valid host was found.+"}) > 0
+    expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
+    for: 5m
     labels:
       context: scheduling
       dashboard: cortex/cortex