diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 2449fa390..e3271f119 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -592,3 +592,20 @@ groups: corruption, bugs in reservation creation, or external modifications. Reservations are automatically repaired, but the root cause should be investigated if this alert persists. + + - alert: CortexNovaDoesntFindValidKVMHosts + expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 + for: 5m + labels: + context: scheduling + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova scheduling cannot find valid KVM hosts" + description: > + Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling + failed to find a valid `{{$labels.hvtype}}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. diff --git a/helm/bundles/cortex-nova/templates/kpis.yaml b/helm/bundles/cortex-nova/templates/kpis.yaml index af01c10c5..bc5666926 100644 --- a/helm/bundles/cortex-nova/templates/kpis.yaml +++ b/helm/bundles/cortex-nova/templates/kpis.yaml @@ -110,6 +110,23 @@ spec: --- apiVersion: cortex.cloud/v1alpha1 kind: KPI +metadata: + name: vm-faults +spec: + schedulingDomain: nova + impl: vm_faults_kpi + dependencies: + datasources: + - name: nova-servers + - name: nova-flavors + description: | + This kpi tracks vm faults in the datacenter. It exposes helpful information + about the faults, such as the availability zone, hypervisor type, vm state, + and error info if available. This can be used to identify issues in the + datacenter and to monitor the overall health of the vms. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: KPI metadata: name: cortex-nova-datasource-state spec: diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go index 322b05d69..1be2b7a29 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go @@ -108,9 +108,24 @@ type Server struct { OSEXTSTSVmState string `json:"OS-EXT-STS:vm_state" db:"os_ext_sts_vm_state"` OSEXTSTSPowerState int `json:"OS-EXT-STS:power_state" db:"os_ext_sts_power_state"` - // From nested JSON + // From nested server.flavor JSON FlavorName string `json:"-" db:"flavor_name"` + // From nested server.fault JSON + + // The error response code. + FaultCode *uint `json:"-" db:"fault_code"` + // The date and time when the exception was raised. The date and time stamp + // format is ISO 8601 (CCYY-MM-DDThh:mm:ss±hh:mm). For example, + // 2015-08-27T09:49:58-05:00. The ±hh:mm value if included, is the time zone + // as an offset from UTC. In the previous example, the offset value is -05:00. + FaultCreated *string `json:"-" db:"fault_created"` + // The error message. + FaultMessage *string `json:"-" db:"fault_message"` + // The stack trace. It is available if the response code is not 500 or you + // have the administrator privilege. + FaultDetails *string `json:"-" db:"fault_details"` + // Note: there are some more fields that are omitted. To include them again, add // custom unmarshalers and marshalers for the struct below. } @@ -119,7 +134,8 @@ type Server struct { func (s *Server) UnmarshalJSON(data []byte) error { type Alias Server aux := &struct { - Flavor json.RawMessage `json:"flavor"` + Flavor json.RawMessage `json:"flavor"` + Fault *json.RawMessage `json:"fault,omitempty"` *Alias }{ Alias: (*Alias)(s), @@ -135,31 +151,63 @@ func (s *Server) UnmarshalJSON(data []byte) error { return err } s.FlavorName = flavor.Name + var fault struct { + Code uint `json:"code"` + Created string `json:"created"` + Message string `json:"message"` + Details *string `json:"details,omitempty"` + } + if aux.Fault != nil { + if err := json.Unmarshal(*aux.Fault, &fault); err != nil { + return err + } + s.FaultCode = &fault.Code + s.FaultCreated = &fault.Created + s.FaultMessage = &fault.Message + s.FaultDetails = fault.Details + } return nil } // Custom marshaler for OpenStackServer to handle nested JSON. func (s *Server) MarshalJSON() ([]byte, error) { type Alias Server + type flavor struct { + // Starting in microversion 2.47, "id" was removed... + Name string `json:"original_name"` + } + flavorVal := flavor{ + Name: s.FlavorName, + } + type fault struct { + Code uint `json:"code"` + Created string `json:"created"` + Message string `json:"message"` + Details *string `json:"details,omitempty"` + } + var faultVal *fault + if s.FaultCode != nil && s.FaultCreated != nil && s.FaultMessage != nil { + faultVal = &fault{ + Code: *s.FaultCode, + Created: *s.FaultCreated, + Message: *s.FaultMessage, + Details: s.FaultDetails, + } + } aux := &struct { - Flavor struct { - // Starting in microversion 2.47, "id" was removed... - Name string `json:"original_name"` - } `json:"flavor"` + Flavor flavor `json:"flavor"` + Fault *fault `json:"fault,omitempty"` *Alias }{ - Alias: (*Alias)(s), - Flavor: struct { - Name string `json:"original_name"` - }{ - Name: s.FlavorName, - }, + Alias: (*Alias)(s), + Flavor: flavorVal, + Fault: faultVal, } return json.Marshal(aux) } // Table in which the openstack model is stored. -func (Server) TableName() string { return "openstack_servers" } +func (Server) TableName() string { return "openstack_servers_v2" } // Index for the openstack model. func (Server) Indexes() map[string][]string { return nil } @@ -285,6 +333,54 @@ type Flavor struct { ExtraSpecs string `json:"extra_specs" db:"extra_specs"` } +// FlavorHypervisorType is a type alias for a string to represent the specific +// values the hypervisor type contained in flavor extra specs may have. +type FlavorHypervisorType string + +const ( + // FlavorHypervisorTypeQEMU maps a flavor for QEMU/KVM hypervisors. + FlavorHypervisorTypeQEMU FlavorHypervisorType = "QEMU" + // FlavorHypervisorTypeCH maps flavors to Cloud-Hypervisor/KVM hypervisors. + FlavorHypervisorTypeCH FlavorHypervisorType = "CH" + // FlavorHypervisorTypeVMware maps flavors to VMware hypervisors. + FlavorHypervisorTypeVMware FlavorHypervisorType = "VMware vCenter Server" + // FlavorHypervisorTypeIronic maps flavors to Ironic baremetal instances. + FlavorHypervisorTypeIronic FlavorHypervisorType = "Ironic" + // FlavorHypervisorTypeOther is a flavor for which the hypervisor type + // is set in the extra specs but has an unknown value. + FlavorHypervisorTypeOther FlavorHypervisorType = "Other" + // FlavorHypervisorTypeUnspecified is a flavor for which the hypervisor type + // is not set in the extra specs. + FlavorHypervisorTypeUnspecified FlavorHypervisorType = "Unspecified" +) + +// GetHypervisorType returns the hypervisor type of the flavor based on its +// extra specs. +func (f Flavor) GetHypervisorType() (FlavorHypervisorType, error) { + var extraSpecs map[string]string + if f.ExtraSpecs == "" { + extraSpecs = map[string]string{} + } else if err := json.Unmarshal([]byte(f.ExtraSpecs), &extraSpecs); err != nil { + return "", err // Return an error if the extra specs cannot be parsed. + } + hypervisorType, ok := extraSpecs["capabilities:hypervisor_type"] + if !ok { + return FlavorHypervisorTypeUnspecified, nil + } + switch hypervisorType { + case string(FlavorHypervisorTypeQEMU): + return FlavorHypervisorTypeQEMU, nil + case string(FlavorHypervisorTypeCH): + return FlavorHypervisorTypeCH, nil + case string(FlavorHypervisorTypeVMware): + return FlavorHypervisorTypeVMware, nil + case string(FlavorHypervisorTypeIronic): + return FlavorHypervisorTypeIronic, nil + default: + return FlavorHypervisorTypeOther, nil + } +} + // Custom unmarshaler for OpenStackFlavor to handle nested JSON. func (f *Flavor) UnmarshalJSON(data []byte) error { type Alias Flavor diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql index ea2b9c97a..ab3c7b8a7 100644 --- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql +++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql @@ -3,6 +3,6 @@ SELECT os.os_ext_srv_attr_host AS host, MAX(value) AS max_steal_time_pct FROM kvm_libvirt_domain_metrics kvm -JOIN openstack_servers os ON os.os_ext_srv_attr_instance_name = kvm.domain +JOIN openstack_servers_v2 os ON os.os_ext_srv_attr_instance_name = kvm.domain WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL GROUP BY os.os_ext_srv_attr_host, os.id; \ No newline at end of file diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go index b9f84b188..bc28218b5 100644 --- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go +++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct_test.go @@ -56,7 +56,7 @@ func TestLibvirtDomainCPUStealPctExtractor_Extract(t *testing.T) { t.Fatalf("expected no error, got %v", err) } - // Insert mock data into the openstack_servers table + // Insert mock data into the openstack servers table servers := []any{ &nova.Server{ ID: "uuid-1", diff --git a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql index fff0086c4..c2b4b8846 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql @@ -21,7 +21,7 @@ WITH durations AS ( )) AS BIGINT) ) AS duration FROM openstack_migrations AS migrations - LEFT JOIN openstack_servers AS servers ON servers.id = migrations.instance_uuid + LEFT JOIN openstack_servers_v2 AS servers ON servers.id = migrations.instance_uuid LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name ) SELECT diff --git a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql index daaa0a470..1fad31536 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql @@ -13,7 +13,7 @@ running_servers AS ( EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration, COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name, false::BOOLEAN AS deleted - FROM openstack_servers servers + FROM openstack_servers_v2 servers LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name WHERE servers.created IS NOT NULL ) diff --git a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql index e2c6ad4b2..8ab0a2c70 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql @@ -3,5 +3,5 @@ SELECT DISTINCT m.hostsystem AS vrops_hostsystem, s.os_ext_srv_attr_host AS nova_compute_host FROM vrops_vm_metrics m -LEFT JOIN openstack_servers s ON m.instance_uuid = s.id +LEFT JOIN openstack_servers_v2 s ON m.instance_uuid = s.id WHERE s.os_ext_srv_attr_host IS NOT NULL; diff --git a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql index 334668b22..0b0067790 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql @@ -19,7 +19,7 @@ host_cpu_usage AS ( s.tenant_id, h.service_host, AVG(p.avg_cpu) AS avg_cpu_of_project - FROM openstack_servers s + FROM openstack_servers_v2 s JOIN vrops_vm_metrics m ON s.id = m.instance_uuid JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults.go b/internal/knowledge/kpis/plugins/compute/vm_faults.go new file mode 100644 index 000000000..fec71247c --- /dev/null +++ b/internal/knowledge/kpis/plugins/compute/vm_faults.go @@ -0,0 +1,151 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "errors" + "strconv" + "strings" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/prometheus/client_golang/prometheus" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var vmFaultsKPIlogger = ctrl.Log.WithName("vm-faults-kpi") + +// This kpi tracks vm faults in the datacenter. It exposes helpful information +// about the faults, such as the availability zone, hypervisor type, vm state, +// and error info if available. This can be used to identify issues in the +// datacenter and to monitor the overall health of the vms. +type VMFaultsKPI struct { + plugins.BaseKPI[struct{} /* No opts */] + + // vmFaultsDesc describes the prometheus metric for vm faults. + vmFaultsDesc *prometheus.Desc +} + +// GetName returns a unique name for this kpi that is used for registration +// and configuration. +func (VMFaultsKPI) GetName() string { return "vm_faults_kpi" } + +// Init initializes the kpi, e.g. by creating the necessary Prometheus +// descriptors. The base kpi is also initialized with the provided database, +// client and options. +func (k *VMFaultsKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error { + if err := k.BaseKPI.Init(db, client, opts); err != nil { + return err + } + k.vmFaultsDesc = prometheus.NewDesc("cortex_vm_faults", + "Number of vm faults in the datacenter", + []string{"az", "hvtype", "state", "faultcode", "faultmsg", "faultyvm"}, nil, + ) + return nil +} + +// Describe sends the descriptor of this kpi to the provided channel. This is +// used by Prometheus to know which metrics this kpi exposes. +func (k *VMFaultsKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.vmFaultsDesc } + +// Collect collects the current state of vms from the database and sends it as +// Prometheus metrics to the provided channel. +func (k *VMFaultsKPI) Collect(ch chan<- prometheus.Metric) { + vmFaultsKPIlogger.Info("collecting metrics") + + // This can happen when no datasource is provided that connects to a database. + if k.DB == nil { + err := errors.New("no database connection") + vmFaultsKPIlogger.Error(err, "cannot collect metric") + return + } + + // Get all vms with their current state from the database. + var servers []nova.Server + nServers, err := k.DB.Select(&servers, "SELECT * FROM "+nova.Server{}.TableName()) + if err != nil { + vmFaultsKPIlogger.Error(err, "failed to query servers from database") + return + } + vmFaultsKPIlogger.Info("queried servers from database", "nServers", nServers) + + // Get all flavors from the database to map them to the vms. + var flavors []nova.Flavor + nFlavors, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()) + if err != nil { + vmFaultsKPIlogger.Error(err, "failed to query flavors from database") + return + } + vmFaultsKPIlogger.Info("queried flavors from database", "nFlavors", nFlavors) + + flavorsByName := make(map[string]nova.Flavor, len(flavors)) + for _, flavor := range flavors { + flavorsByName[flavor.Name] = flavor + } + + type labels struct { + az string + hvtype string + state string + errcode string + errmessage string + faultyVM string + } + counts := make(map[labels]float64) + + // For each vm, get its hypervisor type and count up. + // Note: this will also expose vms that are NOT in an error state, + // but this can be useful to compare it to the number of faulty vms. + for _, server := range servers { + flavor, ok := flavorsByName[server.FlavorName] + if !ok { + vmFaultsKPIlogger.Info("warning: flavor not found for server", "server", + server.ID, "flavor", server.FlavorName) + continue + } + hypervisorType, err := flavor.GetHypervisorType() + if err != nil { + vmFaultsKPIlogger.Error(err, "failed to get hypervisor type for server", + "server", server.ID, "flavor", flavor.Name) + continue + } + var errcode uint = 0 + if server.FaultCode != nil { + errcode = *server.FaultCode + } + errmsg := "n/a" + if server.FaultMessage != nil { + errmsg = *server.FaultMessage + // Sometimes the VM ID may appear in the error message, which can + // lead to high cardinality in the metric. To avoid this, we replace + // the VM ID with a placeholder. + errmsg = strings.ReplaceAll(errmsg, server.ID, "") + } + // Only provide the server ID for faulty VMs, to avoid cardinality + // explosion in the metric. + faultyVM := "no" + if server.FaultCode != nil || server.FaultMessage != nil { + faultyVM = server.ID + } + key := labels{ + az: server.OSEXTAvailabilityZone, + hvtype: string(hypervisorType), + state: server.Status, + errcode: strconv.FormatUint(uint64(errcode), 10), + errmessage: errmsg, + faultyVM: faultyVM, + } + counts[key]++ + } + + // Emit metrics to prometheus. + for key, count := range counts { + ch <- prometheus.MustNewConstMetric(k.vmFaultsDesc, prometheus.GaugeValue, count, + key.az, key.hvtype, key.state, key.errcode, key.errmessage, key.faultyVM) + } + vmFaultsKPIlogger.Info("collected metrics", "nMetrics", len(counts)) +} diff --git a/internal/knowledge/kpis/plugins/compute/vm_faults_test.go b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go new file mode 100644 index 000000000..a5b248b55 --- /dev/null +++ b/internal/knowledge/kpis/plugins/compute/vm_faults_test.go @@ -0,0 +1,408 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/pkg/conf" + testlib "github.com/cobaltcore-dev/cortex/pkg/testing" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" +) + +func TestVMFaultsKPI_GetName(t *testing.T) { + kpi := VMFaultsKPI{} + if kpi.GetName() != "vm_faults_kpi" { + t.Errorf("expected 'vm_faults_kpi', got %q", kpi.GetName()) + } +} + +func TestVMFaultsKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + if kpi.vmFaultsDesc == nil { + t.Error("vmFaultsDesc should be initialized") + } +} + +func TestVMFaultsKPI_Describe(t *testing.T) { + kpi := &VMFaultsKPI{} + if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan *prometheus.Desc, 1) + kpi.Describe(ch) + close(ch) + + desc := <-ch + if desc == nil { + t.Error("expected descriptor to be sent to channel") + } +} + +func TestVMFaultsKPI_Collect_NoDB(t *testing.T) { + kpi := &VMFaultsKPI{} + if err := kpi.Init(nil, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Collect should not panic when no database is provided + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + if count != 0 { + t.Errorf("expected 0 metrics when no DB, got %d", count) + } +} + +func TestVMFaultsKPI_Collect(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert mock flavors with different hypervisor types + flavors := []any{ + &nova.Flavor{ + ID: "flavor-qemu", + Name: "qemu-small", + VCPUs: 2, + RAM: 4096, + ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`, + }, + &nova.Flavor{ + ID: "flavor-vmware", + Name: "vmware-medium", + VCPUs: 4, + RAM: 8192, + ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter Server"}`, + }, + &nova.Flavor{ + ID: "flavor-unspecified", + Name: "generic-large", + VCPUs: 8, + RAM: 16384, + ExtraSpecs: `{}`, + }, + } + if err := testDB.Insert(flavors...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert mock servers + servers := []any{ + // Normal server without fault + &nova.Server{ + ID: "server-1", + Name: "normal-vm", + Status: "ACTIVE", + FlavorName: "qemu-small", + OSEXTAvailabilityZone: "az1", + }, + // Server with fault code and message + &nova.Server{ + ID: "server-2", + Name: "faultyvm", + Status: "ERROR", + FlavorName: "qemu-small", + OSEXTAvailabilityZone: "az1", + FaultCode: testlib.Ptr(uint(500)), + FaultMessage: testlib.Ptr("Internal error"), + }, + // Another faulty server in different AZ + &nova.Server{ + ID: "server-3", + Name: "another-faulty", + Status: "ERROR", + FlavorName: "vmware-medium", + OSEXTAvailabilityZone: "az2", + FaultCode: testlib.Ptr(uint(400)), + FaultMessage: testlib.Ptr("Bad request"), + }, + // Server with only fault message (no code) + &nova.Server{ + ID: "server-4", + Name: "partial-fault", + Status: "BUILD", + FlavorName: "generic-large", + OSEXTAvailabilityZone: "az1", + FaultMessage: testlib.Ptr("Some warning"), + }, + // Server with flavor that doesn't exist (should be skipped) + &nova.Server{ + ID: "server-5", + Name: "orphan-vm", + Status: "ACTIVE", + FlavorName: "nonexistent-flavor", + OSEXTAvailabilityZone: "az1", + }, + } + if err := testDB.Insert(servers...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + type vmFaultsMetric struct { + az string + hvtype string + state string + faultCode string + faultMessage string + faultyVM string + value float64 + } + + metrics := make(map[string]vmFaultsMetric) + for metric := range ch { + var m prometheusgo.Metric + if err := metric.Write(&m); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + + labels := make(map[string]string) + for _, label := range m.Label { + labels[label.GetName()] = label.GetValue() + } + + key := labels["az"] + "|" + labels["hvtype"] + "|" + labels["state"] + "|" + + labels["faultcode"] + "|" + labels["faultyvm"] + + metrics[key] = vmFaultsMetric{ + az: labels["az"], + hvtype: labels["hvtype"], + state: labels["state"], + faultCode: labels["faultcode"], + faultMessage: labels["faultmsg"], + faultyVM: labels["faultyvm"], + value: m.GetGauge().GetValue(), + } + } + + expectedMetrics := map[string]vmFaultsMetric{ + // Normal VM without fault + "az1|QEMU|ACTIVE|0|no": { + az: "az1", + hvtype: "QEMU", + state: "ACTIVE", + faultCode: "0", + faultMessage: "n/a", + faultyVM: "no", + value: 1, + }, + // Faulty VM with code 500 + "az1|QEMU|ERROR|500|server-2": { + az: "az1", + hvtype: "QEMU", + state: "ERROR", + faultCode: "500", + faultMessage: "Internal error", + faultyVM: "server-2", + value: 1, + }, + // Faulty VM with code 400 in az2 + "az2|VMware vCenter Server|ERROR|400|server-3": { + az: "az2", + hvtype: "VMware vCenter Server", + state: "ERROR", + faultCode: "400", + faultMessage: "Bad request", + faultyVM: "server-3", + value: 1, + }, + // Server with only fault message (code=0 but has message) + "az1|Unspecified|BUILD|0|server-4": { + az: "az1", + hvtype: "Unspecified", + state: "BUILD", + faultCode: "0", + faultMessage: "Some warning", + faultyVM: "server-4", + value: 1, + }, + } + + if len(expectedMetrics) != len(metrics) { + t.Errorf("expected %d metrics, got %d", len(expectedMetrics), len(metrics)) + t.Logf("actual metrics: %+v", metrics) + } + + for key, expected := range expectedMetrics { + actual, ok := metrics[key] + if !ok { + t.Errorf("expected metric %q not found", key) + continue + } + + if !reflect.DeepEqual(expected, actual) { + t.Errorf("metric %q: expected %+v, got %+v", key, expected, actual) + } + } +} + +func TestVMFaultsKPI_Collect_InvalidExtraSpecs(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert flavor with invalid extra specs JSON + flavors := []any{ + &nova.Flavor{ + ID: "flavor-bad", + Name: "bad-flavor", + VCPUs: 2, + RAM: 4096, + ExtraSpecs: `invalid-json`, + }, + } + if err := testDB.Insert(flavors...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + servers := []any{ + &nova.Server{ + ID: "server-bad", + Name: "bad-vm", + Status: "ACTIVE", + FlavorName: "bad-flavor", + OSEXTAvailabilityZone: "az1", + }, + } + if err := testDB.Insert(servers...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Should not panic, but should skip the server with invalid flavor + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + count := 0 + for range ch { + count++ + } + // Should have 0 metrics since the server's flavor has invalid extra specs + if count != 0 { + t.Errorf("expected 0 metrics, got %d", count) + } +} + +func TestVMFaultsKPI_Collect_MultipleSameLabels(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + flavors := []any{ + &nova.Flavor{ + ID: "flavor-1", + Name: "small", + VCPUs: 2, + RAM: 4096, + ExtraSpecs: `{"capabilities:hypervisor_type":"QEMU"}`, + }, + } + if err := testDB.Insert(flavors...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + // Insert multiple servers that should aggregate to same metric + servers := []any{ + &nova.Server{ + ID: "server-1", + Name: "vm-1", + Status: "ACTIVE", + FlavorName: "small", + OSEXTAvailabilityZone: "az1", + }, + &nova.Server{ + ID: "server-2", + Name: "vm-2", + Status: "ACTIVE", + FlavorName: "small", + OSEXTAvailabilityZone: "az1", + }, + &nova.Server{ + ID: "server-3", + Name: "vm-3", + Status: "ACTIVE", + FlavorName: "small", + OSEXTAvailabilityZone: "az1", + }, + } + if err := testDB.Insert(servers...); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + kpi := &VMFaultsKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + var value float64 + for metric := range ch { + var m prometheusgo.Metric + if err := metric.Write(&m); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + value = m.GetGauge().GetValue() + } + + // All 3 VMs should be counted together since they have the same labels + if value != 3 { + t.Errorf("expected metric value 3, got %f", value) + } +} diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index 274c5ace5..2623ff8bd 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -21,6 +21,7 @@ var supportedKPIs = map[string]plugins.KPI{ "vm_migration_statistics_kpi": &compute.VMMigrationStatisticsKPI{}, "vm_life_span_kpi": &compute.VMLifeSpanKPI{}, "vm_commitments_kpi": &compute.VMCommitmentsKPI{}, + "vm_faults_kpi": &compute.VMFaultsKPI{}, "netapp_storage_pool_cpu_usage_kpi": &storage.NetAppStoragePoolCPUUsageKPI{}, diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 9c238aeee..d38c6e1d8 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -445,7 +445,7 @@ func (r *CommitmentReservationController) listServersByProjectID(ctx context.Con // Query servers from the database cache. var servers []nova.Server _, err := r.DB.Select(&servers, - "SELECT * FROM openstack_servers WHERE tenant_id = $1", + "SELECT * FROM "+nova.Server{}.TableName()+" WHERE tenant_id = $1", projectID) if err != nil { return nil, fmt.Errorf("failed to query servers from database: %w", err) diff --git a/tools/plutono/provisioning/dashboards/cortex-status.json b/tools/plutono/provisioning/dashboards/cortex-status.json index f83e2926b..37f4b2479 100644 --- a/tools/plutono/provisioning/dashboards/cortex-status.json +++ b/tools/plutono/provisioning/dashboards/cortex-status.json @@ -16,7 +16,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 3, + "id": 1, "links": [], "panels": [ { @@ -557,6 +557,7 @@ "dashLength": 10, "dashes": false, "datasource": "prometheus-openstack", + "description": "", "fieldConfig": { "defaults": { "unit": "short" @@ -567,11 +568,117 @@ "fillGradient": 0, "gridPos": { "h": 12, - "w": 24, + "w": 12, "x": 0, "y": 31 }, "hiddenSeries": false, + "id": 58, + "interval": null, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.37", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum by (faultmsg,state) (cortex_vm_faults{faultyvm!=\"no\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{state}} {{faultmsg}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nova: faults in vm scheduling lifecycle", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:234", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:235", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus-openstack", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 43 + }, + "hiddenSeries": false, "id": 39, "legend": { "avg": false, @@ -669,7 +776,7 @@ "h": 11, "w": 6, "x": 0, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 31, @@ -766,7 +873,7 @@ "h": 11, "w": 6, "x": 6, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 33, @@ -878,7 +985,7 @@ "h": 11, "w": 6, "x": 12, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 35, @@ -990,7 +1097,7 @@ "h": 11, "w": 6, "x": 18, - "y": 43 + "y": 55 }, "hiddenSeries": false, "id": 37, @@ -1100,7 +1207,7 @@ "h": 12, "w": 12, "x": 0, - "y": 54 + "y": 66 }, "hiddenSeries": false, "id": 27, @@ -1208,7 +1315,7 @@ "h": 12, "w": 12, "x": 12, - "y": 54 + "y": 66 }, "hiddenSeries": false, "id": 29, @@ -1296,7 +1403,7 @@ "h": 1, "w": 24, "x": 0, - "y": 66 + "y": 78 }, "id": 5, "panels": [], @@ -1321,7 +1428,7 @@ "h": 11, "w": 12, "x": 0, - "y": 67 + "y": 79 }, "hiddenSeries": false, "id": 2, @@ -1441,7 +1548,7 @@ "h": 11, "w": 12, "x": 12, - "y": 67 + "y": 79 }, "hiddenSeries": false, "id": 3, @@ -1580,7 +1687,7 @@ "h": 12, "w": 24, "x": 0, - "y": 78 + "y": 90 }, "id": 50, "options": { @@ -1621,7 +1728,7 @@ "h": 1, "w": 24, "x": 0, - "y": 90 + "y": 102 }, "id": 25, "panels": [], @@ -1644,7 +1751,7 @@ "h": 14, "w": 12, "x": 0, - "y": 91 + "y": 103 }, "hiddenSeries": false, "id": 21, @@ -1746,7 +1853,7 @@ "h": 14, "w": 12, "x": 12, - "y": 91 + "y": 103 }, "hiddenSeries": false, "id": 23, @@ -1839,7 +1946,7 @@ "h": 1, "w": 24, "x": 0, - "y": 105 + "y": 117 }, "id": 19, "panels": [], @@ -1862,7 +1969,7 @@ "h": 13, "w": 12, "x": 0, - "y": 106 + "y": 118 }, "hiddenSeries": false, "id": 17, @@ -1960,7 +2067,7 @@ "h": 13, "w": 12, "x": 12, - "y": 106 + "y": 118 }, "hiddenSeries": false, "id": 15, @@ -2057,7 +2164,7 @@ "h": 12, "w": 12, "x": 0, - "y": 119 + "y": 131 }, "hiddenSeries": false, "id": 11, @@ -2155,7 +2262,7 @@ "h": 12, "w": 12, "x": 12, - "y": 119 + "y": 131 }, "hiddenSeries": false, "id": 13, diff --git a/tools/visualize-reservations/main.go b/tools/visualize-reservations/main.go index 9b5880be5..c99ff2eb1 100644 --- a/tools/visualize-reservations/main.go +++ b/tools/visualize-reservations/main.go @@ -52,6 +52,7 @@ import ( "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" _ "github.com/lib/pq" corev1 "k8s.io/api/core/v1" @@ -1761,9 +1762,10 @@ func connectToPostgres( // Query servers with host information serverMap = make(map[string]serverInfo) - rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM openstack_servers") + //nolint:gosec // This query is not using any user input, so it's not vulnerable to SQL injection + rows, err := db.QueryContext(ctx, "SELECT id, flavor_name, COALESCE(host_id, ''), COALESCE(os_ext_srv_attr_host, '') FROM "+nova.Server{}.TableName()) if err != nil { - fmt.Fprintf(os.Stderr, "Warning: Could not query openstack_servers: %v\n", err) + fmt.Fprintf(os.Stderr, "Warning: Could not query "+nova.Server{}.TableName()+": %v\n", err) } else { defer rows.Close() for rows.Next() {