Skip to content
Merged
6 changes: 4 additions & 2 deletions helm/bundles/cortex-nova/templates/kpis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -192,15 +192,17 @@ spec:
apiVersion: cortex.cloud/v1alpha1
kind: KPI
metadata:
name: vmware-resource-commitments
name: vmware-project-commitments
spec:
schedulingDomain: nova
impl: vmware_resource_commitments_kpi
impl: vmware_project_commitments_kpi
dependencies:
datasources:
- name: nova-servers
- name: nova-flavors
- name: limes-project-commitments
- name: identity-domains
Comment thread
coderabbitai[bot] marked this conversation as resolved.
- name: identity-projects
description: |
This KPI tracks the resource commitments of projects running VMs on VMware hosts.
---
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log/slog"
"strings"

"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity"
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/limes"
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
Expand All @@ -24,58 +25,64 @@ import (
// For general purpose workloads its not possible to differentiate the cpu architecture. To avoid weird behavior in a dashboard we don't export this label for the metric.
// For HANA flavors the cpu architecture is part of the flavor name (_v2 suffix for sapphire rapids, without suffix for cascade lake).
// For both types of workload however we can not determine on which host the commitment is fulfilled.
type VMwareResourceCommitmentsKPI struct {
type VMwareProjectCommitmentsKPI struct {
// BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client.
plugins.BaseKPI[struct{}]

unusedGeneralPurposeCommitmentsPerProject *prometheus.Desc
unusedHanaCommittedResourcesPerProject *prometheus.Desc
}

func (k *VMwareResourceCommitmentsKPI) GetName() string {
return "vmware_resource_commitments_kpi"
func (k *VMwareProjectCommitmentsKPI) GetName() string {
return "vmware_project_commitments_kpi"
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

func (k *VMwareResourceCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
func (k *VMwareProjectCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
if err := k.BaseKPI.Init(dbConn, c, opts); err != nil {
return err
}

k.unusedGeneralPurposeCommitmentsPerProject = prometheus.NewDesc(
"cortex_vmware_commitments_general_purpose",
"Committed general purpose resources that are currently unused. CPU (resource=cpu) in vCPUs, memory (resource=ram) in bytes.",
[]string{"availability_zone", "resource", "project_id"}, nil,
[]string{"availability_zone", "resource", "project_id", "project_name", "domain_id", "domain_name"}, nil,
)
k.unusedHanaCommittedResourcesPerProject = prometheus.NewDesc(
"cortex_vmware_commitments_hana_resources",
"Total committed HANA instances capacity that is currently unused, translated to resources. CPU in vCPUs, memory and disk in bytes.",
[]string{"availability_zone", "cpu_architecture", "resource", "project_id"}, nil,
[]string{"availability_zone", "cpu_architecture", "resource", "project_id", "project_name", "domain_id", "domain_name"}, nil,
)
return nil
}

func (k *VMwareResourceCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) {
func (k *VMwareProjectCommitmentsKPI) Describe(ch chan<- *prometheus.Desc) {
ch <- k.unusedGeneralPurposeCommitmentsPerProject
ch <- k.unusedHanaCommittedResourcesPerProject
}

func (k *VMwareResourceCommitmentsKPI) Collect(ch chan<- prometheus.Metric) {
func (k *VMwareProjectCommitmentsKPI) Collect(ch chan<- prometheus.Metric) {
if k.DB == nil {
return
}

flavorsByName, err := k.getFlavorsByName()
if err != nil {
slog.Error("vmware_resource_commitments: failed to load flavors", "err", err)
slog.Error("vmware_project_commitments: failed to load flavors", "err", err)
return
}

k.collectGeneralPurpose(ch, flavorsByName)
k.collectHana(ch, flavorsByName)
projects, err := k.getProjectsWithDomains()
if err != nil {
slog.Error("vmware_project_commitments: failed to load projects with domains", "err", err)
return
}

k.collectGeneralPurpose(ch, flavorsByName, projects)
k.collectHana(ch, flavorsByName, projects)
}

// getFlavorsByName loads all flavors and returns them keyed by name.
func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) {
func (k *VMwareProjectCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavor, error) {
var flavors []nova.Flavor
if _, err := k.DB.Select(&flavors, "SELECT * FROM "+nova.Flavor{}.TableName()); err != nil {
return nil, err
Expand All @@ -88,7 +95,7 @@ func (k *VMwareResourceCommitmentsKPI) getFlavorsByName() (map[string]nova.Flavo
}

// getGeneralPurposeCommitments loads confirmed/guaranteed cores and ram commitments.
func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.Commitment, error) {
func (k *VMwareProjectCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.Commitment, error) {
var commitments []limes.Commitment
if _, err := k.DB.Select(&commitments, `
SELECT * FROM `+limes.Commitment{}.TableName()+`
Expand All @@ -103,7 +110,7 @@ func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeCommitments() ([]limes.C

// getGeneralPurposeServers loads running non-HANA servers for general purpose usage accounting.
// KVM-specific flavors are filtered out in Go since SQL LIKE cannot express the segment-exact pattern.
func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server, error) {
func (k *VMwareProjectCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server, error) {
var servers []nova.Server
if _, err := k.DB.Select(&servers, `
SELECT * FROM `+nova.Server{}.TableName()+`
Expand All @@ -122,7 +129,7 @@ func (k *VMwareResourceCommitmentsKPI) getGeneralPurposeServers() ([]nova.Server
}

// getHanaInstanceCommitments loads confirmed/guaranteed HANA instance commitments.
func (k *VMwareResourceCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Commitment, error) {
func (k *VMwareProjectCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Commitment, error) {
var commitments []limes.Commitment
if _, err := k.DB.Select(&commitments, `
SELECT * FROM `+limes.Commitment{}.TableName()+`
Expand All @@ -136,7 +143,7 @@ func (k *VMwareResourceCommitmentsKPI) getHanaInstanceCommitments() ([]limes.Com
}

// getRunningHanaServers loads all running HANA VMware servers (KVM HANA flavors excluded in Go).
func (k *VMwareResourceCommitmentsKPI) getRunningHanaServers() ([]nova.Server, error) {
func (k *VMwareProjectCommitmentsKPI) getRunningHanaServers() ([]nova.Server, error) {
var servers []nova.Server
if _, err := k.DB.Select(&servers, `
SELECT * FROM `+nova.Server{}.TableName()+`
Expand All @@ -156,15 +163,15 @@ func (k *VMwareResourceCommitmentsKPI) getRunningHanaServers() ([]nova.Server, e

// collectGeneralPurpose computes and emits unused general purpose committed resources per project.
// Unused = committed - in-use (clamped to zero; zero values are not emitted).
func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) {
func (k *VMwareProjectCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor, projects map[string]projectWithDomain) {
commitments, err := k.getGeneralPurposeCommitments()
if err != nil {
slog.Error("vmware_resource_commitments: failed to load gp commitments", "err", err)
slog.Error("vmware_project_commitments: failed to load gp commitments", "err", err)
return
}
servers, err := k.getGeneralPurposeServers()
if err != nil {
slog.Error("vmware_resource_commitments: failed to load gp servers", "err", err)
slog.Error("vmware_project_commitments: failed to load gp servers", "err", err)
return
}

Expand All @@ -178,7 +185,7 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu
case "ram":
bytes, err := bytesFromUnit(float64(c.Amount), c.Unit)
if err != nil {
slog.Warn("vmware_resource_commitments: unknown ram unit", "unit", c.Unit, "err", err)
slog.Warn("vmware_project_commitments: unknown ram unit", "unit", c.Unit, "err", err)
continue
}
committed[gpKey{c.ProjectID, c.AvailabilityZone, "ram"}] += bytes
Expand All @@ -189,7 +196,7 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu
for _, s := range servers {
flavor, ok := flavorsByName[s.FlavorName]
if !ok {
slog.Warn("vmware_resource_commitments: gp flavor not found", "flavor", s.FlavorName)
slog.Warn("vmware_project_commitments: gp flavor not found", "flavor", s.FlavorName)
continue
}
used[gpKey{s.TenantID, s.OSEXTAvailabilityZone, "cpu"}] += float64(flavor.VCPUs)
Expand All @@ -201,19 +208,20 @@ func (k *VMwareResourceCommitmentsKPI) collectGeneralPurpose(ch chan<- prometheu
if unused <= 0 {
continue
}
project := projects[key.projectID]
ch <- prometheus.MustNewConstMetric(
k.unusedGeneralPurposeCommitmentsPerProject,
prometheus.GaugeValue,
unused,
key.az, key.resource, key.projectID,
key.az, key.resource, key.projectID, project.ProjectName, project.DomainID, project.DomainName,
)
}
}

// collectHana computes and emits unused committed HANA instance resources per project.
// Each HANA instance commitment is compared against running servers; the remainder is
// translated to cpu/ram/disk capacity using the flavor spec.
func (k *VMwareResourceCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor) {
func (k *VMwareProjectCommitmentsKPI) collectHana(ch chan<- prometheus.Metric, flavorsByName map[string]nova.Flavor, projects map[string]projectWithDomain) {
commitments, err := k.getHanaInstanceCommitments()
if err != nil {
slog.Error("vmware_resource_commitments: failed to load hana commitments", "err", err)
Expand Down Expand Up @@ -261,11 +269,36 @@ func (k *VMwareResourceCommitmentsKPI) collectHana(ch chan<- prometheus.Metric,
}

for key, value := range totals {
project := projects[key.projectID]
ch <- prometheus.MustNewConstMetric(
k.unusedHanaCommittedResourcesPerProject,
prometheus.GaugeValue,
value,
key.az, key.cpuArch, key.resource, key.projectID,
key.az, key.cpuArch, key.resource, key.projectID, project.ProjectName, project.DomainID, project.DomainName,
)
}
}

type projectWithDomain struct {
ProjectID string `db:"project_id"`
ProjectName string `db:"project_name"`
DomainID string `db:"domain_id"`
DomainName string `db:"domain_name"`
}

func (k *VMwareProjectCommitmentsKPI) getProjectsWithDomains() (map[string]projectWithDomain, error) {
var projects []projectWithDomain
if _, err := k.DB.Select(&projects, `
SELECT p.id AS project_id, p.name AS project_name, COALESCE(d.id, '') AS domain_id, COALESCE(d.name, '') AS domain_name
FROM `+identity.Project{}.TableName()+` p
LEFT JOIN `+identity.Domain{}.TableName()+` d ON p.domain_id = d.id
`); err != nil {
return nil, err
}

projectMap := make(map[string]projectWithDomain, len(projects))
for _, p := range projects {
projectMap[p.ProjectID] = p
}
return projectMap, nil
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Loading
Loading