From b32f2593824e6a125b3c24f6781e9e7783782462 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 11 May 2026 16:00:54 +0200 Subject: [PATCH 1/3] fix: CR alerts and metrics --- .../cortex-nova/alerts/nova.alerts.yaml | 168 ++++++++++++++++++ helm/bundles/cortex-nova/values.yaml | 2 +- .../api/change_commitments_metrics.go | 2 +- .../api/change_commitments_monitor.go | 16 +- .../api/change_commitments_monitor_test.go | 14 +- .../commitments/api/report_capacity.go | 7 + .../api/report_capacity_monitor.go | 11 +- 7 files changed, 200 insertions(+), 20 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 47b337968..db13790ae 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -432,3 +432,171 @@ groups: This may indicate issues with the webhook logic, connectivity problems, or external factors causing failures. Check the webhook server logs for error details and investigate the affected resources. + + # Committed Resource Info API + - alert: CortexNovaCommittedResourceInfoUnavailable + expr: | + rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API is unavailable" + description: > + The committed resource info API (Limes LIQUID integration) has been returning + 503 Service Unavailable for more than 5 minutes. This typically means the + flavor group knowledge CRD is not ready or missing. Limes cannot discover + available committed resources until the issue is resolved. + + # Committed Resource Change API + - alert: CortexNovaCommittedResourceChangeErrors + expr: | + rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 5xx errors" + description: > + The committed resource change API (Limes LIQUID integration) is returning + HTTP 5xx errors. This is not expected and indicates an internal problem + processing commitment changes. Limes will retry, but new commitments may + not be fulfilled until the issue is resolved. + + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + ( + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[15m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[15m])) + ) > 0.3 + and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[15m])) > 0 + for: 15m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource rejection rate too high ({{ $value | humanizePercentage }})" + description: > + More than 30% of commitment changes have been rejected over the last 15 minutes. + This may indicate insufficient capacity to fulfill new commitments. 
Rejected + commitments are rolled back; Limes will see them as failed and may report + the failure to users. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[30m]) > 3 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API repeated timeouts" + description: > + More than 3 commitment change requests timed out in the last 30 minutes. + Timeouts occur when the scheduling pipeline cannot place reservations within + the deadline. Affected changes are rolled back. Investigate scheduler + performance or reservation backlog. + + - alert: CortexNovaCommittedResourceChangeLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) >= 10 + and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics"}[5m])) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API p95 latency >= 10s" + description: > + The committed resource change API p95 latency has reached or exceeded 10 seconds, + approaching the 15-second watch timeout. Requests close to the timeout are at risk + of being rolled back. Investigate scheduler performance or reservation backlog. + + # Committed Resource Capacity API + - alert: CortexNovaCommittedResourceCapacityErrors + expr: | + rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 5xx errors" + description: > + The committed resource capacity API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems calculating cluster capacity. + Limes may receive stale or incomplete capacity data. + + - alert: CortexNovaCommittedResourceCapacityDroppedToZero + expr: | + (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) + and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity for {{ $labels.resource }} in {{ $labels.az }} dropped to zero" + description: > + The reported capacity for committed resource {{ $labels.resource }} in + availability zone {{ $labels.az }} has dropped from a positive value to zero. + This may mean hypervisors in that AZ are fully utilized for the corresponding + flavor group and no further committed resources can be placed there. 
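+      # Triage hint (illustrative queries, not evaluated by this alert): find
+      # the affected resource/AZ pairs with
+      #   cortex_committed_resource_reported_capacity_gib == 0
+      # and cross-check cortex_committed_resource_capacity_hosts_placeable for
+      # the same flavor group to confirm the hypervisors are saturated.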
+ + # Committed Resource Usage API + - alert: CortexNovaCommittedResourceUsageErrors + expr: | + rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 5xx errors" + description: > + The committed resource usage API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems fetching reservation or + Nova server data. Limes may receive stale or incomplete usage data. + + # Committed Resource Quota API + - alert: CortexNovaCommittedResourceQuotaErrors + expr: | + rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource quota API HTTP 5xx errors" + description: > + The committed resource quota API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems computing or applying + quota. Limes may not be able to enforce committed resource quotas. diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 65ce2ddde..7283ec2ea 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -177,7 +177,7 @@ cortex-scheduling-controllers: maxRequeueInterval: "30m" committedResourceAPI: # Timeout for watching CommittedResource CRDs before rolling back - watchTimeout: "10s" + watchTimeout: "15s" # How often to poll CommittedResource CRD conditions during watch watchPollInterval: "500ms" # When false, the endpoint returns HTTP 503; the info endpoint remains available. diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go index 1afeea5f5..6bb87fbca 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go @@ -29,7 +29,7 @@ func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liqui } // Record commitment changes counter - api.monitor.commitmentChanges.WithLabelValues(result).Add(float64(commitmentCount)) + api.monitor.commitmentChanges.WithLabelValues(result, string(req.AZ)).Add(float64(commitmentCount)) } // countCommitments counts the total number of commitments in a request. 
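For reference, countCommitments (its body lies outside this hunk) simply
tallies the commitments carried in a request. A minimal sketch, assuming the
request nests per-project and per-resource commitment lists; the ByProject,
ByResource, and Commitments field names are illustrative assumptions, not the
confirmed liquid API:

    func countCommitments(req liquid.CommitmentChangeRequest) int {
        // Tally every commitment across all projects and resources
        // in the request (field names assumed, see note above).
        total := 0
        for _, project := range req.ByProject {
            for _, resource := range project.ByResource {
                total += len(resource.Commitments)
            }
        }
        return total
    }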
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go index 6b1a5c31d..5867f1790 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go @@ -25,20 +25,21 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { Help: "Total number of committed resource change API requests by HTTP status code", }, []string{"status_code"}), requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_committed_resource_change_api_request_duration_seconds", - Help: "Duration of committed resource change API requests in seconds by HTTP status code", + Name: "cortex_committed_resource_change_api_request_duration_seconds", + Help: "Duration of committed resource change API requests in seconds by HTTP status code", + Buckets: []float64{0.5, 1, 2.5, 5, 7.5, 10, 12.5, 15, 20, 30}, }, []string{"status_code"}), commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_committed_resource_change_api_commitment_changes_total", - Help: "Total number of commitment changes processed by result", - }, []string{"result"}), + Help: "Total number of commitment changes processed by result and availability zone", + }, []string{"result", "az"}), timeouts: prometheus.NewCounter(prometheus.CounterOpts{ Name: "cortex_committed_resource_change_api_timeouts_total", Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready", }), } - // Pre-initialize metrics with zero values for common HTTP status codes. + // Pre-initialize request metrics with zero values for common HTTP status codes. // This ensures metrics exist in Prometheus before the first request, // preventing "metric missing" warnings in alerting rules. 
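+	// The commitment-changes counter is intentionally not pre-initialized:
+	// with the new "az" label, its label sets are only known once requests
+	// arrive, so zero-value series cannot be created up front.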
for _, statusCode := range []string{"200", "400", "409", "500", "503"} { @@ -46,11 +47,6 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { m.requestDuration.WithLabelValues(statusCode) } - // Pre-initialize commitment change result labels - for _, result := range []string{"accepted", "rejected"} { - m.commitmentChanges.WithLabelValues(result) - } - return m } diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go index a999df7dc..ed10702c3 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go @@ -23,7 +23,7 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { // Observe metrics before gathering (Prometheus metrics with labels only appear after being used) monitor.requestCounter.WithLabelValues("200").Inc() monitor.requestDuration.WithLabelValues("200").Observe(0.1) - monitor.commitmentChanges.WithLabelValues("success").Inc() + monitor.commitmentChanges.WithLabelValues("success", "az-1").Inc() monitor.timeouts.Inc() // Verify metrics can be gathered @@ -90,8 +90,8 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { monitor.requestCounter.WithLabelValues("409").Inc() monitor.requestCounter.WithLabelValues("503").Inc() monitor.requestDuration.WithLabelValues("200").Observe(1.5) - monitor.commitmentChanges.WithLabelValues("success").Add(5) - monitor.commitmentChanges.WithLabelValues("rejected").Add(2) + monitor.commitmentChanges.WithLabelValues("success", "az-1").Add(5) + monitor.commitmentChanges.WithLabelValues("rejected", "az-1").Add(2) // Gather metrics families, err := registry.Gather() @@ -142,13 +142,12 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { } if *family.Name == "cortex_committed_resource_change_api_commitment_changes_total" { - // At minimum we expect the 2 labels we added (success, rejected) - // Plus pre-initialized labels (accepted) - so >= 2 total + // 2 label combinations: (success,az-1) and (rejected,az-1) if len(family.Metric) < 2 { t.Errorf("Expected at least 2 commitment changes metrics, got %d", len(family.Metric)) } - // Check all metrics have the result label + // Check all metrics have both result and az labels for _, metric := range family.Metric { labelNames := make(map[string]bool) for _, label := range metric.Label { @@ -158,6 +157,9 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { if !labelNames["result"] { t.Error("Missing 'result' label in commitment changes counter") } + if !labelNames["az"] { + t.Error("Missing 'az' label in commitment changes counter") + } } } } diff --git a/internal/scheduling/reservations/commitments/api/report_capacity.go b/internal/scheduling/reservations/commitments/api/report_capacity.go index ec537607a..f8e43378f 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity.go @@ -71,6 +71,13 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) logger.Info("calculated capacity report", "resourceCount", len(report.Resources)) + // Update capacity gauge for each resource/AZ combination. 
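+	// The gauge holds the most recent report only; it is refreshed each time
+	// Limes polls this endpoint, so values can lag between polling cycles.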
+ for resName, resReport := range report.Resources { + for az, azReport := range resReport.PerAZ { + api.capacityMonitor.reportedCapacity.WithLabelValues(string(resName), string(az)).Set(float64(azReport.Capacity)) + } + } + // Return response w.Header().Set("Content-Type", "application/json") w.WriteHeader(statusCode) diff --git a/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go b/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go index 930d3dcb6..39023e693 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go @@ -9,8 +9,9 @@ import ( // ReportCapacityAPIMonitor provides metrics for the CR report-capacity API. type ReportCapacityAPIMonitor struct { - requestCounter *prometheus.CounterVec - requestDuration *prometheus.HistogramVec + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec + reportedCapacity *prometheus.GaugeVec } // NewReportCapacityAPIMonitor creates a new monitor with Prometheus metrics. @@ -27,6 +28,10 @@ func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { Help: "Duration of committed resource capacity API requests in seconds by HTTP status code", Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"status_code"}), + reportedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_reported_capacity_gib", + Help: "Last reported capacity in GiB per resource and availability zone as returned by the capacity API", + }, []string{"resource", "az"}), } // Pre-initialize metrics with zero values for common HTTP status codes. @@ -44,10 +49,12 @@ func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { func (m *ReportCapacityAPIMonitor) Describe(ch chan<- *prometheus.Desc) { m.requestCounter.Describe(ch) m.requestDuration.Describe(ch) + m.reportedCapacity.Describe(ch) } // Collect implements prometheus.Collector. func (m *ReportCapacityAPIMonitor) Collect(ch chan<- prometheus.Metric) { m.requestCounter.Collect(ch) m.requestDuration.Collect(ch) + m.reportedCapacity.Collect(ch) } From c53c8f49ef515621f14ea9cd5c5005c0a8049699 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 11 May 2026 16:05:10 +0200 Subject: [PATCH 2/3] fix: align timeout alert with 15s watchTimeout and per-any-timeout firing - Change timeout alert from increase([30m])>3 to increase([10m])>0 so any single timeout fires within 1 minute rather than requiring 3+ in 30m --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index db13790ae..130d7228c 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -494,8 +494,8 @@ groups: the failure to users. 
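+      # Note for the timeout alert below: it fires on any single timeout;
+      # the 15-second deadline it refers to is committedResourceAPI.watchTimeout
+      # in the bundle values.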
- alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[30m]) > 3 - for: 5m + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[10m]) > 0 + for: 1m labels: context: committed-resource-api dashboard: cortex-status-dashboard/cortex-status-dashboard @@ -503,12 +503,11 @@ groups: severity: warning support_group: workload-management annotations: - summary: "Committed Resource change API repeated timeouts" + summary: "Committed Resource change API timeout detected" description: > - More than 3 commitment change requests timed out in the last 30 minutes. - Timeouts occur when the scheduling pipeline cannot place reservations within - the deadline. Affected changes are rolled back. Investigate scheduler - performance or reservation backlog. + A commitment change request timed out after the 15-second deadline. + Timeouts indicate the scheduling pipeline could not place reservations in time. + Affected changes are rolled back. Investigate scheduler performance or reservation backlog. - alert: CortexNovaCommittedResourceChangeLatencyTooHigh expr: | From 3d5e9469e68b00436961ae3bb32e9e023f856788 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 12 May 2026 09:37:28 +0200 Subject: [PATCH 3/3] metric names --- .../reservations/capacity/metrics.go | 77 ++++++++----------- internal/scheduling/reservations/monitor.go | 8 +- 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go index bd13ca7ca..f282cc9d0 100644 --- a/internal/scheduling/reservations/capacity/metrics.go +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -20,53 +20,47 @@ var ( // Monitor provides Prometheus metrics for FlavorGroupCapacity CRDs. // It implements prometheus.Collector and reads CRD status on each Collect call. type Monitor struct { - client client.Client - totalCapacityVMSlots *prometheus.GaugeVec - placeableVMs *prometheus.GaugeVec - totalCapacityHosts *prometheus.GaugeVec - placeableHosts *prometheus.GaugeVec - totalInstances *prometheus.GaugeVec - committedCapacity *prometheus.GaugeVec + client client.Client + vmSlotsEmpty *prometheus.GaugeVec + vmSlotsPlaceable *prometheus.GaugeVec + hostsEmpty *prometheus.GaugeVec + hostsPlaceable *prometheus.GaugeVec + committedCapacity *prometheus.GaugeVec } // NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. 
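+// Each flavor is exported in two views: the "empty datacenter" gauges report
+// theoretical slot and host counts as if no VMs existed, while the
+// "placeable" gauges account for current allocations on the hypervisors.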
func NewMonitor(c client.Client) Monitor { return Monitor{ client: c, - totalCapacityVMSlots: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_total", - Help: "Total schedulable slots in an empty-datacenter scenario per flavor.", + vmSlotsEmpty: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_vm_slots_empty_datacenter", + Help: "Schedulable VM slots per flavor assuming an empty datacenter (no existing VMs).", }, capacityFlavorLabels), - placeableVMs: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_placeable", - Help: "Schedulable slots remaining given current VM allocations per flavor.", + vmSlotsPlaceable: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_vm_slots_placeable", + Help: "Schedulable VM slots remaining per flavor given current VM allocations.", }, capacityFlavorLabels), - totalCapacityHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_hosts_total", - Help: "Number of hosts eligible for this flavor in the empty-state probe.", + hostsEmpty: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_hosts_empty_datacenter", + Help: "Number of hosts eligible for this flavor assuming an empty datacenter.", }, capacityFlavorLabels), - placeableHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + hostsPlaceable: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_hosts_placeable", Help: "Number of hosts still able to accept a new VM of this flavor.", }, capacityFlavorLabels), - totalInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_instances", - Help: "Total VM instances running on hypervisors in this AZ (not filtered by flavor group).", - }, capacityLabels), committedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_committed", - Help: "Sum of AcceptedAmount across Ready CommittedResource CRDs for this flavor group and AZ.", + Name: "cortex_committed_resource_committed_gib", + Help: "Sum of AcceptedAmount in GiB across Ready CommittedResource CRDs for this flavor group and AZ.", }, capacityLabels), } } // Describe implements prometheus.Collector. func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { - m.totalCapacityVMSlots.Describe(ch) - m.placeableVMs.Describe(ch) - m.totalCapacityHosts.Describe(ch) - m.placeableHosts.Describe(ch) - m.totalInstances.Describe(ch) + m.vmSlotsEmpty.Describe(ch) + m.vmSlotsPlaceable.Describe(ch) + m.hostsEmpty.Describe(ch) + m.hostsPlaceable.Describe(ch) m.committedCapacity.Describe(ch) } @@ -81,11 +75,10 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { } // Reset all gauges so deleted CRDs don't linger. 
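+	// Rebuilding every gauge on each scrape keeps the collector stateless;
+	// series cardinality stays bounded by flavor group, AZ, and flavor name.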
- m.totalCapacityVMSlots.Reset() - m.placeableVMs.Reset() - m.totalCapacityHosts.Reset() - m.placeableHosts.Reset() - m.totalInstances.Reset() + m.vmSlotsEmpty.Reset() + m.vmSlotsPlaceable.Reset() + m.hostsEmpty.Reset() + m.hostsPlaceable.Reset() m.committedCapacity.Reset() for _, crd := range list.Items { @@ -93,7 +86,6 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { "flavor_group": crd.Spec.FlavorGroup, "az": crd.Spec.AvailabilityZone, } - m.totalInstances.With(groupAZLabels).Set(float64(crd.Status.TotalInstances)) m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) for _, f := range crd.Status.Flavors { @@ -102,17 +94,16 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { "az": crd.Spec.AvailabilityZone, "flavor_name": f.FlavorName, } - m.totalCapacityVMSlots.With(flavorLabels).Set(float64(f.TotalCapacityVMSlots)) - m.placeableVMs.With(flavorLabels).Set(float64(f.PlaceableVMs)) - m.totalCapacityHosts.With(flavorLabels).Set(float64(f.TotalCapacityHosts)) - m.placeableHosts.With(flavorLabels).Set(float64(f.PlaceableHosts)) + m.vmSlotsEmpty.With(flavorLabels).Set(float64(f.TotalCapacityVMSlots)) + m.vmSlotsPlaceable.With(flavorLabels).Set(float64(f.PlaceableVMs)) + m.hostsEmpty.With(flavorLabels).Set(float64(f.TotalCapacityHosts)) + m.hostsPlaceable.With(flavorLabels).Set(float64(f.PlaceableHosts)) } } - m.totalCapacityVMSlots.Collect(ch) - m.placeableVMs.Collect(ch) - m.totalCapacityHosts.Collect(ch) - m.placeableHosts.Collect(ch) - m.totalInstances.Collect(ch) + m.vmSlotsEmpty.Collect(ch) + m.vmSlotsPlaceable.Collect(ch) + m.hostsEmpty.Collect(ch) + m.hostsPlaceable.Collect(ch) m.committedCapacity.Collect(ch) } diff --git a/internal/scheduling/reservations/monitor.go b/internal/scheduling/reservations/monitor.go index 2050cf880..557a87920 100644 --- a/internal/scheduling/reservations/monitor.go +++ b/internal/scheduling/reservations/monitor.go @@ -33,12 +33,12 @@ func NewMonitor(k8sClient client.Client) Monitor { return Monitor{ Client: k8sClient, numberOfReservations: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_reservations_number", - Help: "Number of reservations.", + Name: "cortex_reservations", + Help: "Number of reservations by readiness and error status.", }, []string{"status_ready", "status_error"}), reservedResources: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_reservations_resources", - Help: "Resources reserved by reservations.", + Name: "cortex_reservations_allocated_resources", + Help: "Resource units allocated across active reservations, by host and resource type.", }, []string{"status_ready", "status_error", "host", "resource"}), } }
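After rolling out the renames, dashboards can be sanity-checked against the
new names with illustrative PromQL such as:

    sum by (status_ready, status_error) (cortex_reservations)
    sum by (host, resource) (cortex_reservations_allocated_resources)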