From b32f2593824e6a125b3c24f6781e9e7783782462 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 11 May 2026 16:00:54 +0200 Subject: [PATCH 1/3] fix: CR alerts and metrics --- .../cortex-nova/alerts/nova.alerts.yaml | 168 ++++++++++++++++++ helm/bundles/cortex-nova/values.yaml | 2 +- .../api/change_commitments_metrics.go | 2 +- .../api/change_commitments_monitor.go | 16 +- .../api/change_commitments_monitor_test.go | 14 +- .../commitments/api/report_capacity.go | 7 + .../api/report_capacity_monitor.go | 11 +- 7 files changed, 200 insertions(+), 20 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 47b337968..db13790ae 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -432,3 +432,171 @@ groups: This may indicate issues with the webhook logic, connectivity problems, or external factors causing failures. Check the webhook server logs for error details and investigate the affected resources. + + # Committed Resource Info API + - alert: CortexNovaCommittedResourceInfoUnavailable + expr: | + rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API is unavailable" + description: > + The committed resource info API (Limes LIQUID integration) has been returning + 503 Service Unavailable for more than 5 minutes. This typically means the + flavor group knowledge CRD is not ready or missing. Limes cannot discover + available committed resources until the issue is resolved. + + # Committed Resource Change API + - alert: CortexNovaCommittedResourceChangeErrors + expr: | + rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 5xx errors" + description: > + The committed resource change API (Limes LIQUID integration) is returning + HTTP 5xx errors. This is not expected and indicates an internal problem + processing commitment changes. Limes will retry, but new commitments may + not be fulfilled until the issue is resolved. + + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + ( + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[15m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[15m])) + ) > 0.3 + and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[15m])) > 0 + for: 15m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource rejection rate too high ({{ $value | humanizePercentage }})" + description: > + More than 30% of commitment changes have been rejected over the last 15 minutes. + This may indicate insufficient capacity to fulfill new commitments. 
Rejected + commitments are rolled back; Limes will see them as failed and may report + the failure to users. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[30m]) > 3 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API repeated timeouts" + description: > + More than 3 commitment change requests timed out in the last 30 minutes. + Timeouts occur when the scheduling pipeline cannot place reservations within + the deadline. Affected changes are rolled back. Investigate scheduler + performance or reservation backlog. + + - alert: CortexNovaCommittedResourceChangeLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) >= 10 + and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics"}[5m])) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API p95 latency >= 10s" + description: > + The committed resource change API p95 latency has reached or exceeded 10 seconds, + approaching the 15-second watch timeout. Requests close to the timeout are at risk + of being rolled back. Investigate scheduler performance or reservation backlog. + + # Committed Resource Capacity API + - alert: CortexNovaCommittedResourceCapacityErrors + expr: | + rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 5xx errors" + description: > + The committed resource capacity API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems calculating cluster capacity. + Limes may receive stale or incomplete capacity data. + + - alert: CortexNovaCommittedResourceCapacityDroppedToZero + expr: | + (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) + and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity for {{ $labels.resource }} in {{ $labels.az }} dropped to zero" + description: > + The reported capacity for committed resource {{ $labels.resource }} in + availability zone {{ $labels.az }} has dropped from a positive value to zero. + This may mean hypervisors in that AZ are fully utilized for the corresponding + flavor group and no further committed resources can be placed there. 
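+      # Triage hint (illustrative queries, not evaluated by this alert): find
+      # the affected resource/AZ pairs with
+      #   cortex_committed_resource_reported_capacity_gib == 0
+      # and cross-check cortex_committed_resource_capacity_hosts_placeable for
+      # the same flavor group to confirm the hypervisors are saturated.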
+ + # Committed Resource Usage API + - alert: CortexNovaCommittedResourceUsageErrors + expr: | + rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 5xx errors" + description: > + The committed resource usage API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems fetching reservation or + Nova server data. Limes may receive stale or incomplete usage data. + + # Committed Resource Quota API + - alert: CortexNovaCommittedResourceQuotaErrors + expr: | + rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource quota API HTTP 5xx errors" + description: > + The committed resource quota API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems computing or applying + quota. Limes may not be able to enforce committed resource quotas. diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 65ce2ddde..7283ec2ea 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -177,7 +177,7 @@ cortex-scheduling-controllers: maxRequeueInterval: "30m" committedResourceAPI: # Timeout for watching CommittedResource CRDs before rolling back - watchTimeout: "10s" + watchTimeout: "15s" # How often to poll CommittedResource CRD conditions during watch watchPollInterval: "500ms" # When false, the endpoint returns HTTP 503; the info endpoint remains available. diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go index 1afeea5f5..6bb87fbca 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go @@ -29,7 +29,7 @@ func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liqui } // Record commitment changes counter - api.monitor.commitmentChanges.WithLabelValues(result).Add(float64(commitmentCount)) + api.monitor.commitmentChanges.WithLabelValues(result, string(req.AZ)).Add(float64(commitmentCount)) } // countCommitments counts the total number of commitments in a request. 
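For reference, countCommitments (its body lies outside this hunk) simply
tallies the commitments carried in a request. A minimal sketch, assuming the
request nests per-project and per-resource commitment lists; the ByProject,
ByResource, and Commitments field names are illustrative assumptions, not the
confirmed liquid API:

    func countCommitments(req liquid.CommitmentChangeRequest) int {
        // Tally every commitment across all projects and resources
        // in the request (field names assumed, see note above).
        total := 0
        for _, project := range req.ByProject {
            for _, resource := range project.ByResource {
                total += len(resource.Commitments)
            }
        }
        return total
    }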
diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go b/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go index 6b1a5c31d..5867f1790 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_monitor.go @@ -25,20 +25,21 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { Help: "Total number of committed resource change API requests by HTTP status code", }, []string{"status_code"}), requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_committed_resource_change_api_request_duration_seconds", - Help: "Duration of committed resource change API requests in seconds by HTTP status code", + Name: "cortex_committed_resource_change_api_request_duration_seconds", + Help: "Duration of committed resource change API requests in seconds by HTTP status code", + Buckets: []float64{0.5, 1, 2.5, 5, 7.5, 10, 12.5, 15, 20, 30}, }, []string{"status_code"}), commitmentChanges: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_committed_resource_change_api_commitment_changes_total", - Help: "Total number of commitment changes processed by result", - }, []string{"result"}), + Help: "Total number of commitment changes processed by result and availability zone", + }, []string{"result", "az"}), timeouts: prometheus.NewCounter(prometheus.CounterOpts{ Name: "cortex_committed_resource_change_api_timeouts_total", Help: "Total number of commitment change requests that timed out while waiting for reservations to become ready", }), } - // Pre-initialize metrics with zero values for common HTTP status codes. + // Pre-initialize request metrics with zero values for common HTTP status codes. // This ensures metrics exist in Prometheus before the first request, // preventing "metric missing" warnings in alerting rules. 
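+	// The commitment-changes counter is intentionally not pre-initialized:
+	// with the new "az" label, its label sets are only known once requests
+	// arrive, so zero-value series cannot be created up front.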
for _, statusCode := range []string{"200", "400", "409", "500", "503"} { @@ -46,11 +47,6 @@ func NewChangeCommitmentsAPIMonitor() ChangeCommitmentsAPIMonitor { m.requestDuration.WithLabelValues(statusCode) } - // Pre-initialize commitment change result labels - for _, result := range []string{"accepted", "rejected"} { - m.commitmentChanges.WithLabelValues(result) - } - return m } diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go index a999df7dc..ed10702c3 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_monitor_test.go @@ -23,7 +23,7 @@ func TestChangeCommitmentsAPIMonitor_MetricsRegistration(t *testing.T) { // Observe metrics before gathering (Prometheus metrics with labels only appear after being used) monitor.requestCounter.WithLabelValues("200").Inc() monitor.requestDuration.WithLabelValues("200").Observe(0.1) - monitor.commitmentChanges.WithLabelValues("success").Inc() + monitor.commitmentChanges.WithLabelValues("success", "az-1").Inc() monitor.timeouts.Inc() // Verify metrics can be gathered @@ -90,8 +90,8 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { monitor.requestCounter.WithLabelValues("409").Inc() monitor.requestCounter.WithLabelValues("503").Inc() monitor.requestDuration.WithLabelValues("200").Observe(1.5) - monitor.commitmentChanges.WithLabelValues("success").Add(5) - monitor.commitmentChanges.WithLabelValues("rejected").Add(2) + monitor.commitmentChanges.WithLabelValues("success", "az-1").Add(5) + monitor.commitmentChanges.WithLabelValues("rejected", "az-1").Add(2) // Gather metrics families, err := registry.Gather() @@ -142,13 +142,12 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { } if *family.Name == "cortex_committed_resource_change_api_commitment_changes_total" { - // At minimum we expect the 2 labels we added (success, rejected) - // Plus pre-initialized labels (accepted) - so >= 2 total + // 2 label combinations: (success,az-1) and (rejected,az-1) if len(family.Metric) < 2 { t.Errorf("Expected at least 2 commitment changes metrics, got %d", len(family.Metric)) } - // Check all metrics have the result label + // Check all metrics have both result and az labels for _, metric := range family.Metric { labelNames := make(map[string]bool) for _, label := range metric.Label { @@ -158,6 +157,9 @@ func TestChangeCommitmentsAPIMonitor_MetricLabels(t *testing.T) { if !labelNames["result"] { t.Error("Missing 'result' label in commitment changes counter") } + if !labelNames["az"] { + t.Error("Missing 'az' label in commitment changes counter") + } } } } diff --git a/internal/scheduling/reservations/commitments/api/report_capacity.go b/internal/scheduling/reservations/commitments/api/report_capacity.go index ec537607a..f8e43378f 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity.go @@ -71,6 +71,13 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) logger.Info("calculated capacity report", "resourceCount", len(report.Resources)) + // Update capacity gauge for each resource/AZ combination. 
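+	// The gauge holds the most recent report only; it is refreshed each time
+	// Limes polls this endpoint, so values can lag between polling cycles.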
+ for resName, resReport := range report.Resources { + for az, azReport := range resReport.PerAZ { + api.capacityMonitor.reportedCapacity.WithLabelValues(string(resName), string(az)).Set(float64(azReport.Capacity)) + } + } + // Return response w.Header().Set("Content-Type", "application/json") w.WriteHeader(statusCode) diff --git a/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go b/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go index 930d3dcb6..39023e693 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go @@ -9,8 +9,9 @@ import ( // ReportCapacityAPIMonitor provides metrics for the CR report-capacity API. type ReportCapacityAPIMonitor struct { - requestCounter *prometheus.CounterVec - requestDuration *prometheus.HistogramVec + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec + reportedCapacity *prometheus.GaugeVec } // NewReportCapacityAPIMonitor creates a new monitor with Prometheus metrics. @@ -27,6 +28,10 @@ func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { Help: "Duration of committed resource capacity API requests in seconds by HTTP status code", Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"status_code"}), + reportedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_reported_capacity_gib", + Help: "Last reported capacity in GiB per resource and availability zone as returned by the capacity API", + }, []string{"resource", "az"}), } // Pre-initialize metrics with zero values for common HTTP status codes. @@ -44,10 +49,12 @@ func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { func (m *ReportCapacityAPIMonitor) Describe(ch chan<- *prometheus.Desc) { m.requestCounter.Describe(ch) m.requestDuration.Describe(ch) + m.reportedCapacity.Describe(ch) } // Collect implements prometheus.Collector. func (m *ReportCapacityAPIMonitor) Collect(ch chan<- prometheus.Metric) { m.requestCounter.Collect(ch) m.requestDuration.Collect(ch) + m.reportedCapacity.Collect(ch) } From c53c8f49ef515621f14ea9cd5c5005c0a8049699 Mon Sep 17 00:00:00 2001 From: mblos Date: Mon, 11 May 2026 16:05:10 +0200 Subject: [PATCH 2/3] fix: align timeout alert with 15s watchTimeout and per-any-timeout firing - Change timeout alert from increase([30m])>3 to increase([10m])>0 so any single timeout fires within 1 minute rather than requiring 3+ in 30m --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index db13790ae..130d7228c 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -494,8 +494,8 @@ groups: the failure to users. 
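+      # Note for the timeout alert below: it fires on any single timeout;
+      # the 15-second deadline it refers to is committedResourceAPI.watchTimeout
+      # in the bundle values.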
- alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[30m]) > 3 - for: 5m + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[10m]) > 0 + for: 1m labels: context: committed-resource-api dashboard: cortex-status-dashboard/cortex-status-dashboard @@ -503,12 +503,11 @@ groups: severity: warning support_group: workload-management annotations: - summary: "Committed Resource change API repeated timeouts" + summary: "Committed Resource change API timeout detected" description: > - More than 3 commitment change requests timed out in the last 30 minutes. - Timeouts occur when the scheduling pipeline cannot place reservations within - the deadline. Affected changes are rolled back. Investigate scheduler - performance or reservation backlog. + A commitment change request timed out after the 15-second deadline. + Timeouts indicate the scheduling pipeline could not place reservations in time. + Affected changes are rolled back. Investigate scheduler performance or reservation backlog. - alert: CortexNovaCommittedResourceChangeLatencyTooHigh expr: | From 3d5e9469e68b00436961ae3bb32e9e023f856788 Mon Sep 17 00:00:00 2001 From: mblos Date: Tue, 12 May 2026 09:37:28 +0200 Subject: [PATCH 3/3] metric names --- .../reservations/capacity/metrics.go | 77 ++++++++----------- internal/scheduling/reservations/monitor.go | 8 +- 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go index bd13ca7ca..f282cc9d0 100644 --- a/internal/scheduling/reservations/capacity/metrics.go +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -20,53 +20,47 @@ var ( // Monitor provides Prometheus metrics for FlavorGroupCapacity CRDs. // It implements prometheus.Collector and reads CRD status on each Collect call. type Monitor struct { - client client.Client - totalCapacityVMSlots *prometheus.GaugeVec - placeableVMs *prometheus.GaugeVec - totalCapacityHosts *prometheus.GaugeVec - placeableHosts *prometheus.GaugeVec - totalInstances *prometheus.GaugeVec - committedCapacity *prometheus.GaugeVec + client client.Client + vmSlotsEmpty *prometheus.GaugeVec + vmSlotsPlaceable *prometheus.GaugeVec + hostsEmpty *prometheus.GaugeVec + hostsPlaceable *prometheus.GaugeVec + committedCapacity *prometheus.GaugeVec } // NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. 
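+// Each flavor is exported in two views: the "empty datacenter" gauges report
+// theoretical slot and host counts as if no VMs existed, while the
+// "placeable" gauges account for current allocations on the hypervisors.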
func NewMonitor(c client.Client) Monitor { return Monitor{ client: c, - totalCapacityVMSlots: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_total", - Help: "Total schedulable slots in an empty-datacenter scenario per flavor.", + vmSlotsEmpty: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_vm_slots_empty_datacenter", + Help: "Schedulable VM slots per flavor assuming an empty datacenter (no existing VMs).", }, capacityFlavorLabels), - placeableVMs: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_placeable", - Help: "Schedulable slots remaining given current VM allocations per flavor.", + vmSlotsPlaceable: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_vm_slots_placeable", + Help: "Schedulable VM slots remaining per flavor given current VM allocations.", }, capacityFlavorLabels), - totalCapacityHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_hosts_total", - Help: "Number of hosts eligible for this flavor in the empty-state probe.", + hostsEmpty: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_capacity_hosts_empty_datacenter", + Help: "Number of hosts eligible for this flavor assuming an empty datacenter.", }, capacityFlavorLabels), - placeableHosts: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + hostsPlaceable: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_capacity_hosts_placeable", Help: "Number of hosts still able to accept a new VM of this flavor.", }, capacityFlavorLabels), - totalInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_instances", - Help: "Total VM instances running on hypervisors in this AZ (not filtered by flavor group).", - }, capacityLabels), committedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_capacity_committed", - Help: "Sum of AcceptedAmount across Ready CommittedResource CRDs for this flavor group and AZ.", + Name: "cortex_committed_resource_committed_gib", + Help: "Sum of AcceptedAmount in GiB across Ready CommittedResource CRDs for this flavor group and AZ.", }, capacityLabels), } } // Describe implements prometheus.Collector. func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { - m.totalCapacityVMSlots.Describe(ch) - m.placeableVMs.Describe(ch) - m.totalCapacityHosts.Describe(ch) - m.placeableHosts.Describe(ch) - m.totalInstances.Describe(ch) + m.vmSlotsEmpty.Describe(ch) + m.vmSlotsPlaceable.Describe(ch) + m.hostsEmpty.Describe(ch) + m.hostsPlaceable.Describe(ch) m.committedCapacity.Describe(ch) } @@ -81,11 +75,10 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { } // Reset all gauges so deleted CRDs don't linger. 
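+	// Rebuilding every gauge on each scrape keeps the collector stateless;
+	// series cardinality stays bounded by flavor group, AZ, and flavor name.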
- m.totalCapacityVMSlots.Reset() - m.placeableVMs.Reset() - m.totalCapacityHosts.Reset() - m.placeableHosts.Reset() - m.totalInstances.Reset() + m.vmSlotsEmpty.Reset() + m.vmSlotsPlaceable.Reset() + m.hostsEmpty.Reset() + m.hostsPlaceable.Reset() m.committedCapacity.Reset() for _, crd := range list.Items { @@ -93,7 +86,6 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { "flavor_group": crd.Spec.FlavorGroup, "az": crd.Spec.AvailabilityZone, } - m.totalInstances.With(groupAZLabels).Set(float64(crd.Status.TotalInstances)) m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) for _, f := range crd.Status.Flavors { @@ -102,17 +94,16 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { "az": crd.Spec.AvailabilityZone, "flavor_name": f.FlavorName, } - m.totalCapacityVMSlots.With(flavorLabels).Set(float64(f.TotalCapacityVMSlots)) - m.placeableVMs.With(flavorLabels).Set(float64(f.PlaceableVMs)) - m.totalCapacityHosts.With(flavorLabels).Set(float64(f.TotalCapacityHosts)) - m.placeableHosts.With(flavorLabels).Set(float64(f.PlaceableHosts)) + m.vmSlotsEmpty.With(flavorLabels).Set(float64(f.TotalCapacityVMSlots)) + m.vmSlotsPlaceable.With(flavorLabels).Set(float64(f.PlaceableVMs)) + m.hostsEmpty.With(flavorLabels).Set(float64(f.TotalCapacityHosts)) + m.hostsPlaceable.With(flavorLabels).Set(float64(f.PlaceableHosts)) } } - m.totalCapacityVMSlots.Collect(ch) - m.placeableVMs.Collect(ch) - m.totalCapacityHosts.Collect(ch) - m.placeableHosts.Collect(ch) - m.totalInstances.Collect(ch) + m.vmSlotsEmpty.Collect(ch) + m.vmSlotsPlaceable.Collect(ch) + m.hostsEmpty.Collect(ch) + m.hostsPlaceable.Collect(ch) m.committedCapacity.Collect(ch) } diff --git a/internal/scheduling/reservations/monitor.go b/internal/scheduling/reservations/monitor.go index 2050cf880..557a87920 100644 --- a/internal/scheduling/reservations/monitor.go +++ b/internal/scheduling/reservations/monitor.go @@ -33,12 +33,12 @@ func NewMonitor(k8sClient client.Client) Monitor { return Monitor{ Client: k8sClient, numberOfReservations: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_reservations_number", - Help: "Number of reservations.", + Name: "cortex_reservations", + Help: "Number of reservations by readiness and error status.", }, []string{"status_ready", "status_error"}), reservedResources: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_reservations_resources", - Help: "Resources reserved by reservations.", + Name: "cortex_reservations_allocated_resources", + Help: "Resource units allocated across active reservations, by host and resource type.", }, []string{"status_ready", "status_error", "host", "resource"}), } }
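After rolling out the renames, dashboards can be sanity-checked against the
new names with illustrative PromQL such as:

    sum by (status_ready, status_error) (cortex_reservations)
    sum by (host, resource) (cortex_reservations_allocated_resources)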