From e6ae001fe8dd6c76269f0031baca5dd9ae314803 Mon Sep 17 00:00:00 2001 From: mblos <156897072+mblos@users.noreply.github.com> Date: Mon, 4 May 2026 14:33:17 +0200 Subject: [PATCH 1/9] fix: capacity filter correctly accounts for multi-VM CR reservation slots (#784) Fixes capacity blocking for CommittedResource reservation slots that contain multiple VMs at different confirmation stages. --- .../filters/filter_has_enough_capacity.go | 80 ++- .../filter_has_enough_capacity_test.go | 644 ++++++++++++------ 2 files changed, 488 insertions(+), 236 deletions(-) diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index 5b471f789..88e2f07d5 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -41,6 +41,16 @@ type FilterHasEnoughCapacity struct { // // In case the project and flavor match, space reserved is unlocked (slotting). // +// Capacity accounting uses two sources: hv.Status.Allocation (aggregate real-time usage of +// all running VMs) and Reservation.Status.Allocations (which VMs are confirmed on a slot, +// maintained by the reservation controller with a one-reconcile-cycle lag). During the window +// between a VM starting and the reservation controller reconciling, a VM appears in both +// sources — a conservative transient over-count that self-corrects on the next reconcile. +// +// During a CR reservation migration (TargetHost != Status.Host), both the source and target +// host are blocked with the full slot. The source block is intentionally conservative to +// preserve rollback capacity if the migration fails. +// // Please note that, if num_instances is larger than 1, there needs to be enough // capacity to place all instances on the same host. This limitation is necessary // because we can't spread out instances, as the final set of valid hosts is not @@ -170,41 +180,61 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa continue } - // For CR reservations with allocations, calculate remaining (unallocated) resources to block. - // This prevents double-blocking of resources already consumed by running instances. + // For CR reservations with allocations, compute the effective block: + // confirmed = sum of resources for VMs present in both Spec and Status allocations + // specOnly = sum of resources for VMs present in Spec but not yet in Status + // remaining = max(0, Spec.Resources - confirmed) [clamped: never negative] + // block = max(remaining, specOnly) [spec-only VM must be fully covered] + // + // Clamping: if confirmed VMs exceed slot size (e.g. after resize), block = 0. + // Oversize spec-only: if a pending VM is larger than the remaining slot, block its full size. 
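+		// Worked example (values taken from the multi-VM test below, purely illustrative):
+		// slot = 8 CPU/16Gi with confirmed VMs vm-1 = 3 CPU/6Gi and vm-2 = 2 CPU/4Gi and no
+		// spec-only VMs gives confirmed = 5 CPU/10Gi, remaining = 3 CPU/6Gi, specOnly = 0,
+		// so block = 3 CPU/6Gi and a host with 8 CPU/16Gi free keeps 5 CPU/10Gi available.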
var resourcesToBlock map[hv1.ResourceName]resource.Quantity if reservation.Spec.Type == v1alpha1.ReservationTypeCommittedResource && // if the reservation is not being migrated, block only unused resources reservation.Spec.TargetHost == reservation.Status.Host && reservation.Spec.CommittedResourceReservation != nil && - reservation.Status.CommittedResourceReservation != nil && - len(reservation.Spec.CommittedResourceReservation.Allocations) > 0 && - len(reservation.Status.CommittedResourceReservation.Allocations) > 0 { - // Start with full reservation resources - resourcesToBlock = make(map[hv1.ResourceName]resource.Quantity) - for k, v := range reservation.Spec.Resources { - resourcesToBlock[k] = v.DeepCopy() + len(reservation.Spec.CommittedResourceReservation.Allocations) > 0 { + confirmedResources := make(map[hv1.ResourceName]resource.Quantity) + specOnlyResources := make(map[hv1.ResourceName]resource.Quantity) + + statusAllocs := map[string]string{} + if reservation.Status.CommittedResourceReservation != nil { + statusAllocs = reservation.Status.CommittedResourceReservation.Allocations } - // Subtract already-allocated resources because those consume already resources on the host for instanceUUID, allocation := range reservation.Spec.CommittedResourceReservation.Allocations { - // Only subtract if allocation is already present in status (VM is actually running) - if _, isRunning := reservation.Status.CommittedResourceReservation.Allocations[instanceUUID]; !isRunning { - continue - } - + _, isConfirmed := statusAllocs[instanceUUID] for resourceName, quantity := range allocation.Resources { - if current, ok := resourcesToBlock[resourceName]; ok { - current.Sub(quantity) - resourcesToBlock[resourceName] = current - traceLog.Debug("subtracting allocated resources from reservation", - "reservation", reservation.Name, - "instanceUUID", instanceUUID, - "resource", resourceName, - "quantity", quantity.String()) + if isConfirmed { + existing := confirmedResources[resourceName] + existing.Add(quantity) + confirmedResources[resourceName] = existing + } else { + existing := specOnlyResources[resourceName] + existing.Add(quantity) + specOnlyResources[resourceName] = existing } } } + + resourcesToBlock = make(map[hv1.ResourceName]resource.Quantity) + zero := resource.Quantity{} + for resourceName, slotSize := range reservation.Spec.Resources { + confirmed := confirmedResources[resourceName] + specOnly := specOnlyResources[resourceName] + + remaining := slotSize.DeepCopy() + remaining.Sub(confirmed) + if remaining.Cmp(zero) < 0 { + remaining = zero.DeepCopy() + } + + if specOnly.Cmp(remaining) > 0 { + resourcesToBlock[resourceName] = specOnly.DeepCopy() + } else { + resourcesToBlock[resourceName] = remaining + } + } } else { // For other reservation types or CR without allocations, block full resources resourcesToBlock = reservation.Spec.Resources @@ -229,7 +259,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa "reservationType", reservation.Spec.Type, "freeCPU", freeCPU.String(), "blocked", cpu.String()) - freeCPU = resource.MustParse("0") + freeCPU = resource.Quantity{} } freeResourcesByHost[host]["cpu"] = freeCPU } @@ -244,7 +274,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa "reservationType", reservation.Spec.Type, "freeMemory", freeMemory.String(), "blocked", memory.String()) - freeMemory = resource.MustParse("0") + freeMemory = resource.Quantity{} } freeResourcesByHost[host]["memory"] = freeMemory } diff --git 
a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go index cabc3a3b4..5b026408f 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity_test.go @@ -94,10 +94,22 @@ func newHypervisorWithBothCapacities(name, cpuCap, cpuEffCap, memCap, memEffCap } } +// newCommittedReservation creates a reservation where TargetHost and Status.Host are the same. func newCommittedReservation( + name, host, projectID, flavorName, flavorGroup, cpu, memory string, + specAllocations map[string]v1alpha1.CommittedResourceAllocation, + statusAllocations map[string]string, +) *v1alpha1.Reservation { + + return newMigratingReservation(name, host, host, projectID, flavorName, flavorGroup, cpu, memory, specAllocations, statusAllocations) +} + +// newMigratingReservation creates a reservation where TargetHost and Status.Host may differ, +// used for in-progress reservation migrations or pending placements. +func newMigratingReservation( name, targetHost, observedHost, projectID, flavorName, flavorGroup, cpu, memory string, - specAllocations map[string]v1alpha1.CommittedResourceAllocation, // Spec allocations for CR - statusAllocations map[string]string, // Status allocations for CR (instance UUID -> host) + specAllocations map[string]v1alpha1.CommittedResourceAllocation, + statusAllocations map[string]string, ) *v1alpha1.Reservation { res := &v1alpha1.Reservation{ @@ -266,6 +278,20 @@ func newNovaRequestWithIntent(instanceUUID, projectID, flavorName, flavorGroup s } } +func assertActivations(t *testing.T, activations map[string]float64, expectedHosts, filteredHosts []string) { + t.Helper() + for _, host := range expectedHosts { + if _, ok := activations[host]; !ok { + t.Errorf("expected host %s to pass, got activations: %v", host, activations) + } + } + for _, host := range filteredHosts { + if _, ok := activations[host]; ok { + t.Errorf("expected host %s to be filtered, got activations: %v", host, activations) + } + } +} + // ============================================================================ // Tests // ============================================================================ @@ -325,8 +351,8 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "CommittedResourceReservation of other project blocks some hosts", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("res-1", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("res-2", "host2", "host2", "project-A", "m1.large", "gp-1", "4", "8Gi", nil, nil), + newCommittedReservation("res-1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("res-2", "host2", "project-A", "m1.large", "gp-1", "4", "8Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-B", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -336,20 +362,37 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "CommittedResourceReservation of other project blocks only unused resources of reservation", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("res-1 half used", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", crSpecAllocs(crVm("vm-1", "4", "8Gi")), map[string]string{"vm-1": "host1"}), - 
newCommittedReservation("res-2 fully used", "host2", "host2", "project-A", "m1.large", "gp-1", "4", "8Gi", crSpecAllocs(crVm("vm-2", "4", "8Gi")), map[string]string{"vm-2": "host2"}), + newCommittedReservation("res-1 half used", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", crSpecAllocs(crVm("vm-1", "4", "8Gi")), map[string]string{"vm-1": "host1"}), + newCommittedReservation("res-2 fully used", "host2", "project-A", "m1.large", "gp-1", "4", "8Gi", crSpecAllocs(crVm("vm-2", "4", "8Gi")), map[string]string{"vm-2": "host2"}), }, request: newNovaRequest("instance-123", "project-B", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, expectedHosts: []string{"host1", "host2", "host3"}, filteredHosts: []string{"host4"}, }, + { + // host1: 8 CPU free, 16Gi free. + // Slot=8cpu/16Gi, two confirmed VMs: vm-1=3cpu/6Gi + vm-2=2cpu/4Gi. + // Correct: confirmed sum=5cpu/10Gi → remaining=3cpu/6Gi → block=3cpu/6Gi → free=5cpu/10Gi. + // Bug (only one VM counted): block=5cpu/10Gi → free=3cpu/6Gi → 4-cpu request wrongly filtered. + name: "CommittedResourceReservation blocks only remaining capacity when multiple VMs are confirmed in one slot", + reservations: []*v1alpha1.Reservation{ + newCommittedReservation("res-multi-vm", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", + crSpecAllocs(crVm("vm-1", "3", "6Gi"), crVm("vm-2", "2", "4Gi")), + map[string]string{"vm-1": "host1", "vm-2": "host1"}, + ), + }, + request: newNovaRequest("instance-123", "project-B", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), + opts: FilterHasEnoughCapacityOpts{LockReserved: false}, + expectedHosts: []string{"host1", "host2", "host3"}, // host1: 5cpu/10Gi free → 4cpu/8Gi request passes + filteredHosts: []string{"host4"}, + }, { name: "CommittedResourceReservation of other project blocks both source and target host during migration, ignoring used resources", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("res-1", "host1", "host1", "project-A", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newCommittedReservation("res-2", "host1", "host2", "project-A", "m1.large", "gp-1", "2", "4Gi", nil, nil), // migration reservation from host1 to host2 - newCommittedReservation("res-3", "host2", "host1", "project-A", "m1.large", "gp-1", "2", "4Gi", crSpecAllocs(crVm("vm-1", "2", "4Gi")), map[string]string{"vm-1": "host1"}), // migration reservation from host2 to host1 + newCommittedReservation("res-1", "host1", "project-A", "m1.large", "gp-1", "4", "8Gi", nil, nil), + newMigratingReservation("res-2", "host1", "host2", "project-A", "m1.large", "gp-1", "2", "4Gi", nil, nil), // migration reservation from host1 to host2 + newMigratingReservation("res-3", "host2", "host1", "project-A", "m1.large", "gp-1", "2", "4Gi", crSpecAllocs(crVm("vm-1", "2", "4Gi")), map[string]string{"vm-1": "host1"}), // migration reservation from host2 to host1 }, request: newNovaRequest("instance-123", "project-B", "m1.small", "gp-1", 2, "4Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -360,10 +403,10 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { name: "CommittedResourceReservation unlocks for matching project and flavor group", reservations: []*v1alpha1.Reservation{ // all three reservations 1,2,3 are required to have enough capacity for the request - newCommittedReservation("res-1-unused", "host1", "host1", "project-A", "some 
flavor", "gp-1", "2", "4Gi", nil, nil), // fully unused reservation - newCommittedReservation("res-2-pending-used", "host1", "host1", "project-A", "some flavor", "gp-1", "2", "4Gi", crSpecAllocs(crVm("vm-1", "2", "4Gi")), nil), // reservation with a pending allocation - newCommittedReservation("res-3-used", "host1", "host1", "project-A", "some flavor", "gp-1", "2", "4Gi", crSpecAllocs(crVm("vm-2", "1", "1Gi")), map[string]string{"vm-2": "host1"}), // used reservation - newCommittedReservation("res-4", "host2", "host2", "project-A", "some flavor", "gp-2", "4", "8Gi", nil, nil), // different flavor group, should still block + newCommittedReservation("res-1-unused", "host1", "project-A", "some flavor", "gp-1", "2", "4Gi", nil, nil), // fully unused reservation + newCommittedReservation("res-2-pending-used", "host1", "project-A", "some flavor", "gp-1", "2", "4Gi", crSpecAllocs(crVm("vm-1", "2", "4Gi")), nil), // reservation with a pending allocation + newCommittedReservation("res-3-used", "host1", "project-A", "some flavor", "gp-1", "2", "4Gi", crSpecAllocs(crVm("vm-2", "1", "1Gi")), map[string]string{"vm-2": "host1"}), // used reservation + newCommittedReservation("res-4", "host2", "project-A", "some flavor", "gp-2", "4", "8Gi", nil, nil), // different flavor group, should still block }, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -373,8 +416,8 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "CommittedResourceReservation stays locked when LockReserved is true", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("res-1", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("res-2", "host3", "host3", "project-A", "m1.large", "gp-1", "16", "32Gi", nil, nil), + newCommittedReservation("res-1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("res-2", "host3", "project-A", "m1.large", "gp-1", "16", "32Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: true}, @@ -384,7 +427,7 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "Empty reservation type defaults to CommittedResourceReservation behavior", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("legacy-res", "host1", "host1", "project-A", "m1.large", "gp-1", "4", "8Gi", nil, nil), + newCommittedReservation("legacy-res", "host1", "project-A", "m1.large", "gp-1", "4", "8Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 4, "8Gi", false, []string{"host1", "host2"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -394,9 +437,9 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "All hosts blocked by reservations - none pass", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("res-1", "host1", "host1", "project-X", "m1.xlarge", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("res-2", "host2", "host2", "project-X", "m1.xlarge", "gp-1", "4", "8Gi", nil, nil), - newCommittedReservation("res-3", "host3", "host3", "project-X", "m1.xlarge", "gp-1", "16", "32Gi", nil, nil), + newCommittedReservation("res-1", "host1", "project-X", "m1.xlarge", "gp-1", "8", "16Gi", nil, nil), + 
newCommittedReservation("res-2", "host2", "project-X", "m1.xlarge", "gp-1", "4", "8Gi", nil, nil), + newCommittedReservation("res-3", "host3", "project-X", "m1.xlarge", "gp-1", "16", "32Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -406,7 +449,7 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "Pending reservation (only TargetHost set) blocks capacity on desired host", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("pending-res", "host1", "", "project-X", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newMigratingReservation("pending-res", "host1", "", "project-X", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -416,8 +459,8 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "Multiple reservations: pending and placed block different hosts", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("pending-res", "host1", "", "project-X", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("placed-res", "host2", "host3", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), + newMigratingReservation("pending-res", "host1", "", "project-X", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newMigratingReservation("placed-res", "host2", "host3", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, @@ -427,13 +470,50 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { { name: "Reservation with no host is skipped", reservations: []*v1alpha1.Reservation{ - newCommittedReservation("no-host-res", "", "", "project-X", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("no-host-res", "", "project-X", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 4, "8Gi", false, []string{"host1", "host2", "host3", "host4"}), opts: FilterHasEnoughCapacityOpts{LockReserved: false}, expectedHosts: []string{"host1", "host2", "host3"}, filteredHosts: []string{"host4"}, }, + { + // host1: 8 CPU free, 16Gi free (shared hypervisors). + // Reservation slot=4cpu/8Gi, but confirmed VM consumed 6cpu/10Gi (exceeds slot after resize). + // Unclamped: block = -2cpu/-2Gi → free becomes {10cpu,18Gi}; passes 9-cpu request (wrong). + // Clamped: block = 0 → free stays {8cpu,16Gi}; filtered for 9-cpu request. 
+ name: "Confirmed VMs exceeding reservation size: block clamped to 0", + reservations: []*v1alpha1.Reservation{ + newCommittedReservation("res-oversized-vm", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", + crSpecAllocs(crVm("vm-1", "6", "10Gi")), + map[string]string{"vm-1": "host1"}, + ), + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 9, "17Gi", false, []string{"host1", "host2", "host3", "host4"}), + opts: FilterHasEnoughCapacityOpts{LockReserved: false}, + // host1: 8 free CPU < 9, 16Gi < 17Gi → filtered; host2: 4 < 9 → filtered; host4: no capacity → filtered + expectedHosts: []string{"host3"}, + filteredHosts: []string{"host1", "host2", "host4"}, + }, + { + // host1: 8 CPU free, 16Gi free (shared hypervisors). + // Reservation slot=8cpu/16Gi, confirmed vm-1=4cpu/8Gi (remaining={4,8Gi}). + // Spec-only vm-2=6cpu/12Gi EXCEEDS remaining → block must be {6,12Gi}, not {4,8Gi}. + // Without fix: block={4,8Gi} → free={4cpu,8Gi}; 3-cpu/5Gi request passes (wrong). + // With fix: block={6,12Gi} → free={2cpu,4Gi}; filtered for 3-cpu/5Gi request. + name: "Spec-only VM larger than remaining slot: block covers spec-only VM", + reservations: []*v1alpha1.Reservation{ + newCommittedReservation("res-spec-only-oversize", "host1", "project-X", "m1.large", "gp-1", "8", "16Gi", + crSpecAllocs(crVm("vm-1", "4", "8Gi"), crVm("vm-2", "6", "12Gi")), + map[string]string{"vm-1": "host1"}, // vm-2 is spec-only + ), + }, + request: newNovaRequest("instance-123", "project-A", "m1.small", "gp-1", 3, "5Gi", false, []string{"host1", "host2", "host3", "host4"}), + opts: FilterHasEnoughCapacityOpts{LockReserved: false}, + // host1: 2 free CPU < 3 requested, 4Gi < 5Gi → filtered; host4: no capacity → filtered + expectedHosts: []string{"host2", "host3"}, + filteredHosts: []string{"host1", "host4"}, + }, { name: "FailoverReservation blocks hosts for non-evacuation request even when instance is in Allocations", reservations: []*v1alpha1.Reservation{ @@ -496,7 +576,7 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { objects = append(objects, h.DeepCopy()) } for _, r := range tt.reservations { - objects = append(objects, r) + objects = append(objects, r.DeepCopy()) } step := &FilterHasEnoughCapacity{} @@ -507,18 +587,7 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { if err != nil { t.Fatalf("expected no error, got %v", err) } - - for _, host := range tt.expectedHosts { - if _, ok := result.Activations[host]; !ok { - t.Errorf("expected host %s to be present in activations, but got %+v", host, result.Activations) - } - } - - for _, host := range tt.filteredHosts { - if _, ok := result.Activations[host]; ok { - t.Errorf("expected host %s to be filtered out", host) - } - } + assertActivations(t, result.Activations, tt.expectedHosts, tt.filteredHosts) }) } } @@ -526,6 +595,27 @@ func TestFilterHasEnoughCapacity_ReservationTypes(t *testing.T) { func TestFilterHasEnoughCapacity_IgnoredReservationTypes(t *testing.T) { scheme := buildTestScheme(t) + // Two-host scenario: CR on host1 (4cpu/8Gi, project-X), Failover on host2 (4cpu/8Gi). + // Each host: 8 CPU free after base allocation → after reservation: 4 CPU free each. 
+ twoHostHVs := []*hv1.Hypervisor{ + newHypervisor("host1", "16", "8", "32Gi", "16Gi"), + newHypervisor("host2", "16", "8", "32Gi", "16Gi"), + } + twoHostRes := []*v1alpha1.Reservation{ + newCommittedReservation("cr-res", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), + newFailoverReservation("failover-res", "host2", "4", "8Gi", map[string]string{"other-vm": "host3"}), + } + + // Single-host scenario: both CR (4cpu/8Gi) and Failover (2cpu/4Gi) on host1. + // host1: 12 CPU free → after both reservations: 6 CPU free. + singleHostHVs := []*hv1.Hypervisor{ + newHypervisor("host1", "12", "0", "24Gi", "0"), + } + singleHostRes := []*v1alpha1.Reservation{ + newCommittedReservation("cr-res", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), + newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), + } + tests := []struct { name string hypervisors []*hv1.Hypervisor @@ -538,60 +628,36 @@ func TestFilterHasEnoughCapacity_IgnoredReservationTypes(t *testing.T) { // Two-host scenario tests (CR on host1, Failover on host2) // host1: 8 CPU free, host2: 8 CPU free, CR blocks 4 on host1, Failover blocks 4 on host2 { - name: "Two hosts: No ignore - both hosts blocked by reservations", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "16", "8", "32Gi", "16Gi"), // 8 CPU free - newHypervisor("host2", "16", "8", "32Gi", "16Gi"), // 8 CPU free - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host2", "4", "8Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Two hosts: No ignore - both hosts blocked by reservations", + hypervisors: twoHostHVs, + reservations: twoHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1", "host2"}), ignoredReservationTypes: nil, expectedHosts: []string{}, filteredHosts: []string{"host1", "host2"}, }, { - name: "Two hosts: Ignore CR only - host1 passes, host2 blocked by failover", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "16", "8", "32Gi", "16Gi"), - newHypervisor("host2", "16", "8", "32Gi", "16Gi"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host2", "4", "8Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Two hosts: Ignore CR only - host1 passes, host2 blocked by failover", + hypervisors: twoHostHVs, + reservations: twoHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1", "host2"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource}, expectedHosts: []string{"host1"}, filteredHosts: []string{"host2"}, }, { - name: "Two hosts: Ignore Failover only - host2 passes, host1 blocked by CR", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "16", "8", "32Gi", "16Gi"), - newHypervisor("host2", "16", "8", "32Gi", "16Gi"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host2", "4", "8Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Two hosts: Ignore Failover only - host2 passes, host1 blocked by CR", + hypervisors: twoHostHVs, + reservations: 
twoHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1", "host2"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeFailover}, expectedHosts: []string{"host2"}, filteredHosts: []string{"host1"}, }, { - name: "Two hosts: Ignore both - both hosts pass", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "16", "8", "32Gi", "16Gi"), - newHypervisor("host2", "16", "8", "32Gi", "16Gi"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host2", "4", "8Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Two hosts: Ignore both - both hosts pass", + hypervisors: twoHostHVs, + reservations: twoHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1", "host2"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource, v1alpha1.ReservationTypeFailover}, expectedHosts: []string{"host1", "host2"}, @@ -602,56 +668,36 @@ func TestFilterHasEnoughCapacity_IgnoredReservationTypes(t *testing.T) { // host1: 12 CPU free, CR blocks 4, Failover blocks 2 → 6 free when both active // Large VM (12 CPU) - only fits if BOTH reservations are ignored { - name: "Single host, Large VM (12 CPU): No ignore - blocked (6 free < 12 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), // 12 CPU free - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Large VM (12 CPU): No ignore - blocked (6 free < 12 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 12, "24Gi", false, []string{"host1"}), ignoredReservationTypes: nil, expectedHosts: []string{}, filteredHosts: []string{"host1"}, }, { - name: "Single host, Large VM (12 CPU): Ignore CR - blocked (10 free < 12 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Large VM (12 CPU): Ignore CR - blocked (10 free < 12 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 12, "24Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource}, expectedHosts: []string{}, filteredHosts: []string{"host1"}, }, { - name: "Single host, Large VM (12 CPU): Ignore Failover - blocked (8 free < 12 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Large VM (12 CPU): Ignore Failover - blocked (8 
free < 12 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 12, "24Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeFailover}, expectedHosts: []string{}, filteredHosts: []string{"host1"}, }, { - name: "Single host, Large VM (12 CPU): Ignore both - passes (12 free = 12 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Large VM (12 CPU): Ignore both - passes (12 free = 12 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 12, "24Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource, v1alpha1.ReservationTypeFailover}, expectedHosts: []string{"host1"}, @@ -660,56 +706,36 @@ func TestFilterHasEnoughCapacity_IgnoredReservationTypes(t *testing.T) { // Failover-size VM (10 CPU) - fits if CR is ignored (10 free = 10 needed) { - name: "Single host, Failover-size VM (10 CPU): No ignore - blocked (6 free < 10 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Failover-size VM (10 CPU): No ignore - blocked (6 free < 10 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 10, "20Gi", false, []string{"host1"}), ignoredReservationTypes: nil, expectedHosts: []string{}, filteredHosts: []string{"host1"}, }, { - name: "Single host, Failover-size VM (10 CPU): Ignore CR - passes (10 free = 10 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Failover-size VM (10 CPU): Ignore CR - passes (10 free = 10 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 10, "20Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource}, expectedHosts: []string{"host1"}, filteredHosts: []string{}, }, { - name: "Single host, Failover-size VM (10 CPU): Ignore Failover - blocked (8 free < 10 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Failover-size VM (10 CPU): Ignore Failover 
- blocked (8 free < 10 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 10, "20Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeFailover}, expectedHosts: []string{}, filteredHosts: []string{"host1"}, }, { - name: "Single host, Failover-size VM (10 CPU): Ignore both - passes (12 free > 10 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, Failover-size VM (10 CPU): Ignore both - passes (12 free > 10 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 10, "20Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource, v1alpha1.ReservationTypeFailover}, expectedHosts: []string{"host1"}, @@ -718,56 +744,36 @@ func TestFilterHasEnoughCapacity_IgnoredReservationTypes(t *testing.T) { // CR-size VM (8 CPU) - fits if Failover is ignored (8 free = 8 needed) { - name: "Single host, CR-size VM (8 CPU): No ignore - blocked (6 free < 8 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, CR-size VM (8 CPU): No ignore - blocked (6 free < 8 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1"}), ignoredReservationTypes: nil, expectedHosts: []string{}, filteredHosts: []string{"host1"}, }, { - name: "Single host, CR-size VM (8 CPU): Ignore CR - passes (10 free > 8 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, CR-size VM (8 CPU): Ignore CR - passes (10 free > 8 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource}, expectedHosts: []string{"host1"}, filteredHosts: []string{}, }, { - name: "Single host, CR-size VM (8 CPU): Ignore Failover - passes (8 free = 8 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, CR-size VM (8 CPU): Ignore Failover - passes (8 free = 8 
needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeFailover}, expectedHosts: []string{"host1"}, filteredHosts: []string{}, }, { - name: "Single host, CR-size VM (8 CPU): Ignore both - passes (12 free > 8 needed)", - hypervisors: []*hv1.Hypervisor{ - newHypervisor("host1", "12", "0", "24Gi", "0"), - }, - reservations: []*v1alpha1.Reservation{ - newCommittedReservation("cr-res", "host1", "host1", "project-X", "m1.large", "gp-1", "4", "8Gi", nil, nil), - newFailoverReservation("failover-res", "host1", "2", "4Gi", map[string]string{"other-vm": "host3"}), - }, + name: "Single host, CR-size VM (8 CPU): Ignore both - passes (12 free > 8 needed)", + hypervisors: singleHostHVs, + reservations: singleHostRes, request: newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 8, "16Gi", false, []string{"host1"}), ignoredReservationTypes: []v1alpha1.ReservationType{v1alpha1.ReservationTypeCommittedResource, v1alpha1.ReservationTypeFailover}, expectedHosts: []string{"host1"}, @@ -796,18 +802,7 @@ func TestFilterHasEnoughCapacity_IgnoredReservationTypes(t *testing.T) { if err != nil { t.Fatalf("expected no error, got %v", err) } - - for _, host := range tt.expectedHosts { - if _, ok := result.Activations[host]; !ok { - t.Errorf("expected host %s to be present in activations, but got %+v", host, result.Activations) - } - } - - for _, host := range tt.filteredHosts { - if _, ok := result.Activations[host]; ok { - t.Errorf("expected host %s to be filtered out, but it was present", host) - } - } + assertActivations(t, result.Activations, tt.expectedHosts, tt.filteredHosts) }) } } @@ -835,7 +830,7 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Existing CR reservation on host1 for same project+flavor group - newCommittedReservation("existing-cr", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("existing-cr", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, // Request with reserve_for_committed_resource intent (scheduling a new CR reservation) request: newNovaRequestWithIntent("new-reservation-uuid", "project-A", "m1.large", "gp-1", 4, "8Gi", "reserve_for_committed_resource", false, []string{"host1", "host2"}), @@ -851,7 +846,7 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Existing CR reservation on host1 for same project+flavor group - newCommittedReservation("existing-cr", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("existing-cr", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, // Normal VM create request (no special intent) - CR reservation should be unlocked request: newNovaRequest("vm-instance-123", "project-A", "m1.large", "gp-1", 4, "8Gi", false, []string{"host1", "host2"}), @@ -867,7 +862,7 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Existing CR reservation on host1 for different project - newCommittedReservation("other-project-cr", "host1", "host1", "project-B", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("other-project-cr", "host1", "project-B", "m1.large", "gp-1", "8", 
"16Gi", nil, nil), }, // Request with reserve_for_committed_resource intent request: newNovaRequestWithIntent("new-reservation-uuid", "project-A", "m1.large", "gp-1", 4, "8Gi", "reserve_for_committed_resource", false, []string{"host1", "host2"}), @@ -882,9 +877,9 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Three existing CR reservations on host1 for same project+flavor group - newCommittedReservation("cr-1", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("cr-2", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("cr-3", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("cr-1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("cr-2", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("cr-3", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, // Request with reserve_for_committed_resource intent, needs 10 CPU // After blocking all 3 reservations (24 CPU), only 8 CPU free -> should fail @@ -900,9 +895,9 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Three existing CR reservations on host1 for same project+flavor group - newCommittedReservation("cr-1", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("cr-2", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), - newCommittedReservation("cr-3", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("cr-1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("cr-2", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("cr-3", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, // Normal VM create request, needs 10 CPU // All 3 reservations unlocked for matching project+flavor -> 32 CPU free -> should pass @@ -918,7 +913,7 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Existing CR reservation on host1 blocks all 8 free CPU - newCommittedReservation("existing-cr", "host1", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("existing-cr", "host1", "project-A", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, // Request with reserve_for_committed_resource intent // IgnoredReservationTypes is a safety flag that overrides everything, including intent @@ -938,7 +933,7 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) }, reservations: []*v1alpha1.Reservation{ // Existing CR reservation on host1 blocks all 8 free CPU - newCommittedReservation("existing-cr", "host1", "host1", "project-B", "m1.large", "gp-1", "8", "16Gi", nil, nil), + newCommittedReservation("existing-cr", "host1", "project-B", "m1.large", "gp-1", "8", "16Gi", nil, nil), }, // Normal VM create request (different project, so unlocking via project match won't work) // But IgnoredReservationTypes should make it work @@ -970,19 +965,58 @@ func TestFilterHasEnoughCapacity_ReserveForCommittedResourceIntent(t *testing.T) if err != nil { t.Fatalf("expected no error, got %v", err) } + assertActivations(t, 
result.Activations, tt.expectedHosts, tt.filteredHosts) + }) + } +} - for _, host := range tt.expectedHosts { - if _, ok := result.Activations[host]; !ok { - t.Errorf("expected host %s to be present in activations, but got %+v", host, result.Activations) - } - } +// TestFilterHasEnoughCapacity_PlannedCRDoesNotBlock verifies that a CommittedResource CRD +// in "planned" state has no child Reservation CRDs and therefore blocks no capacity. +// This is correct by design: the filter reads only Reservation CRDs, so planned CRDs +// have no effect regardless of the committed amount. +func TestFilterHasEnoughCapacity_PlannedCRDoesNotBlock(t *testing.T) { + scheme := buildTestScheme(t) - for _, host := range tt.filteredHosts { - if _, ok := result.Activations[host]; ok { - t.Errorf("expected host %s to be filtered out, but it was present", host) - } - } - }) + // A planned CommittedResource: StartTime not yet reached, no Reservation CRDs created. + plannedCR := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "cr-planned-uuid-1"}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "uuid-1", + ProjectID: "project-A", + DomainID: "domain-A", + FlavorGroupName: "gp-1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + Amount: resource.MustParse("16Gi"), + AvailabilityZone: "az-1", + State: v1alpha1.CommitmentStatusPlanned, + }, + Status: v1alpha1.CommittedResourceStatus{ + Conditions: []metav1.Condition{ + { + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonPlanned, + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + hv := newHypervisor("host1", "16", "8", "32Gi", "16Gi") // 8 CPU free, 16Gi free + objects := []client.Object{hv, plannedCR} + // No Reservation CRDs — planned CommittedResources have none. + + step := &FilterHasEnoughCapacity{} + step.Client = fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build() + step.Options = FilterHasEnoughCapacityOpts{LockReserved: false} + + request := newNovaRequest("instance-123", "project-A", "m1.large", "gp-1", 4, "8Gi", false, []string{"host1"}) + result, err := step.Run(slog.Default(), request) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if _, ok := result.Activations["host1"]; !ok { + t.Error("expected host1 to pass: planned CommittedResource has no child Reservations and must not block capacity") } } @@ -1054,18 +1088,206 @@ func TestFilterHasEnoughCapacity_NilEffectiveCapacityFallback(t *testing.T) { if err != nil { t.Fatalf("expected no error, got %v", err) } + assertActivations(t, result.Activations, tt.expectedHosts, tt.filteredHosts) + }) + } +} - for _, host := range tt.expectedHosts { - if _, ok := result.Activations[host]; !ok { - t.Errorf("expected host %s to be present in activations, but got %+v", host, result.Activations) - } - } +// TestFilterHasEnoughCapacity_VMInterReservationMigration covers all realistic phases of a VM +// migrating from res-a (on hv-a) to res-b (on hv-b). 
+// +// Six binary state variables per phase: +// - VM in hv-a allocation (affects hv-a free capacity directly) +// - VM in hv-b allocation (affects hv-b free capacity directly) +// - VM in res-a Spec.Allocations +// - VM in res-a Status.Allocations +// - VM in res-b Spec.Allocations +// - VM in res-b Status.Allocations +// +// Capacity accounting per host: +// +// free = HV.EffectiveCapacity - HV.Allocation +// block = max(slot - confirmed, specOnly) [clamped ≥ 0; else full slot when spec allocs empty] +// net = free - block +// +// All phases use: VM=4cpu/8Gi, slot=8cpu/16Gi, HV total=12cpu/24Gi, request=3cpu/6Gi (project-C). +func TestFilterHasEnoughCapacity_VMInterReservationMigration(t *testing.T) { + scheme := buildTestScheme(t) + + const ( + owner = "project-A" // project owning the reservations and the migrating VM + thirdParty = "project-C" // project making the placement request + flavorGroup = "gp-1" + slotCPU = "8" + slotMem = "16Gi" + hvCapCPU = "12" + hvCapMem = "24Gi" + vmCPU = "4" + vmMem = "8Gi" + ) + + tests := []struct { + name string + hvA *hv1.Hypervisor // allocation=vmCPU/vmMem when VM present, "0"/"0" when absent + hvB *hv1.Hypervisor + resA *v1alpha1.Reservation + resB *v1alpha1.Reservation + expectedHosts []string + filteredHosts []string + }{ + { + // VM fully on hv-a, confirmed in res-a. res-b exists but is empty. + // + // hv-a: free=12-4=8cpu. res-a confirmed → block=slot-confirmed=8-4=4 → net=4. Passes 3-cpu req. + // hv-b: free=12cpu. res-b no allocs → block=full slot=8 → net=4. Passes. + name: "Phase 1: VM on hv-a, confirmed in res-a, res-b empty", + hvA: newHypervisor("hv-a", hvCapCPU, vmCPU, hvCapMem, vmMem), + hvB: newHypervisor("hv-b", hvCapCPU, "0", hvCapMem, "0"), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-a"}, + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, nil, nil), + expectedHosts: []string{"hv-a", "hv-b"}, + filteredHosts: []string{}, + }, + { + // Placement pipeline wrote VM into res-b spec. VM is still running on hv-a (not yet migrated). + // + // hv-a: free=8, res-a confirmed → block=4 → net=4. Passes. + // hv-b: free=12, res-b spec-only(4) → remaining=8, specOnly=4, block=max(8,4)=8 → net=4. Passes. + // + // res-b blocks its full slot even though VM is only in spec: remaining(8) > specOnly(4). + name: "Phase 2: VM on hv-a, confirmed in res-a; added to res-b spec only (migration initiated)", + hvA: newHypervisor("hv-a", hvCapCPU, vmCPU, hvCapMem, vmMem), + hvB: newHypervisor("hv-b", hvCapCPU, "0", hvCapMem, "0"), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-a"}, + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + nil, + ), + expectedHosts: []string{"hv-a", "hv-b"}, + filteredHosts: []string{}, + }, + { + // VM appears in both HV allocations during live migration (transient double-presence). + // res-a: confirmed. res-b: spec-only (controller has not reconciled hv-b yet). + // + // hv-a: free=8, res-a confirmed → block=4 → net=4. Passes. + // hv-b: free=8, res-b spec-only → block=8 → net=0. FAILS — conservative until controller confirms. 
+ name: "Phase 3: VM in both HV allocs (live migration in progress); res-a confirmed, res-b spec-only", + hvA: newHypervisor("hv-a", hvCapCPU, vmCPU, hvCapMem, vmMem), + hvB: newHypervisor("hv-b", hvCapCPU, vmCPU, hvCapMem, vmMem), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-a"}, + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + nil, + ), + expectedHosts: []string{"hv-a"}, + filteredHosts: []string{"hv-b"}, + }, + { + // VM has arrived on hv-b (in alloc) but left hv-a. Controller lag: res-a still confirmed, res-b spec-only. + // + // hv-a: free=12 (VM gone), res-a confirmed → block=4 → net=8. Passes. + // hv-b: free=8, res-b spec-only → block=8 → net=0. FAILS — res-b not confirmed yet. + name: "Phase 4: VM arrived on hv-b (in alloc), left hv-a; res-a still confirmed, res-b spec-only", + hvA: newHypervisor("hv-a", hvCapCPU, "0", hvCapMem, "0"), + hvB: newHypervisor("hv-b", hvCapCPU, vmCPU, hvCapMem, vmMem), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-a"}, + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + nil, + ), + expectedHosts: []string{"hv-a"}, + filteredHosts: []string{"hv-b"}, + }, + { + // Controller confirmed VM in res-b. res-a cleanup not done yet (stale confirmed entry). + // + // hv-a: free=12 (VM gone), res-a confirmed(stale) → block=8-4=4 → net=8. Passes. + // hv-b: free=8, res-b confirmed → block=8-4=4 → net=4. Passes. + // + // hv-a gets its remaining slot capacity back once res-a is cleaned up (phases 6→7). + name: "Phase 5: VM confirmed in res-b; res-a still has confirmed entry (stale, not yet cleaned)", + hvA: newHypervisor("hv-a", hvCapCPU, "0", hvCapMem, "0"), + hvB: newHypervisor("hv-b", hvCapCPU, vmCPU, hvCapMem, vmMem), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-a"}, + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-b"}, + ), + expectedHosts: []string{"hv-a", "hv-b"}, + filteredHosts: []string{}, + }, + { + // VM removed from res-a Spec.Allocations, but res-a Status.Allocations still stale (one controller cycle lag). + // spec allocs empty → else branch: res-a blocks its full slot regardless of status. + // Status allocations are not consulted when spec allocs are absent. + // + // hv-a: free=12, res-a spec-empty → block=full slot=8 → net=4. Passes. + // hv-b: free=8, res-b confirmed → block=4 → net=4. Passes. 
+ name: "Phase 6: VM removed from res-a spec, res-a status stale; res-b confirmed", + hvA: newHypervisor("hv-a", hvCapCPU, "0", hvCapMem, "0"), + hvB: newHypervisor("hv-b", hvCapCPU, vmCPU, hvCapMem, vmMem), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + nil, // spec allocs cleared + map[string]string{"vm-1": "hv-a"}, // status stale — not consulted when spec is empty + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-b"}, + ), + expectedHosts: []string{"hv-a", "hv-b"}, + filteredHosts: []string{}, + }, + { + // Migration fully complete: res-a is empty, VM confirmed in res-b. + // Identical blocking to phase 6 for hv-a (spec-empty → full slot block). + // + // hv-a: free=12, res-a empty → block=8 → net=4. Passes. + // hv-b: free=8, res-b confirmed → block=4 → net=4. Passes. + name: "Phase 7: Migration complete — res-a empty, VM confirmed in res-b", + hvA: newHypervisor("hv-a", hvCapCPU, "0", hvCapMem, "0"), + hvB: newHypervisor("hv-b", hvCapCPU, vmCPU, hvCapMem, vmMem), + resA: newCommittedReservation("res-a", "hv-a", owner, "m1.large", flavorGroup, slotCPU, slotMem, + nil, nil, + ), + resB: newCommittedReservation("res-b", "hv-b", owner, "m1.large", flavorGroup, slotCPU, slotMem, + crSpecAllocs(crVm("vm-1", vmCPU, vmMem)), + map[string]string{"vm-1": "hv-b"}, + ), + expectedHosts: []string{"hv-a", "hv-b"}, + filteredHosts: []string{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + objects := []client.Object{tt.hvA, tt.hvB, tt.resA, tt.resB} - for _, host := range tt.filteredHosts { - if _, ok := result.Activations[host]; ok { - t.Errorf("expected host %s to be filtered out", host) - } + step := &FilterHasEnoughCapacity{} + step.Client = fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build() + step.Options = FilterHasEnoughCapacityOpts{LockReserved: false} + + request := newNovaRequest("instance-new", thirdParty, "m1.small", flavorGroup, 3, "6Gi", false, []string{"hv-a", "hv-b"}) + result, err := step.Run(slog.Default(), request) + if err != nil { + t.Fatalf("unexpected error: %v", err) } + assertActivations(t, result.Activations, tt.expectedHosts, tt.filteredHosts) }) } } From 3749b2be28067965ad4c6d4e8e3735491bb55a50 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 4 May 2026 12:41:50 +0000 Subject: [PATCH 2/9] Bump cortex chart appVersions to sha-e6ae001f [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 463af7d34..f973a1d30 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.45 -appVersion: "sha-07d25e89" +appVersion: "sha-e6ae001f" icon: "https://example.com/icon.png" dependencies: [] From 1d4f049c12af7fc501ebf9a3a872843d00317f17 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 4 May 2026 15:26:49 +0200 Subject: [PATCH 3/9] Expose parallel reconciles option in prometheus datasource ctrl The prometheus datasource controller can be too slow when a fresh database is initialized, such that the knowledges keep waiting too long and alerts are fired. 
It is reasonable to speed up the datasource sync process by exposing a parallel reconciles option like in the openstack datasource controller. --- cmd/manager/main.go | 1 - .../plugins/prometheus/controller.go | 28 ++++++++++++++++--- .../plugins/prometheus/controller_test.go | 6 ++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index e031366f8..4a09323a4 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -573,7 +573,6 @@ func main() { Client: multiclusterClient, Scheme: mgr.GetScheme(), Monitor: monitor, - Conf: conf.GetConfigOrDie[prometheus.PrometheusDatasourceReconcilerConfig](), }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PrometheusDatasourceReconciler") os.Exit(1) diff --git a/internal/knowledge/datasources/plugins/prometheus/controller.go b/internal/knowledge/datasources/plugins/prometheus/controller.go index 0dc0dd63b..26712b54a 100644 --- a/internal/knowledge/datasources/plugins/prometheus/controller.go +++ b/internal/knowledge/datasources/plugins/prometheus/controller.go @@ -12,6 +12,7 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/multicluster" "github.com/cobaltcore-dev/cortex/pkg/sso" corev1 "k8s.io/api/core/v1" @@ -20,19 +21,24 @@ import ( "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) -type PrometheusDatasourceReconcilerConfig struct { +type config struct { // The controller will only touch resources with this scheduling domain. SchedulingDomain v1alpha1.SchedulingDomain `json:"schedulingDomain"` // Secret ref to keystone credentials stored in a k8s secret. KeystoneSecretRef corev1.SecretReference `json:"keystoneSecretRef"` // Secret ref to SSO credentials stored in a k8s secret, if applicable. SSOSecretRef *corev1.SecretReference `json:"ssoSecretRef"` + // The number of parallel reconciles to allow for the controller. + // By default, this will be set to 1. + ParallelReconciles *int `json:"prometheusDatasourceControllerParallelReconciles,omitempty"` } type PrometheusDatasourceReconciler struct { @@ -41,7 +47,7 @@ type PrometheusDatasourceReconciler struct { // Kubernetes scheme to use for the deschedulings. Scheme *runtime.Scheme // Config for the reconciler. - Conf PrometheusDatasourceReconcilerConfig + conf config // Monitor for tracking the datasource syncs. Monitor datasources.Monitor } @@ -199,15 +205,20 @@ func (r *PrometheusDatasourceReconciler) Reconcile(ctx context.Context, req ctrl } func (r *PrometheusDatasourceReconciler) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { + var err error + r.conf, err = conf.GetConfig[config]() + if err != nil { + return err + } bldr := multicluster.BuildController(mcl, mgr) // Watch datasource changes across all clusters. 
- bldr, err := bldr.WatchesMulticluster( + bldr, err = bldr.WatchesMulticluster( &v1alpha1.Datasource{}, &handler.EnqueueRequestForObject{}, predicate.NewPredicateFuncs(func(obj client.Object) bool { // Only react to datasources matching the operator. ds := obj.(*v1alpha1.Datasource) - if ds.Spec.SchedulingDomain != r.Conf.SchedulingDomain { + if ds.Spec.SchedulingDomain != r.conf.SchedulingDomain { return false } // Only react to prometheus datasources. @@ -218,5 +229,14 @@ func (r *PrometheusDatasourceReconciler) SetupWithManager(mgr manager.Manager, m return err } return bldr.Named("cortex-prometheus-datasource"). + WithOptions(controller.TypedOptions[reconcile.Request]{ + // Allow parallel reconciles if configured, otherwise default to 1. + MaxConcurrentReconciles: func() int { + if r.conf.ParallelReconciles != nil { + return *r.conf.ParallelReconciles + } + return 1 + }(), + }). Complete(r) } diff --git a/internal/knowledge/datasources/plugins/prometheus/controller_test.go b/internal/knowledge/datasources/plugins/prometheus/controller_test.go index af6dafa7b..9dfba9f3e 100644 --- a/internal/knowledge/datasources/plugins/prometheus/controller_test.go +++ b/internal/knowledge/datasources/plugins/prometheus/controller_test.go @@ -31,7 +31,7 @@ func TestPrometheusDatasourceReconciler_Creation(t *testing.T) { reconciler := &PrometheusDatasourceReconciler{ Client: client, Scheme: scheme, - Conf: PrometheusDatasourceReconcilerConfig{SchedulingDomain: "test-operator"}, + conf: config{SchedulingDomain: "test-operator"}, Monitor: datasources.Monitor{}, } @@ -43,8 +43,8 @@ func TestPrometheusDatasourceReconciler_Creation(t *testing.T) { t.Error("Scheme should not be nil") } - if reconciler.Conf.SchedulingDomain != "test-operator" { - t.Errorf("Expected scheduling domain 'test-operator', got %s", reconciler.Conf.SchedulingDomain) + if reconciler.conf.SchedulingDomain != "test-operator" { + t.Errorf("Expected scheduling domain 'test-operator', got %s", reconciler.conf.SchedulingDomain) } } From d24a4299638b43ed81209aba3da2efcc13f19bc5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 4 May 2026 13:36:51 +0000 Subject: [PATCH 4/9] Bump cortex chart appVersions to sha-1d4f049c [skip ci] --- helm/library/cortex/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index f973a1d30..458645206 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -3,6 +3,6 @@ name: cortex description: A Helm chart to distribute cortex. type: application version: 0.0.45 -appVersion: "sha-e6ae001f" +appVersion: "sha-1d4f049c" icon: "https://example.com/icon.png" dependencies: [] From 794bcc43666e6433c960217efa1005ea844c3bbe Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Mon, 4 May 2026 15:38:49 +0200 Subject: [PATCH 5/9] Remove all committed resource related alerts --- .../cortex-nova/alerts/nova.alerts.yaml | 252 ------------------ 1 file changed, 252 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index f48f0cb28..b1a8570f4 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -287,258 +287,6 @@ groups: configuration. It is recommended to investigate the pipeline status and logs for more details. 
- # Committed Resource Info API Alerts - - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource info API HTTP 500 errors too high" - description: > - The committed resource info API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems building service info, - such as invalid flavor group data. Limes will not be able to discover available - resources until the issue is resolved. - - # Committed Resource Change API Alerts - - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 400 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may happen when Limes sends a request with - an outdated info version (409), the API is temporarily unavailable, - or the request format is invalid. Limes will typically retry these - requests, so no immediate action is needed unless the errors persist. - - - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 500 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This is not expected and indicates that Cortex - is having an internal problem processing commitment changes. Limes will - continue to retry, but new commitments may not be fulfilled until the - issue is resolved. - - - alert: CortexNovaCommittedResourceLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 - and on() rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API latency too high" - description: > - The committed resource change API (Limes LIQUID integration) is experiencing - high latency (p95 > 30s). This may indicate that the scheduling pipeline - is under heavy load or that reservation scheduling is taking longer than - expected. Limes requests may time out, causing commitment changes to fail. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - ( - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) - ) > 0.5 - and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource rejection rate too high" - description: > - More than 50% of commitment change requests are being rejected. - This may indicate insufficient capacity in the datacenter to fulfill - new commitments, or issues with the commitment scheduling logic. - Rejected commitments are rolled back, so Limes will see them as failed - and may retry or report the failure to users. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API timeout detected" - description: > - The committed resource change API (Limes LIQUID integration) timed out - while waiting for reservations to become ready. This indicates that the - scheduling pipeline is overloaded or reservations are taking too long - to be scheduled. Affected commitment changes are rolled back and Limes - will see them as failed. Consider investigating the scheduler performance - or increasing the timeout configuration. - - # Committed Resource Usage API Alerts - - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 400 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate invalid project IDs or malformed - requests from Limes. Limes will typically retry these requests. - - - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 500 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems fetching reservations - or Nova server data. Limes may receive stale or incomplete usage data. 
- - - alert: CortexNovaCommittedResourceUsageLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 10 - and on() rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API latency too high" - description: > - The committed resource usage API (Limes LIQUID integration) is experiencing - high latency (p95 > 10s). This may indicate slow Nova API responses or - database queries. Limes scrapes may time out, affecting quota reporting. - - # Committed Resource Capacity API Alerts - - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 400 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate malformed requests from Limes. - - - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 500 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems calculating cluster - capacity. Limes may receive stale or incomplete capacity data. - - - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 10 - and on() rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API latency too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is experiencing - high latency (p95 > 10s). This may indicate slow database queries or knowledge - CRD retrieval. Limes scrapes may time out, affecting capacity reporting. - - # Committed Resource Syncer Alerts - # These alerts only fire when the syncer is enabled (metrics are only registered when enabled). - # Absent metrics = syncer disabled = alerts inactive by design. 
-      - alert: CortexNovaCommittedResourceSyncerNotRunning
-        expr: increase(cortex_committed_resource_syncer_duration_seconds_count{service="cortex-nova-metrics"}[3h]) < 1
-        for: 15m
-        labels:
-          context: committed-resource-syncer
-          dashboard: cortex-status-dashboard/cortex-status-dashboard
-          service: cortex
-          severity: warning
-          support_group: workload-management
-        annotations:
-          summary: "Committed Resource syncer has not run in 3 hours"
-          description: >
-            No commitment sync has completed in the last 3 hours. The syncer runs hourly,
-            so at least 2 runs should appear in this window. Check that the syncer task
-            is healthy and Limes is reachable.
-
-      - alert: CortexNovaCommittedResourceSyncerErrors
-        expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
-        for: 5m
-        labels:
-          context: committed-resource-syncer
-          dashboard: cortex-status-dashboard/cortex-status-dashboard
-          service: cortex
-          severity: warning
-          support_group: workload-management
-        annotations:
-          summary: "Committed Resource syncer is repeatedly failing"
-          description: >
-            The committed resource syncer has encountered more than 3 errors in the last
-            hour. Check syncer logs for details; common causes are connectivity issues
-            with Limes or failures writing CommittedResource CRDs.
-
       - alert: CortexNovaDoesntFindValidKVMHosts
         expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}[5m])) > 0
         for: 5m

From ab6eb45dac84d3763aa9a2aac8f68b409dd2e88f Mon Sep 17 00:00:00 2001
From: mblos <156897072+mblos@users.noreply.github.com>
Date: Mon, 4 May 2026 15:43:19 +0200
Subject: [PATCH 6/9] feat: adding operator-controlled per-resource-type
 config of committed resources (#792)

Replaces hard-coded flavor group eligibility logic with
operator-controlled per-resource-type config, and enables flavor groups
with a variable CPU:RAM ratio to accept committed resources
---
 helm/bundles/cortex-nova/values.yaml          |  16 +
 .../commitments/api/change_commitments.go     |   8 +-
 .../api/change_commitments_e2e_test.go        |   3 +
 .../api/change_commitments_test.go            |  14 +-
 .../reservations/commitments/api/info.go      |  38 +-
 .../reservations/commitments/api/info_test.go |  67 +-
 .../committed_resource_controller.go          |   6 +-
 .../committed_resource_controller_test.go     |  39 --
 .../committed_resource_integration_test.go    | 633 ------
 .../reservations/commitments/config.go        |  30 +
 .../commitments/flavor_group_eligibility.go   |  31 -
 .../commitments/integration_test.go           | 539 ++++++++++++++-
 .../commitments/reservation_controller.go     |   1 +
 .../commitments/reservation_manager.go        |   6 +-
 .../commitments/reservation_manager_test.go   |  83 +++
 15 files changed, 744 insertions(+), 770 deletions(-)
 delete mode 100644 internal/scheduling/reservations/commitments/committed_resource_integration_test.go
 delete mode 100644 internal/scheduling/reservations/commitments/flavor_group_eligibility.go

diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml
index f4e9d5725..a83a4f944 100644
--- a/helm/bundles/cortex-nova/values.yaml
+++ b/helm/bundles/cortex-nova/values.yaml
@@ -174,6 +174,22 @@ cortex-scheduling-controllers:
       enableReportUsage: true
       # When false, the endpoint returns HTTP 503.
       enableReportCapacity: true
+      # Maps flavor group IDs to resource flag configs; "*" acts as catch-all.
+      # Controls handlesCommitments, hasCapacity, hasQuota per resource type for each group.
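+      # An exact group ID takes precedence over "*". For example, to let a single
+      # group (hypothetical ID shown) accept RAM commitments while all other groups
+      # stay capacity-only:
+      #   hana_fixed:
+      #     ram: {handlesCommitments: true, hasCapacity: true, hasQuota: false}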
+ flavorGroupResourceConfig: + "*": # catch-all (unknown groups) + ram: + handlesCommitments: false + hasCapacity: true + hasQuota: false + cores: + handlesCommitments: false + hasCapacity: true + hasQuota: false + instances: + handlesCommitments: false + hasCapacity: true + hasQuota: false # OvercommitMappings is a list of mappings that map hypervisor traits to # overcommit ratios. Note that this list is applied in order, so if there # are multiple mappings applying to the same hypervisors, the last mapping diff --git a/internal/scheduling/reservations/commitments/api/change_commitments.go b/internal/scheduling/reservations/commitments/api/change_commitments.go index 9849075b9..b7783b599 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments.go @@ -189,8 +189,8 @@ ProcessLoop: break ProcessLoop } - if !commitments.FlavorGroupAcceptsCommitments(&flavorGroup) { - failedReason = commitments.FlavorGroupCommitmentRejectionReason(&flavorGroup) + if !api.config.ResourceConfigForGroup(flavorGroupName).RAM.HandlesCommitments { + failedReason = fmt.Sprintf("flavor group %q is not configured to handle commitments", flavorGroupName) rollback = true break ProcessLoop } @@ -247,6 +247,10 @@ ProcessLoop: cr.Name = crName if _, err := controllerutil.CreateOrUpdate(ctx, api.client, cr, func() error { applyCRSpec(cr, stateDesired, allowRejection) + if cr.Annotations == nil { + cr.Annotations = make(map[string]string) + } + cr.Annotations[v1alpha1.AnnotationCreatorRequestID] = reservations.GlobalRequestIDFromContext(ctx) return nil }); err != nil { failedReason = fmt.Sprintf("commitment %s: failed to write CommittedResource CRD: %v", commitment.UUID, err) diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go index ee546655b..3f19b4857 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go @@ -123,6 +123,9 @@ func newE2EEnv(t *testing.T, flavors []*TestFlavor, infoVersion int64, scheduler cfg := commitments.DefaultAPIConfig() cfg.WatchTimeout = metav1.Duration{Duration: 5 * time.Second} cfg.WatchPollInterval = metav1.Duration{Duration: 100 * time.Millisecond} + cfg.FlavorGroupResourceConfig = map[string]commitments.FlavorGroupResourcesConfig{ + "*": {RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasCapacity: true}}, + } api := NewAPIWithConfig(k8sClient, cfg, nil) mux := http.NewServeMux() api.Init(mux, prometheus.NewRegistry(), log.Log) diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_test.go index 579173460..a98e840aa 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_test.go @@ -144,6 +144,9 @@ func TestHandleChangeCommitments(t *testing.T) { cfg := commitments.DefaultAPIConfig() cfg.WatchTimeout = metav1.Duration{} cfg.WatchPollInterval = metav1.Duration{Duration: 100 * time.Millisecond} + cfg.FlavorGroupResourceConfig = map[string]commitments.FlavorGroupResourcesConfig{ + "*": {RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasCapacity: true}}, + } return &cfg }(), ExpectedAPIResponse: newAPIResponse("timeout 
reached while processing commitment changes"), @@ -709,7 +712,16 @@ func newCRTestEnv(t *testing.T, tc CommitmentChangeTestCase) *CRTestEnv { if tc.CustomConfig != nil { api = NewAPIWithConfig(wrapped, *tc.CustomConfig, nil) } else { - api = NewAPI(wrapped) + // Default test config: all flavor groups accept RAM commitments via wildcard. + cfg := commitments.DefaultAPIConfig() + cfg.FlavorGroupResourceConfig = map[string]commitments.FlavorGroupResourcesConfig{ + "*": { + RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + }, + } + api = NewAPIWithConfig(wrapped, cfg, nil) } mux := http.NewServeMux() registry := prometheus.NewRegistry() diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go index 2e8ddc8a8..cd8846101 100644 --- a/internal/scheduling/reservations/commitments/api/info.go +++ b/internal/scheduling/reservations/commitments/api/info.go @@ -113,11 +113,7 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l // Build resources map resources := make(map[liquid.ResourceName]liquid.ResourceInfo) for groupName, groupData := range flavorGroups { - // Determine if this group accepts commitments (requires fixed RAM/core ratio) - handlesCommitments := commitments.FlavorGroupAcceptsCommitments(&groupData) - - // All flavor groups are registered for usage reporting. - // Only those with a fixed RAM/core ratio have HandlesCommitments=true. + resCfg := api.config.ResourceConfigForGroup(groupName) flavorNames := make([]string, 0, len(groupData.Flavors)) for _, flavor := range groupData.Flavors { @@ -157,12 +153,12 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l groupData.SmallestFlavor.MemoryMB, flavorListStr, ), - Unit: ramUnit, // Non-standard unit: multiples of smallest flavor RAM + Unit: ramUnit, Topology: liquid.AZAwareTopology, NeedsResourceDemand: false, - HasCapacity: true, // We report capacity via /commitments/v1/report-capacity - HasQuota: false, - HandlesCommitments: handlesCommitments, // Only groups with fixed ratio accept commitments + HasCapacity: resCfg.RAM.HasCapacity, + HasQuota: resCfg.RAM.HasQuota, + HandlesCommitments: resCfg.RAM.HandlesCommitments, Attributes: attrsJSON, } @@ -173,13 +169,13 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l "CPU cores (usable by: %s)", flavorListStr, ), - Unit: liquid.UnitNone, // Countable unit (omitted in JSON = "1") - Topology: liquid.AZAwareTopology, // Same topology as RAM + Unit: liquid.UnitNone, + Topology: liquid.AZAwareTopology, NeedsResourceDemand: false, - HasCapacity: true, // We report capacity (as 0 for now) - HasQuota: false, // No quota enforcement - HandlesCommitments: false, // Cores are derived from RAM commitments - Attributes: attrsJSON, // Same attributes (ratio info) + HasCapacity: resCfg.Cores.HasCapacity, + HasQuota: resCfg.Cores.HasQuota, + HandlesCommitments: resCfg.Cores.HandlesCommitments, + Attributes: attrsJSON, } // === 3. 
Instances Resource === @@ -189,13 +185,13 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l "instances (usable by: %s)", flavorListStr, ), - Unit: liquid.UnitNone, // Countable unit (omitted in JSON = "1") - Topology: liquid.AZAwareTopology, // Same topology as RAM + Unit: liquid.UnitNone, + Topology: liquid.AZAwareTopology, NeedsResourceDemand: false, - HasCapacity: true, // We report capacity (as 0 for now) - HasQuota: false, // No quota enforcement - HandlesCommitments: false, // Instances are derived from RAM commitments - Attributes: attrsJSON, // Same attributes + HasCapacity: resCfg.Instances.HasCapacity, + HasQuota: resCfg.Instances.HasQuota, + HandlesCommitments: resCfg.Instances.HandlesCommitments, + Attributes: attrsJSON, } logger.V(1).Info("registered flavor group resources", diff --git a/internal/scheduling/reservations/commitments/api/info_test.go b/internal/scheduling/reservations/commitments/api/info_test.go index 48e12fd2c..60426a2aa 100644 --- a/internal/scheduling/reservations/commitments/api/info_test.go +++ b/internal/scheduling/reservations/commitments/api/info_test.go @@ -10,6 +10,7 @@ import ( "testing" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" "github.com/sapcc/go-api-declarations/liquid" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -137,11 +138,9 @@ func TestHandleInfo_InvalidFlavorMemory(t *testing.T) { } } -func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { - // Test that ALL flavor groups get resources created: - // - Three resources are created per group: _ram, _cores, _instances - // - Only _ram of groups with FIXED ratio has HandlesCommitments=true - // - All resources have HasCapacity=true +func TestHandleInfo_ResourceFlagsFromConfig(t *testing.T) { + // Test that resource flags (HandlesCommitments, HasCapacity, HasQuota) are read from config, + // not derived from flavor group metadata. Both groups get resources regardless of ratio. 
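+	// (ResourceConfigForGroup itself is added in config.go, which this diff does not
+	// show; judging from its use with the "*" key below, it resolves an exact group
+	// entry first and falls back to the wildcard.)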
scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("failed to add scheme: %v", err) @@ -150,8 +149,6 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { // Create flavor groups knowledge with both fixed and variable ratio groups features := []map[string]interface{}{ { - // Group with fixed ratio - should accept commitments - // Creates 3 resources: _ram, _cores, _instances "name": "hana_fixed", "flavors": []map[string]interface{}{ {"name": "hana_c4_m16", "vcpus": 4, "memoryMB": 16384, "diskGB": 50}, @@ -159,20 +156,18 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { }, "largestFlavor": map[string]interface{}{"name": "hana_c8_m32", "vcpus": 8, "memoryMB": 32768, "diskGB": 100}, "smallestFlavor": map[string]interface{}{"name": "hana_c4_m16", "vcpus": 4, "memoryMB": 16384, "diskGB": 50}, - "ramCoreRatio": 4096, // Fixed: 4096 MiB per vCPU for all flavors + "ramCoreRatio": 4096, }, { - // Group with variable ratio - should NOT accept commitments - // Will be SKIPPED entirely (no resources created) "name": "v2_variable", "flavors": []map[string]interface{}{ - {"name": "v2_c4_m8", "vcpus": 4, "memoryMB": 8192, "diskGB": 50}, // 2048 MiB/vCPU - {"name": "v2_c4_m64", "vcpus": 4, "memoryMB": 65536, "diskGB": 100}, // 16384 MiB/vCPU + {"name": "v2_c4_m8", "vcpus": 4, "memoryMB": 8192, "diskGB": 50}, + {"name": "v2_c4_m64", "vcpus": 4, "memoryMB": 65536, "diskGB": 100}, }, "largestFlavor": map[string]interface{}{"name": "v2_c4_m64", "vcpus": 4, "memoryMB": 65536, "diskGB": 100}, "smallestFlavor": map[string]interface{}{"name": "v2_c4_m8", "vcpus": 4, "memoryMB": 8192, "diskGB": 50}, - "ramCoreRatioMin": 2048, // Variable: min ratio - "ramCoreRatioMax": 16384, // Variable: max ratio + "ramCoreRatioMin": 2048, + "ramCoreRatioMax": 16384, }, } @@ -199,7 +194,21 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { WithObjects(knowledge). 
Build() - api := NewAPI(k8sClient) + // hana_fixed: ram accepts commitments; v2_variable: nothing accepts commitments + cfg := commitments.DefaultAPIConfig() + cfg.FlavorGroupResourceConfig = map[string]commitments.FlavorGroupResourcesConfig{ + "hana_fixed": { + RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + }, + "*": { + RAM: commitments.ResourceTypeConfig{HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + }, + } + api := NewAPIWithConfig(k8sClient, cfg, nil) req := httptest.NewRequest(http.MethodGet, "/commitments/v1/info", http.NoBody) w := httptest.NewRecorder() @@ -217,14 +226,10 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Fatalf("failed to decode response: %v", err) } - // Verify we have 6 resources (3 per flavor group, both groups included) - // hana_fixed generates: _ram, _cores, _instances - // v2_variable generates: _ram, _cores, _instances if len(serviceInfo.Resources) != 6 { t.Fatalf("expected 6 resources (3 per flavor group), got %d", len(serviceInfo.Resources)) } - // Test RAM resource: hw_version_hana_fixed_ram ramResource, ok := serviceInfo.Resources["hw_version_hana_fixed_ram"] if !ok { t.Fatal("expected hw_version_hana_fixed_ram resource to exist") @@ -233,10 +238,9 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Error("hw_version_hana_fixed_ram: expected HasCapacity=true") } if !ramResource.HandlesCommitments { - t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (RAM is primary commitment resource)") + t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (set in config)") } - // Test Cores resource: hw_version_hana_fixed_cores coresResource, ok := serviceInfo.Resources["hw_version_hana_fixed_cores"] if !ok { t.Fatal("expected hw_version_hana_fixed_cores resource to exist") @@ -245,10 +249,9 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Error("hw_version_hana_fixed_cores: expected HasCapacity=true") } if coresResource.HandlesCommitments { - t.Error("hw_version_hana_fixed_cores: expected HandlesCommitments=false (cores are derived)") + t.Error("hw_version_hana_fixed_cores: expected HandlesCommitments=false") } - // Test Instances resource: hw_version_hana_fixed_instances instancesResource, ok := serviceInfo.Resources["hw_version_hana_fixed_instances"] if !ok { t.Fatal("expected hw_version_hana_fixed_instances resource to exist") @@ -257,40 +260,34 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Error("hw_version_hana_fixed_instances: expected HasCapacity=true") } if instancesResource.HandlesCommitments { - t.Error("hw_version_hana_fixed_instances: expected HandlesCommitments=false (instances are derived)") + t.Error("hw_version_hana_fixed_instances: expected HandlesCommitments=false") } - // Variable ratio group DOES have resources now, but HandlesCommitments=false for RAM + // v2_variable is covered by "*" wildcard: HasCapacity=true, HandlesCommitments=false v2RamResource, ok := serviceInfo.Resources["hw_version_v2_variable_ram"] if !ok { - t.Fatal("expected hw_version_v2_variable_ram resource to exist (all groups included)") + t.Fatal("expected hw_version_v2_variable_ram resource to exist") } if !v2RamResource.HasCapacity { t.Error("hw_version_v2_variable_ram: 
expected HasCapacity=true") } if v2RamResource.HandlesCommitments { - t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (variable ratio)") + t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (not in config)") } v2CoresResource, ok := serviceInfo.Resources["hw_version_v2_variable_cores"] if !ok { - t.Fatal("expected hw_version_v2_variable_cores resource to exist (all groups included)") + t.Fatal("expected hw_version_v2_variable_cores resource to exist") } if !v2CoresResource.HasCapacity { t.Error("hw_version_v2_variable_cores: expected HasCapacity=true") } - if v2CoresResource.HandlesCommitments { - t.Error("hw_version_v2_variable_cores: expected HandlesCommitments=false") - } v2InstancesResource, ok := serviceInfo.Resources["hw_version_v2_variable_instances"] if !ok { - t.Fatal("expected hw_version_v2_variable_instances resource to exist (all groups included)") + t.Fatal("expected hw_version_v2_variable_instances resource to exist") } if !v2InstancesResource.HasCapacity { t.Error("hw_version_v2_variable_instances: expected HasCapacity=true") } - if v2InstancesResource.HandlesCommitments { - t.Error("hw_version_v2_variable_instances: expected HandlesCommitments=false") - } } diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go index 0481395fc..2389440e3 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go @@ -38,7 +38,11 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, client.IgnoreNotFound(err) } - ctx = WithNewGlobalRequestID(ctx) + if creatorReq := cr.Annotations[v1alpha1.AnnotationCreatorRequestID]; creatorReq != "" { + ctx = WithGlobalRequestID(ctx, creatorReq) + } else { + ctx = WithNewGlobalRequestID(ctx) + } logger := LoggerFromContext(ctx).WithValues( "component", "committed-resource-controller", "committedResource", req.Name, diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go index 471c013e3..1029ec997 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go @@ -360,13 +360,6 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) { expectedReason: "Reserving", expectRequeue: true, }, - { - name: "guaranteed AllowRejection=true: rejects on failure, no retry", - state: v1alpha1.CommitmentStatusGuaranteed, - allowRejection: true, - expectedReason: "Rejected", - expectRequeue: false, - }, { name: "confirmed AllowRejection=true: rejects on failure, no retry", state: v1alpha1.CommitmentStatusConfirmed, @@ -374,13 +367,6 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) { expectedReason: "Rejected", expectRequeue: false, }, - { - name: "guaranteed AllowRejection=false: retries on failure", - state: v1alpha1.CommitmentStatusGuaranteed, - allowRejection: false, - expectedReason: "Reserving", - expectRequeue: true, - }, { name: "confirmed AllowRejection=false: retries on failure", state: v1alpha1.CommitmentStatusConfirmed, @@ -503,31 +489,6 @@ func TestCommittedResourceController_BadSpec(t *testing.T) { } } -func TestCommittedResourceController_Idempotent(t *testing.T) 
{ - scheme := newCRTestScheme(t) - cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) - k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge()) - controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} - - // Round 1: creates reservation, waits for placement. - if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { - t.Fatalf("reconcile 1: %v", err) - } - // Simulate reservation controller setting Ready=True. - setChildReservationsReady(t, k8sClient, cr.Spec.CommitmentUUID) - // Rounds 2 and 3: accepts, then stays accepted. - for i := 2; i <= 3; i++ { - if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { - t.Fatalf("reconcile %d: %v", i, err) - } - } - - if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 1 { - t.Errorf("expected 1 child reservation after 3 reconciles (idempotency), got %d", got) - } - assertCondition(t, k8sClient, cr.Name, metav1.ConditionTrue, "Accepted") -} - // ============================================================================ // Tests: checkChildReservationStatus generation guard // ============================================================================ diff --git a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go deleted file mode 100644 index 0090e45f5..000000000 --- a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go +++ /dev/null @@ -1,633 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package commitments - -// Integration tests for the CR lifecycle spanning CommittedResourceController and -// CommitmentReservationController. These tests drive both controllers against a shared -// fake client and verify the end-to-end state transitions without mocking internal logic. -// -// Scope: -// - State transition: planned → confirmed produces child Reservations -// - State transition: confirmed → expired cleans up child Reservations -// - Reservation controller places a child Reservation created by the CR controller -// - CR deletion removes all child Reservations - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "strings" - "testing" - "time" - - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - "k8s.io/apimachinery/pkg/api/meta" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" - "github.com/cobaltcore-dev/cortex/api/v1alpha1" -) - -// crIntegrationEnv holds shared state for integration tests. -type crIntegrationEnv struct { - k8sClient client.Client - crController *CommittedResourceController - resController *CommitmentReservationController - schedulerServer *httptest.Server -} - -func newCRIntegrationEnv(t *testing.T) *crIntegrationEnv { - t.Helper() - scheme := newCRTestScheme(t) - - hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} - k8sClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(newTestFlavorKnowledge(), hypervisor). 
- WithStatusSubresource( - &v1alpha1.CommittedResource{}, - &v1alpha1.Reservation{}, - &v1alpha1.Knowledge{}, - ). - WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string { - res, ok := obj.(*v1alpha1.Reservation) - if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { - return nil - } - return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} - }). - WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string { - cr, ok := obj.(*v1alpha1.CommittedResource) - if !ok || cr.Spec.CommitmentUUID == "" { - return nil - } - return []string{cr.Spec.CommitmentUUID} - }). - Build() - - schedulerServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{"host-1"}} - if err := json.NewEncoder(w).Encode(resp); err != nil { - t.Errorf("scheduler encode: %v", err) - } - })) - - crCtrl := &CommittedResourceController{ - Client: k8sClient, - Scheme: scheme, - Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 5 * time.Minute}}, - } - - resCtrl := &CommitmentReservationController{ - Client: k8sClient, - Scheme: scheme, - Conf: ReservationControllerConfig{ - SchedulerURL: schedulerServer.URL, - AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}, - RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute}, - }, - } - if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil { - t.Fatalf("resCtrl.Init: %v", err) - } - - return &crIntegrationEnv{ - k8sClient: k8sClient, - crController: crCtrl, - resController: resCtrl, - schedulerServer: schedulerServer, - } -} - -func (e *crIntegrationEnv) close() { e.schedulerServer.Close() } - -func (e *crIntegrationEnv) reconcileCR(t *testing.T, crName string) { - t.Helper() - req := ctrl.Request{NamespacedName: types.NamespacedName{Name: crName}} - if _, err := e.crController.Reconcile(context.Background(), req); err != nil { - t.Fatalf("CR reconcile: %v", err) - } -} - -func (e *crIntegrationEnv) reconcileReservation(t *testing.T, resName string) { - t.Helper() - req := ctrl.Request{NamespacedName: types.NamespacedName{Name: resName}} - if _, err := e.resController.Reconcile(context.Background(), req); err != nil { - t.Fatalf("reservation reconcile %s: %v", resName, err) - } -} - -func (e *crIntegrationEnv) listChildReservations(t *testing.T, crName string) []v1alpha1.Reservation { - t.Helper() - var list v1alpha1.ReservationList - if err := e.k8sClient.List(context.Background(), &list, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }); err != nil { - t.Fatalf("list reservations: %v", err) - } - prefix := crName + "-" - var children []v1alpha1.Reservation - for _, r := range list.Items { - if strings.HasPrefix(r.Name, prefix) { - children = append(children, r) - } - } - return children -} - -func (e *crIntegrationEnv) getCR(t *testing.T, name string) v1alpha1.CommittedResource { - t.Helper() - var cr v1alpha1.CommittedResource - if err := e.k8sClient.Get(context.Background(), types.NamespacedName{Name: name}, &cr); err != nil { - t.Fatalf("get CR %s: %v", name, err) - } - return cr -} - -// reconcileChildReservations runs the reservation controller twice on every child Reservation -// for crName (first reconcile sets TargetHost, second sets Ready=True), then 
re-reconciles -// the CR so it can observe the placement outcomes. -func (e *crIntegrationEnv) reconcileChildReservations(t *testing.T, crName string) { - t.Helper() - for _, res := range e.listChildReservations(t, crName) { - e.reconcileReservation(t, res.Name) // calls scheduler → sets TargetHost - e.reconcileReservation(t, res.Name) // syncs TargetHost to Status → Ready=True - } - e.reconcileCR(t, crName) -} - -// ============================================================================ -// Integration tests -// ============================================================================ - -// TestCRLifecycle covers the multi-step state transitions that require imperative -// mid-test patches and cannot be expressed as a purely declarative table. -func TestCRLifecycle(t *testing.T) { - t.Run("planned→confirmed: child Reservations created and placed", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - // Reconcile as planned: finalizer added, no Reservations. - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Fatalf("planned: expected 0 reservations, got %d", len(got)) - } - crState := env.getCR(t, cr.Name) - cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil || cond.Reason != "Planned" { - t.Errorf("planned: expected Reason=Planned, got %v", cond) - } - - // Transition to confirmed. - patch := client.MergeFrom(crState.DeepCopy()) - crState.Spec.State = v1alpha1.CommitmentStatusConfirmed - if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { - t.Fatalf("patch state to confirmed: %v", err) - } - env.reconcileCR(t, cr.Name) - - children := env.listChildReservations(t, cr.Name) - if len(children) != 1 { - t.Fatalf("confirmed: expected 1 reservation, got %d", len(children)) - } - env.reconcileChildReservations(t, cr.Name) - - crState = env.getCR(t, cr.Name) - if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { - t.Errorf("confirmed: expected Ready=True") - } - }) - - t.Run("confirmed→expired: child Reservations deleted, CR marked inactive", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - // Bring to confirmed+Ready=True. - env.reconcileCR(t, cr.Name) // adds finalizer - env.reconcileCR(t, cr.Name) // creates Reservations - env.reconcileChildReservations(t, cr.Name) // places slots → Ready=True - - if got := env.listChildReservations(t, cr.Name); len(got) != 1 { - t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got)) - } - - // Transition to expired. 
- crState := env.getCR(t, cr.Name) - patch := client.MergeFrom(crState.DeepCopy()) - crState.Spec.State = v1alpha1.CommitmentStatusExpired - if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { - t.Fatalf("patch state to expired: %v", err) - } - env.reconcileCR(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Errorf("expired: expected 0 reservations, got %d", len(got)) - } - crState = env.getCR(t, cr.Name) - cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil || cond.Status != metav1.ConditionFalse { - t.Errorf("expired: expected Ready=False, got %v", cond) - } - if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) { - t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason) - } - }) - - t.Run("reservation placement: two reconciles set TargetHost then Ready=True", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - - children := env.listChildReservations(t, cr.Name) - if len(children) != 1 { - t.Fatalf("expected 1 child reservation, got %d", len(children)) - } - child := children[0] - - // First reconcile: scheduler call → TargetHost written to Spec. - env.reconcileReservation(t, child.Name) - var afterFirst v1alpha1.Reservation - if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil { - t.Fatalf("get reservation after first reconcile: %v", err) - } - if afterFirst.Spec.TargetHost == "" { - t.Fatalf("expected TargetHost set after first reservation reconcile") - } - - // Second reconcile: TargetHost synced to Status, Ready=True. - env.reconcileReservation(t, child.Name) - var afterSecond v1alpha1.Reservation - if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil { - t.Fatalf("get reservation after second reconcile: %v", err) - } - if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) { - t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions) - } - if afterSecond.Status.Host != "host-1" { - t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host) - } - }) - - t.Run("deletion: finalizer removed, child Reservations cleaned up", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - // Pre-create a child Reservation to verify it gets cleaned up on deletion. - // newTestCommittedResource pre-populates the finalizer, so Delete() immediately sets DeletionTimestamp. 
- child := &v1alpha1.Reservation{ - ObjectMeta: metav1.ObjectMeta{ - Name: "my-cr-0", - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }, - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - CommitmentUUID: "test-uuid-1234", - }, - }, - } - if err := env.k8sClient.Create(context.Background(), child); err != nil { - t.Fatalf("create child reservation: %v", err) - } - - crState := env.getCR(t, cr.Name) - if err := env.k8sClient.Delete(context.Background(), &crState); err != nil { - t.Fatalf("delete CR: %v", err) - } - env.reconcileCR(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Errorf("post-deletion: expected 0 reservations, got %d", len(got)) - } - var final v1alpha1.CommittedResource - err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final) - if client.IgnoreNotFound(err) != nil { - t.Fatalf("unexpected error after deletion: %v", err) - } - if err == nil { - for _, f := range final.Finalizers { - if f == crFinalizer { - t.Errorf("finalizer not removed after deletion reconcile") - } - } - } - }) - - t.Run("confirmed→superseded: child Reservations deleted, CR marked inactive", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - env.reconcileChildReservations(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 1 { - t.Fatalf("pre-supersede: expected 1 reservation, got %d", len(got)) - } - - crState := env.getCR(t, cr.Name) - patch := client.MergeFrom(crState.DeepCopy()) - crState.Spec.State = v1alpha1.CommitmentStatusSuperseded - if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { - t.Fatalf("patch state to superseded: %v", err) - } - env.reconcileCR(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Errorf("superseded: expected 0 reservations, got %d", len(got)) - } - crState = env.getCR(t, cr.Name) - cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil || cond.Status != metav1.ConditionFalse { - t.Errorf("superseded: expected Ready=False, got %v", cond) - } - if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusSuperseded) { - t.Errorf("superseded: expected Reason=%s, got %s", v1alpha1.CommitmentStatusSuperseded, cond.Reason) - } - }) - - t.Run("idempotency: extra reconciles after Accepted do not create extra slots", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - env.reconcileChildReservations(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 1 { - t.Fatalf("pre-idempotency check: expected 1 reservation, got %d", len(got)) - } - - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 1 { - t.Errorf("idempotency: expected 1 reservation 
after extra reconciles, got %d", len(got)) - } - crState := env.getCR(t, cr.Name) - if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { - t.Errorf("idempotency: expected CR to remain Ready=True after extra reconciles") - } - }) - - t.Run("AllowRejection=false: stays Reserving when scheduler rejects", func(t *testing.T) { - hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} - env := newIntgEnv(t, []client.Object{newTestFlavorKnowledge(), hypervisor}, intgRejectScheduler) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - // AllowRejection stays false (the default), so placement failure must requeue, not reject. - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - ctx := context.Background() - crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} - for range 3 { - env.crController.Reconcile(ctx, crReq) //nolint:errcheck - var resList v1alpha1.ReservationList - env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }) - for _, res := range resList.Items { - resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} - env.resController.Reconcile(ctx, resReq) //nolint:errcheck - env.resController.Reconcile(ctx, resReq) //nolint:errcheck - } - env.crController.Reconcile(ctx, crReq) //nolint:errcheck - } - - var final v1alpha1.CommittedResource - if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil { - t.Fatalf("get CR: %v", err) - } - cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil { - t.Fatalf("no Ready condition") - } - if cond.Reason == v1alpha1.CommittedResourceReasonRejected { - t.Errorf("AllowRejection=false: CR must not transition to Rejected, got Reason=%s", cond.Reason) - } - if cond.Reason != v1alpha1.CommittedResourceReasonReserving { - t.Errorf("AllowRejection=false: expected Reason=Reserving, got %s", cond.Reason) - } - }) - - t.Run("externally deleted child Reservation is recreated by CR controller", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - env.reconcileChildReservations(t, cr.Name) - - children := env.listChildReservations(t, cr.Name) - if len(children) != 1 { - t.Fatalf("expected 1 child reservation before deletion, got %d", len(children)) - } - - // Simulate out-of-band deletion of the slot. - child := children[0] - if err := env.k8sClient.Delete(context.Background(), &child); err != nil { - t.Fatalf("delete child reservation: %v", err) - } - - // CR controller detects the missing slot and recreates it. - env.reconcileCR(t, cr.Name) - // Place the new slot. - env.reconcileChildReservations(t, cr.Name) - // CR controller observes Ready=True on the recreated slot. 
- env.reconcileCR(t, cr.Name) - - if got := env.listChildReservations(t, cr.Name); len(got) != 1 { - t.Errorf("expected 1 reservation after recreation, got %d", len(got)) - } - crState := env.getCR(t, cr.Name) - if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { - t.Errorf("expected CR to be Ready=True after slot recreation") - } - }) - - t.Run("AcceptedAt: set when CR accepted", func(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - env.reconcileChildReservations(t, cr.Name) - - crState := env.getCR(t, cr.Name) - if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { - t.Fatalf("expected CR to be Ready=True") - } - if crState.Status.AcceptedAt == nil { - t.Errorf("expected AcceptedAt to be set on acceptance") - } - if crState.Status.AcceptedAmount == nil { - t.Errorf("expected AcceptedAmount to be set on acceptance") - } else if crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { - t.Errorf("AcceptedAmount: want 4Gi, got %s", crState.Status.AcceptedAmount.String()) - } - }) - - t.Run("resize failure: rolls back to AcceptedAmount, prior slot preserved", func(t *testing.T) { - // Scheduler: accepts the first placement call (initial 4 GiB slot), rejects all subsequent. - objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} - env := newIntgEnv(t, objects, intgAcceptFirstScheduler(1)) - defer env.close() - - cr := intgCRAllowRejection("my-cr", "uuid-resize-0001", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - // Phase 1: accept at 4 GiB (1 slot). Uses 1 scheduler call. - intgDriveToTerminal(t, env, []string{cr.Name}) - var crState v1alpha1.CommittedResource - if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &crState); err != nil { - t.Fatalf("get CR: %v", err) - } - if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { - t.Fatalf("phase 1: expected CR to be Ready=True after initial placement") - } - if crState.Status.AcceptedAmount == nil || crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { - t.Fatalf("phase 1: AcceptedAmount must be 4Gi, got %v", crState.Status.AcceptedAmount) - } - - // Phase 2: resize to 8 GiB (needs 2 slots). Scheduler has no more accepts. - patch := client.MergeFrom(crState.DeepCopy()) - crState.Spec.Amount = resource.MustParse("8Gi") - if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { - t.Fatalf("patch CR to 8Gi: %v", err) - } - - ctx := context.Background() - crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} - - // CR controller: applyReservationState bumps gen on existing slot, creates 2nd slot. - env.crController.Reconcile(ctx, crReq) //nolint:errcheck - // Reservation controller: existing slot echoes new ParentGeneration (no scheduler call); - // new slot calls scheduler → rejected. 
- var resList v1alpha1.ReservationList - env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }) - for _, res := range resList.Items { - resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} - env.resController.Reconcile(ctx, resReq) //nolint:errcheck - env.resController.Reconcile(ctx, resReq) //nolint:errcheck - } - // CR controller: detects 2nd slot Ready=False → rollbackToAccepted (keeps 1 slot) → Rejected. - env.crController.Reconcile(ctx, crReq) //nolint:errcheck - - // Rollback must preserve 1 slot (matching AcceptedAmount=4Gi), not delete all. - var finalList v1alpha1.ReservationList - if err := env.k8sClient.List(ctx, &finalList, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }); err != nil { - t.Fatalf("list reservations: %v", err) - } - if len(finalList.Items) != 1 { - t.Errorf("resize rollback: want 1 slot (AcceptedAmount), got %d", len(finalList.Items)) - } - intgAssertCRCondition(t, env.k8sClient, []string{cr.Name}, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) - }) - - t.Run("AllowRejection=false: eventually accepted after scheduler starts accepting", func(t *testing.T) { - // Scheduler rejects the first 2 calls (one per reservation controller reconcile pair), - // then accepts all subsequent. AllowRejection=false means the CR controller retries rather - // than rejecting, so the CR must eventually reach Accepted once the scheduler cooperates. - objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} - env := newIntgEnv(t, objects, intgRejectFirstScheduler(2)) - defer env.close() - - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - // AllowRejection stays false (default), so placement failure must requeue, not reject. 
- if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } - - ctx := context.Background() - crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} - for range 3 { - env.crController.Reconcile(ctx, crReq) //nolint:errcheck - var resList v1alpha1.ReservationList - env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }) - for _, res := range resList.Items { - resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} - env.resController.Reconcile(ctx, resReq) //nolint:errcheck - env.resController.Reconcile(ctx, resReq) //nolint:errcheck - } - env.crController.Reconcile(ctx, crReq) //nolint:errcheck - } - - var final v1alpha1.CommittedResource - if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil { - t.Fatalf("get CR: %v", err) - } - cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil { - t.Fatalf("no Ready condition after retries") - } - if cond.Reason == v1alpha1.CommittedResourceReasonRejected { - t.Errorf("AllowRejection=false: CR must not be Rejected, got Reason=%s", cond.Reason) - } - if cond.Status != metav1.ConditionTrue || cond.Reason != v1alpha1.CommittedResourceReasonAccepted { - t.Errorf("AllowRejection=false: expected Ready=True/Accepted after retries, got Ready=%s/Reason=%s", cond.Status, cond.Reason) - } - }) -} diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index fe05fcc20..c30c87953 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -49,6 +49,20 @@ type CommittedResourceControllerConfig struct { RequeueIntervalRetry metav1.Duration `json:"requeueIntervalRetry"` } +// ResourceTypeConfig holds per-resource flags for a single resource type within a flavor group. +type ResourceTypeConfig struct { + HandlesCommitments bool `json:"handlesCommitments"` + HasCapacity bool `json:"hasCapacity"` + HasQuota bool `json:"hasQuota"` +} + +// FlavorGroupResourcesConfig groups resource type configs for the three resources of a flavor group. +type FlavorGroupResourcesConfig struct { + RAM ResourceTypeConfig `json:"ram"` + Cores ResourceTypeConfig `json:"cores"` + Instances ResourceTypeConfig `json:"instances"` +} + // APIConfig holds configuration for the LIQUID commitment HTTP endpoints. type APIConfig struct { // EnableChangeCommitments controls whether the change-commitments endpoint is active. @@ -64,6 +78,22 @@ type APIConfig struct { // WatchPollInterval is how frequently the change-commitments handler polls // CommittedResource CRD conditions while waiting for the controller outcome. WatchPollInterval metav1.Duration `json:"watchPollInterval"` + // FlavorGroupResourceConfig maps flavor group IDs to resource flag configs; "*" acts as catch-all. + FlavorGroupResourceConfig map[string]FlavorGroupResourcesConfig `json:"flavorGroupResourceConfig,omitempty"` +} + +// ResourceConfigForGroup returns the resource config for the given flavor group ID, +// falling back to the "*" catch-all if no exact match exists. 
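+//
+// Illustrative sketch (group IDs here are hypothetical): given
+//
+//	FlavorGroupResourceConfig: {"hana": cfgA, "*": cfgB}
+//
+// ResourceConfigForGroup("hana") returns cfgA, ResourceConfigForGroup("gp")
+// falls back to cfgB, and with no config at all the zero-value
+// FlavorGroupResourcesConfig (all flags false) is returned.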
+func (c APIConfig) ResourceConfigForGroup(groupID string) FlavorGroupResourcesConfig { + if c.FlavorGroupResourceConfig != nil { + if cfg, ok := c.FlavorGroupResourceConfig[groupID]; ok { + return cfg + } + if cfg, ok := c.FlavorGroupResourceConfig["*"]; ok { + return cfg + } + } + return FlavorGroupResourcesConfig{} } func DefaultAPIConfig() APIConfig { diff --git a/internal/scheduling/reservations/commitments/flavor_group_eligibility.go b/internal/scheduling/reservations/commitments/flavor_group_eligibility.go deleted file mode 100644 index 00218835f..000000000 --- a/internal/scheduling/reservations/commitments/flavor_group_eligibility.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package commitments - -import ( - "fmt" - - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" -) - -// FlavorGroupAcceptsCommitments returns true if the given flavor group can accept committed resources. -// Currently, only groups with a fixed RAM/core ratio (all flavors have the same ratio) accept CRs. -// This is the single source of truth for CR eligibility and should be used across all CR APIs. -func FlavorGroupAcceptsCommitments(fg *compute.FlavorGroupFeature) bool { - return fg.HasFixedRamCoreRatio() -} - -// FlavorGroupCommitmentRejectionReason returns the reason why the given flavor group does not accept CRs. -// Returns empty string if the group accepts commitments. -func FlavorGroupCommitmentRejectionReason(fg *compute.FlavorGroupFeature) string { - if FlavorGroupAcceptsCommitments(fg) { - return "" - } - // Differentiate between missing ratio metadata and variable ratio - if fg.RamCoreRatioMin == nil && fg.RamCoreRatioMax == nil { - return fmt.Sprintf("flavor group %q has no computable RAM/core ratio metadata and does not accept commitments", fg.Name) - } - return fmt.Sprintf("flavor group %q has variable RAM/core ratio (min=%d, max=%d) and does not accept commitments", - fg.Name, *fg.RamCoreRatioMin, *fg.RamCoreRatioMax) -} diff --git a/internal/scheduling/reservations/commitments/integration_test.go b/internal/scheduling/reservations/commitments/integration_test.go index 138f3c74c..e89e2adb1 100644 --- a/internal/scheduling/reservations/commitments/integration_test.go +++ b/internal/scheduling/reservations/commitments/integration_test.go @@ -3,11 +3,12 @@ package commitments -// Table-driven integration tests for the committed-resource lifecycle. +// Integration tests for the committed-resource lifecycle. // -// Each test case wires CommittedResourceController and CommitmentReservationController -// against a shared fake k8s client and a mock Nova scheduler, then drives both -// controllers synchronously until every CR reaches a terminal condition. 
+// Both test suites wire CommittedResourceController and CommitmentReservationController +// against a shared fake k8s client and a mock Nova scheduler: +// - TestCRIntegration — table-driven declarative scenarios +// - TestCRLifecycle — imperative sub-tests for multi-step transitions // // Terminal conditions (no further reconcile expected without external input): // - Ready=True / Accepted @@ -21,6 +22,7 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "strings" "sync/atomic" "testing" "time" @@ -383,6 +385,67 @@ func newIntgEnv(t *testing.T, initialObjects []client.Object, schedulerFn http.H func (e *intgEnv) close() { e.schedulerSrv.Close() } +func newDefaultIntgEnv(t *testing.T) *intgEnv { + t.Helper() + objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} + return newIntgEnv(t, objects, intgAcceptScheduler) +} + +func (e *intgEnv) reconcileCR(t *testing.T, crName string) { + t.Helper() + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: crName}} + if _, err := e.crController.Reconcile(context.Background(), req); err != nil { + t.Fatalf("CR reconcile: %v", err) + } +} + +func (e *intgEnv) reconcileReservation(t *testing.T, resName string) { + t.Helper() + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: resName}} + if _, err := e.resController.Reconcile(context.Background(), req); err != nil { + t.Fatalf("reservation reconcile %s: %v", resName, err) + } +} + +func (e *intgEnv) listChildReservations(t *testing.T, crName string) []v1alpha1.Reservation { + t.Helper() + var list v1alpha1.ReservationList + if err := e.k8sClient.List(context.Background(), &list, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + prefix := crName + "-" + var children []v1alpha1.Reservation + for _, r := range list.Items { + if strings.HasPrefix(r.Name, prefix) { + children = append(children, r) + } + } + return children +} + +func (e *intgEnv) getCR(t *testing.T, name string) v1alpha1.CommittedResource { + t.Helper() + var cr v1alpha1.CommittedResource + if err := e.k8sClient.Get(context.Background(), types.NamespacedName{Name: name}, &cr); err != nil { + t.Fatalf("get CR %s: %v", name, err) + } + return cr +} + +// reconcileChildReservations runs the reservation controller twice on every child Reservation +// for crName (first reconcile sets TargetHost, second sets Ready=True), then re-reconciles +// the CR so it can observe the placement outcomes. +func (e *intgEnv) reconcileChildReservations(t *testing.T, crName string) { + t.Helper() + for _, res := range e.listChildReservations(t, crName) { + e.reconcileReservation(t, res.Name) // calls scheduler → sets TargetHost + e.reconcileReservation(t, res.Name) // syncs TargetHost to Status → Ready=True + } + e.reconcileCR(t, crName) +} + // ============================================================================ // Reconcile driver // ============================================================================ @@ -616,3 +679,471 @@ func intgExistingReservation(name, commitmentUUID string) *v1alpha1.Reservation }, } } + +// ============================================================================ +// Imperative lifecycle tests +// ============================================================================ + +// TestCRLifecycle covers multi-step state transitions that require imperative +// mid-test patches and cannot be expressed as a purely declarative table. 
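+//
+// Sub-tests drive both controllers synchronously through the env helpers:
+// reconcileCR runs one CR-controller pass, and reconcileChildReservations
+// places every child slot (two reservation reconciles each) before
+// re-reconciling the CR, so each assertion observes a deterministic state.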
+func TestCRLifecycle(t *testing.T) { + t.Run("planned→confirmed: child Reservations created and placed", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + // Reconcile as planned: finalizer added, no Reservations. + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Fatalf("planned: expected 0 reservations, got %d", len(got)) + } + crState := env.getCR(t, cr.Name) + cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil || cond.Reason != "Planned" { + t.Errorf("planned: expected Reason=Planned, got %v", cond) + } + + // Transition to confirmed. + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.State = v1alpha1.CommitmentStatusConfirmed + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch state to confirmed: %v", err) + } + env.reconcileCR(t, cr.Name) + + children := env.listChildReservations(t, cr.Name) + if len(children) != 1 { + t.Fatalf("confirmed: expected 1 reservation, got %d", len(children)) + } + env.reconcileChildReservations(t, cr.Name) + + crState = env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("confirmed: expected Ready=True") + } + }) + + t.Run("confirmed→expired: child Reservations deleted, CR marked inactive", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + // Bring to confirmed+Ready=True. + env.reconcileCR(t, cr.Name) // adds finalizer + env.reconcileCR(t, cr.Name) // creates Reservations + env.reconcileChildReservations(t, cr.Name) // places slots → Ready=True + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got)) + } + + // Transition to expired. 
+ crState := env.getCR(t, cr.Name) + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.State = v1alpha1.CommitmentStatusExpired + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch state to expired: %v", err) + } + env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Errorf("expired: expected 0 reservations, got %d", len(got)) + } + crState = env.getCR(t, cr.Name) + cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil || cond.Status != metav1.ConditionFalse { + t.Errorf("expired: expected Ready=False, got %v", cond) + } + if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) { + t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason) + } + }) + + t.Run("reservation placement: two reconciles set TargetHost then Ready=True", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + + children := env.listChildReservations(t, cr.Name) + if len(children) != 1 { + t.Fatalf("expected 1 child reservation, got %d", len(children)) + } + child := children[0] + + // First reconcile: scheduler call → TargetHost written to Spec. + env.reconcileReservation(t, child.Name) + var afterFirst v1alpha1.Reservation + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil { + t.Fatalf("get reservation after first reconcile: %v", err) + } + if afterFirst.Spec.TargetHost == "" { + t.Fatalf("expected TargetHost set after first reservation reconcile") + } + + // Second reconcile: TargetHost synced to Status, Ready=True. + env.reconcileReservation(t, child.Name) + var afterSecond v1alpha1.Reservation + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil { + t.Fatalf("get reservation after second reconcile: %v", err) + } + if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) { + t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions) + } + if afterSecond.Status.Host != "host-1" { + t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host) + } + }) + + t.Run("deletion: finalizer removed, child Reservations cleaned up", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + // Pre-create a child Reservation to verify it gets cleaned up on deletion. + // newTestCommittedResource pre-populates the finalizer, so Delete() immediately sets DeletionTimestamp. 
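+		// The slot name "my-cr-0" carries the crName+"-" prefix that
+		// listChildReservations matches on, so the post-deletion check below sees it.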
+ child := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-cr-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: "test-uuid-1234", + }, + }, + } + if err := env.k8sClient.Create(context.Background(), child); err != nil { + t.Fatalf("create child reservation: %v", err) + } + + crState := env.getCR(t, cr.Name) + if err := env.k8sClient.Delete(context.Background(), &crState); err != nil { + t.Fatalf("delete CR: %v", err) + } + env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Errorf("post-deletion: expected 0 reservations, got %d", len(got)) + } + var final v1alpha1.CommittedResource + err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final) + if client.IgnoreNotFound(err) != nil { + t.Fatalf("unexpected error after deletion: %v", err) + } + if err == nil { + for _, f := range final.Finalizers { + if f == crFinalizer { + t.Errorf("finalizer not removed after deletion reconcile") + } + } + } + }) + + t.Run("confirmed→superseded: child Reservations deleted, CR marked inactive", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Fatalf("pre-supersede: expected 1 reservation, got %d", len(got)) + } + + crState := env.getCR(t, cr.Name) + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.State = v1alpha1.CommitmentStatusSuperseded + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch state to superseded: %v", err) + } + env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Errorf("superseded: expected 0 reservations, got %d", len(got)) + } + crState = env.getCR(t, cr.Name) + cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil || cond.Status != metav1.ConditionFalse { + t.Errorf("superseded: expected Ready=False, got %v", cond) + } + if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusSuperseded) { + t.Errorf("superseded: expected Reason=%s, got %s", v1alpha1.CommitmentStatusSuperseded, cond.Reason) + } + }) + + t.Run("idempotency: extra reconciles after Accepted do not create extra slots", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Fatalf("pre-idempotency check: expected 1 reservation, got %d", len(got)) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Errorf("idempotency: expected 1 reservation after 
extra reconciles, got %d", len(got)) + } + crState := env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("idempotency: expected CR to remain Ready=True after extra reconciles") + } + }) + + t.Run("AllowRejection=false: stays Reserving when scheduler rejects", func(t *testing.T) { + env := newIntgEnv(t, []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")}, intgRejectScheduler) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + // AllowRejection stays false (the default), so placement failure must requeue, not reject. + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + ctx := context.Background() + crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + for range 3 { + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + var resList v1alpha1.ReservationList + env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }) + for _, res := range resList.Items { + resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + } + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + } + + var final v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil { + t.Fatalf("get CR: %v", err) + } + cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + t.Fatalf("no Ready condition") + } + if cond.Reason == v1alpha1.CommittedResourceReasonRejected { + t.Errorf("AllowRejection=false: CR must not transition to Rejected, got Reason=%s", cond.Reason) + } + if cond.Reason != v1alpha1.CommittedResourceReasonReserving { + t.Errorf("AllowRejection=false: expected Reason=Reserving, got %s", cond.Reason) + } + }) + + t.Run("externally deleted child Reservation is recreated by CR controller", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + children := env.listChildReservations(t, cr.Name) + if len(children) != 1 { + t.Fatalf("expected 1 child reservation before deletion, got %d", len(children)) + } + + // Simulate out-of-band deletion of the slot. + child := children[0] + if err := env.k8sClient.Delete(context.Background(), &child); err != nil { + t.Fatalf("delete child reservation: %v", err) + } + + // CR controller detects the missing slot and recreates it. + env.reconcileCR(t, cr.Name) + // Place the new slot. + env.reconcileChildReservations(t, cr.Name) + // CR controller observes Ready=True on the recreated slot. 
+ env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Errorf("expected 1 reservation after recreation, got %d", len(got)) + } + crState := env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("expected CR to be Ready=True after slot recreation") + } + }) + + t.Run("AcceptedAt: set when CR accepted", func(t *testing.T) { + env := newDefaultIntgEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + crState := env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Fatalf("expected CR to be Ready=True") + } + if crState.Status.AcceptedAt == nil { + t.Errorf("expected AcceptedAt to be set on acceptance") + } + if crState.Status.AcceptedAmount == nil { + t.Errorf("expected AcceptedAmount to be set on acceptance") + } else if crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { + t.Errorf("AcceptedAmount: want 4Gi, got %s", crState.Status.AcceptedAmount.String()) + } + }) + + t.Run("resize failure: rolls back to AcceptedAmount, prior slot preserved", func(t *testing.T) { + // Scheduler: accepts the first placement call (initial 4 GiB slot), rejects all subsequent. + objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} + env := newIntgEnv(t, objects, intgAcceptFirstScheduler(1)) + defer env.close() + + cr := intgCRAllowRejection("my-cr", "uuid-resize-0001", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + // Phase 1: accept at 4 GiB (1 slot). Uses 1 scheduler call. + intgDriveToTerminal(t, env, []string{cr.Name}) + var crState v1alpha1.CommittedResource + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &crState); err != nil { + t.Fatalf("get CR: %v", err) + } + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Fatalf("phase 1: expected CR to be Ready=True after initial placement") + } + if crState.Status.AcceptedAmount == nil || crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { + t.Fatalf("phase 1: AcceptedAmount must be 4Gi, got %v", crState.Status.AcceptedAmount) + } + + // Phase 2: resize to 8 GiB (needs 2 slots). Scheduler has no more accepts. + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.Amount = resource.MustParse("8Gi") + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch CR to 8Gi: %v", err) + } + + ctx := context.Background() + crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + + // CR controller: applyReservationState bumps gen on existing slot, creates 2nd slot. + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + // Reservation controller: existing slot echoes new ParentGeneration (no scheduler call); + // new slot calls scheduler → rejected. 
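+	// (Each reservation below is reconciled twice: the first pass handles
+	// TargetHost, the second syncs it to Status, mirroring reconcileChildReservations.)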
+ var resList v1alpha1.ReservationList + env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }) + for _, res := range resList.Items { + resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + } + // CR controller: detects 2nd slot Ready=False → rollbackToAccepted (keeps 1 slot) → Rejected. + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + + // Rollback must preserve 1 slot (matching AcceptedAmount=4Gi), not delete all. + var finalList v1alpha1.ReservationList + if err := env.k8sClient.List(ctx, &finalList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + if len(finalList.Items) != 1 { + t.Errorf("resize rollback: want 1 slot (AcceptedAmount), got %d", len(finalList.Items)) + } + intgAssertCRCondition(t, env.k8sClient, []string{cr.Name}, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) + }) + + t.Run("AllowRejection=false: eventually accepted after scheduler starts accepting", func(t *testing.T) { + // Scheduler rejects the first 2 calls (one per reservation controller reconcile pair), + // then accepts all subsequent. AllowRejection=false means the CR controller retries rather + // than rejecting, so the CR must eventually reach Accepted once the scheduler cooperates. + objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} + env := newIntgEnv(t, objects, intgRejectFirstScheduler(2)) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + // AllowRejection stays false (default), so placement failure must requeue, not reject. 
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + ctx := context.Background() + crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + for range 3 { + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + var resList v1alpha1.ReservationList + env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }) + for _, res := range resList.Items { + resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + } + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + } + + var final v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil { + t.Fatalf("get CR: %v", err) + } + cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + t.Fatalf("no Ready condition after retries") + } + if cond.Reason == v1alpha1.CommittedResourceReasonRejected { + t.Errorf("AllowRejection=false: CR must not be Rejected, got Reason=%s", cond.Reason) + } + if cond.Status != metav1.ConditionTrue || cond.Reason != v1alpha1.CommittedResourceReasonAccepted { + t.Errorf("AllowRejection=false: expected Ready=True/Accepted after retries, got Ready=%s/Reason=%s", cond.Status, cond.Reason) + } + }) +} diff --git a/internal/scheduling/reservations/commitments/reservation_controller.go b/internal/scheduling/reservations/commitments/reservation_controller.go index 96d86aeb2..b65842c60 100644 --- a/internal/scheduling/reservations/commitments/reservation_controller.go +++ b/internal/scheduling/reservations/commitments/reservation_controller.go @@ -73,6 +73,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr } else { ctx = WithNewGlobalRequestID(ctx) } + ctx = reservations.WithRequestID(ctx, req.Name) logger := LoggerFromContext(ctx).WithValues("component", "controller", "reservation", req.Name) // filter for CR reservations diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index d1fa28fda..6d70bcd20 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -172,8 +172,7 @@ func (m *ReservationManager) ApplyCommitmentState( // Phase 5 (CREATE): Create new reservations (capacity increased) for deltaMemoryBytes > 0 { - // Need to create new reservation slots, always prefer largest flavor within the group - // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio + // Select the largest flavor that fits the remaining delta (flavors sorted descending by memory). reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) result.TouchedReservations = append(result.TouchedReservations, *reservation) memValue := reservation.Spec.Resources[hv1.ResourceMemory] @@ -283,7 +282,8 @@ func (m *ReservationManager) newReservation( } name := fmt.Sprintf("%s%d", namePrefix, slotIndex) - // Select first flavor that fits remaining memory (flavors sorted descending by size) + // Select largest flavor that fits remaining memory (flavors sorted descending by memory then vCPUs). 
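+	// If no flavor fits the remaining delta, the smallest flavor in the group is
+	// used as a fallback (the "default to smallest" below).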
+ // This works for both fixed and varying CPU:RAM ratio groups. flavorInGroup := flavorGroup.Flavors[len(flavorGroup.Flavors)-1] // default to smallest memoryBytes := deltaMemoryBytes cpus := int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index b512fc9b5..bb2fdaf52 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -342,3 +342,86 @@ func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) { }) } } + +// variableRatioFlavorGroup returns a flavor group with varying CPU:RAM ratios (GP-style). +// Flavors are sorted descending by memory then vCPUs, matching the knowledge extractor order. +func variableRatioFlavorGroup() compute.FlavorGroupFeature { + minRatio := uint64(2048) // MiB/vCPU + maxRatio := uint64(8192) // MiB/vCPU + return compute.FlavorGroupFeature{ + Name: "gp-group", + Flavors: []compute.FlavorInGroup{ + {Name: "c4_m32", VCPUs: 4, MemoryMB: 32768, DiskGB: 100}, // 8 GiB/vCPU + {Name: "c8_m16", VCPUs: 8, MemoryMB: 16384, DiskGB: 50}, // 2 GiB/vCPU + {Name: "c4_m8", VCPUs: 4, MemoryMB: 8192, DiskGB: 25}, // 2 GiB/vCPU + }, + SmallestFlavor: compute.FlavorInGroup{Name: "c4_m8", VCPUs: 4, MemoryMB: 8192, DiskGB: 25}, + LargestFlavor: compute.FlavorInGroup{Name: "c4_m32", VCPUs: 4, MemoryMB: 32768, DiskGB: 100}, + RamCoreRatioMin: &minRatio, + RamCoreRatioMax: &maxRatio, + } +} + +func TestNewReservation_VariableRatioGroup_SelectsLargestByMemory(t *testing.T) { + // For GP (variable CPU:RAM ratio) groups, flavor selection is driven by memory + // descending, not by ratio. The largest flavor fitting the delta is always chosen. 
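+	// Worked example from the table below: a 24 GiB delta skips c4_m32 (32 GiB
+	// would overshoot) and selects c8_m16 (16 GiB), even though its 2 GiB/vCPU
+	// ratio differs from c4_m32's 8 GiB/vCPU.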
+	manager := &ReservationManager{}
+	fg := variableRatioFlavorGroup()
+
+	tests := []struct {
+		name          string
+		deltaMemoryMB int64
+		wantFlavor    string
+		wantCores     int64
+	}{
+		{
+			name:          "delta fits c4_m32: picks largest by memory",
+			deltaMemoryMB: 32768,
+			wantFlavor:    "c4_m32",
+			wantCores:     4,
+		},
+		{
+			name:          "delta larger than all: picks largest (c4_m32)",
+			deltaMemoryMB: 65536,
+			wantFlavor:    "c4_m32",
+			wantCores:     4,
+		},
+		{
+			name:          "delta between c4_m32 and c8_m16: picks c8_m16",
+			deltaMemoryMB: 24576, // 24 GiB — c8_m16 (16 GiB) fits, c4_m32 (32 GiB) doesn't
+			wantFlavor:    "c8_m16",
+			wantCores:     8,
+		},
+		{
+			name:          "delta equals c8_m16: picks c8_m16 (largest flavor that fits the 16 GiB delta)",
+			deltaMemoryMB: 16384,
+			wantFlavor:    "c8_m16",
+			wantCores:     8,
+		},
+		{
+			name:          "delta fits only c4_m8: picks smallest",
+			deltaMemoryMB: 8192,
+			wantFlavor:    "c4_m8",
+			wantCores:     4,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			deltaBytes := tt.deltaMemoryMB * 1024 * 1024
+			state := &CommitmentState{
+				CommitmentUUID:  "test-uuid",
+				ProjectID:       "project-1",
+				FlavorGroupName: "gp-group",
+			}
+			res := manager.newReservation(state, 0, deltaBytes, fg, "test")
+			if got := res.Spec.CommittedResourceReservation.ResourceName; got != tt.wantFlavor {
+				t.Errorf("flavor: want %s, got %s", tt.wantFlavor, got)
+			}
+			cpuQty := res.Spec.Resources[hv1.ResourceCPU]
+			if got := cpuQty.Value(); got != tt.wantCores {
+				t.Errorf("cores: want %d, got %d", tt.wantCores, got)
+			}
+		})
+	}
+}

From e46387c4e8faf9994c3d9bc1261852a99cc11c31 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 4 May 2026 13:51:35 +0000
Subject: [PATCH 7/9] Bump cortex chart appVersions to sha-ab6eb45d [skip ci]

---
 helm/library/cortex/Chart.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml
index 458645206..4eebd56c4 100644
--- a/helm/library/cortex/Chart.yaml
+++ b/helm/library/cortex/Chart.yaml
@@ -3,6 +3,6 @@ name: cortex
 description: A Helm chart to distribute cortex.
 type: application
 version: 0.0.45
-appVersion: "sha-1d4f049c"
+appVersion: "sha-ab6eb45d"
 icon: "https://example.com/icon.png"
 dependencies: []

From bf8cc0d1e2461840c2d18769f659f837fabb73fe Mon Sep 17 00:00:00 2001
From: "cortex-ai-agents[bot]"
 <279748396+cortex-ai-agents[bot]@users.noreply.github.com>
Date: Mon, 4 May 2026 16:08:36 +0200
Subject: [PATCH 8/9] Bump cortex chart to v0.0.46 (sha-ab6eb45d) and bundles
 to v0.0.59 (#795)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary
- Bumps the cortex library chart appVersion to `sha-ab6eb45d` to track the
  latest commit on main (feat: adding operator-controlled per-resource-type
  config of committed resources #792)

This ensures the release PR #793 includes the correct appVersion for all changes.
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: github-actions[bot] Co-authored-by: Claude Opus 4.7 Co-authored-by: cortex-ai-agents[bot] Co-authored-by: Philipp Matthes <27271818+PhilippMatthes@users.noreply.github.com> --- helm/bundles/cortex-cinder/Chart.yaml | 6 +++--- helm/bundles/cortex-crds/Chart.yaml | 4 ++-- helm/bundles/cortex-ironcore/Chart.yaml | 4 ++-- helm/bundles/cortex-manila/Chart.yaml | 6 +++--- helm/bundles/cortex-nova/Chart.yaml | 6 +++--- helm/bundles/cortex-pods/Chart.yaml | 4 ++-- helm/library/cortex/Chart.yaml | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index c1a93f75e..9118e6f60 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. type: application -version: 0.0.58 +version: 0.0.59 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 4972527e3..0fe152845 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.58 +version: 0.0.59 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 2f97392d9..079aed03e 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.58 +version: 0.0.59 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 484789b26..25e81f47e 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. 
type: application -version: 0.0.58 +version: 0.0.59 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index e0b941ee1..1d838afa0 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. type: application -version: 0.0.58 +version: 0.0.59 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index e5f17d322..a64fe009c 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. type: application -version: 0.0.58 +version: 0.0.59 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.45 + version: 0.0.46 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 4eebd56c4..d4519077d 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. type: application -version: 0.0.45 +version: 0.0.46 appVersion: "sha-ab6eb45d" icon: "https://example.com/icon.png" dependencies: [] From 2c7ae89758898f046b1d40607e9f851832247ba8 Mon Sep 17 00:00:00 2001 From: "cortex-ai-agents[bot]" <279748396+cortex-ai-agents[bot]@users.noreply.github.com> Date: Mon, 4 May 2026 16:09:03 +0200 Subject: [PATCH 9/9] Update changelog for release PR #793 (#794) Adds CHANGELOG.md with release notes for PR #793. This PR should be merged **after** the release PR #793. 
--------- Co-authored-by: Claude Co-authored-by: github-actions[bot] Co-authored-by: cortex-ai-agents[bot] --- CHANGELOG.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9386d04a..a7ff9901a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # Changelog +## 2026-05-04 — [#793](https://github.com/cobaltcore-dev/cortex/pull/793) + +### cortex v0.0.46 (sha-ab6eb45d) + +Non-breaking changes: +- Fix capacity filter to correctly account for multi-VM CommittedResource reservation slots — confirmed VMs are now summed (not just the last one), blocks are clamped to zero when confirmed exceeds slot size, and spec-only VMs larger than remaining slot are fully covered +- Expose `prometheusDatasourceControllerParallelReconciles` config option to allow parallel reconciles in the Prometheus datasource controller, reducing initial sync latency +- Remove `Conf` field from PrometheusDatasourceReconciler — config is now loaded internally via `conf.GetConfig` during `SetupWithManager` +- Add operator-controlled per-resource-type config (`flavorGroupResourceConfig`) for committed resources, replacing runtime derivation from flavor group metadata; supports wildcard (`*`) catch-all for unknown groups +- Propagate `AnnotationCreatorRequestID` from the change-commitments API to the CommittedResource CRD and through the reservation controller for end-to-end request tracing + +### cortex-nova v0.0.59 (sha-ab6eb45d) + +Includes updated chart cortex v0.0.46. + +Non-breaking changes: +- Remove all committed resource related Prometheus alerts (info API, change API, usage API, capacity API, and syncer alerts) +- Add `flavorGroupResourceConfig` to cortex-nova values.yaml with a wildcard default that sets `hasCapacity: true` for ram, cores, and instances + ## 2026-05-04 — [#779](https://github.com/cobaltcore-dev/cortex/pull/779) ### cortex v0.0.45 (sha-1fb35660)