Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 21 additions & 17 deletions api/v1alpha1/committed_resource_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,38 +106,42 @@ type CommittedResourceSpec struct {

// CommittedResourceStatus defines the observed state of CommittedResource.
type CommittedResourceStatus struct {
// AcceptedAmount is the quantity the controller last successfully provisioned as Reservation slots.
// Nil if the spec has never been successfully reconciled.
// AcceptedSpec is a snapshot of Spec from the last successful reconcile.
// Used by rollbackToAccepted to restore the exact previously-accepted placement (AZ, amount,
// project, domain, flavor group) even when the current spec has already been mutated to a new value.
// +kubebuilder:validation:Optional
AcceptedAmount *resource.Quantity `json:"acceptedAmount,omitempty"`
AcceptedSpec *CommittedResourceSpec `json:"acceptedSpec,omitempty"`

// AcceptedAt is when the controller last successfully reconciled the spec into Reservation slots.
// +kubebuilder:validation:Optional
AcceptedAt *metav1.Time `json:"acceptedAt,omitempty"`

// LastChanged is when the spec was last written by the syncer.
// When AcceptedAt is older than LastChanged, the controller has pending work.
// +kubebuilder:validation:Optional
LastChanged *metav1.Time `json:"lastChanged,omitempty"`

// LastReconcileAt is when the controller last ran its reconcile loop for this resource.
// +kubebuilder:validation:Optional
LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"`

// AssignedVMs holds the UUIDs of VMs deterministically assigned to this committed resource.
// Populated by the usage reconciler; used to compute UsedAmount and drive the quota controller.
// AssignedInstances holds the UUIDs of VM instances deterministically assigned to this committed resource.
// Populated by the usage reconciler; used to compute UsedResources and drive the quota controller.
// +kubebuilder:validation:Optional
AssignedVMs []string `json:"assignedVMs,omitempty"`
AssignedInstances []string `json:"assignedInstances,omitempty"`

// UsedAmount is the sum of assigned VM resources expressed in the same units as Spec.Amount.
// Populated by the usage reconciler.
// UsedResources is the total resource consumption of assigned VM instances, keyed by resource type
// (e.g. "memory" in MiB binary SI, "cpu" as core count). Populated by the usage reconciler.
// +kubebuilder:validation:Optional
UsedAmount *resource.Quantity `json:"usedAmount,omitempty"`
UsedResources map[string]resource.Quantity `json:"usedResources,omitempty"`

// LastUsageReconcileAt is when the usage reconciler last updated AssignedVMs and UsedAmount.
// LastUsageReconcileAt is when the usage reconciler last updated AssignedInstances and UsedResources.
// +kubebuilder:validation:Optional
LastUsageReconcileAt *metav1.Time `json:"lastUsageReconcileAt,omitempty"`

// UsageObservedGeneration is the CR generation that the usage reconciler last processed.
// Follows the Kubernetes observedGeneration pattern: when this differs from
// metadata.generation the cooldown is bypassed so spec changes (e.g. shrink) are reflected
// immediately rather than waiting for the next cooldown interval.
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Minimum=0
UsageObservedGeneration *int64 `json:"usageObservedGeneration,omitempty"`

// Conditions holds the current status conditions.
// +kubebuilder:validation:Optional
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
Expand All @@ -163,8 +167,8 @@ const (
// +kubebuilder:printcolumn:name="ResourceType",type="string",JSONPath=".spec.resourceType"
// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
// +kubebuilder:printcolumn:name="Amount",type="string",JSONPath=".spec.amount"
// +kubebuilder:printcolumn:name="AcceptedAmount",type="string",JSONPath=".status.acceptedAmount"
// +kubebuilder:printcolumn:name="UsedAmount",type="string",JSONPath=".status.usedAmount"
// +kubebuilder:printcolumn:name="UsedMemory",type="string",JSONPath=".status.usedResources.memory",priority=1
// +kubebuilder:printcolumn:name="UsedCPU",type="string",JSONPath=".status.usedResources.cpu",priority=1
// +kubebuilder:printcolumn:name="State",type="string",JSONPath=".spec.state"
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
// +kubebuilder:printcolumn:name="StartTime",type="date",JSONPath=".spec.startTime",priority=1
Expand Down
1 change: 1 addition & 0 deletions api/v1alpha1/datasource_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
NovaDatasourceTypeFlavors NovaDatasourceType = "flavors"
NovaDatasourceTypeMigrations NovaDatasourceType = "migrations"
NovaDatasourceTypeAggregates NovaDatasourceType = "aggregates"
NovaDatasourceTypeImages NovaDatasourceType = "images"
)

type NovaDatasource struct {
Expand Down
31 changes: 17 additions & 14 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 41 additions & 3 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package main
import (
"context"
"crypto/tls"
"encoding/json"
"flag"
"fmt"
"log/slog"
Expand Down Expand Up @@ -102,7 +103,10 @@ func main() {
restConfig := ctrl.GetConfigOrDie()

// Custom entrypoint for scheduler e2e tests.
if len(os.Args) == 2 {
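// Usage: /main <subcommand> [json-override]
// The optional json-override is a JSON object that is unmarshalled on top of the config
// loaded from the ConfigMap: fields present in the override replace the loaded values
// (slices are replaced, not appended to); fields absent from the override keep their
// ConfigMap value. For e2e-commitments, an override that is not valid JSON is logged and
// the process exits with status 1. Example:
// /main e2e-commitments '{"noCleanup":true,"azs":["qa-de-1a"]}'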
if len(os.Args) >= 2 {
copts := client.Options{Scheme: scheme}
client := must.Return(client.New(restConfig, copts))
switch os.Args[1] {
Expand All @@ -119,7 +123,21 @@ func main() {
return
case "e2e-commitments":
commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]()
commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
if len(os.Args) >= 3 {
if err := json.Unmarshal([]byte(os.Args[2]), &commitmentsChecksConfig); err != nil {
slog.Error("invalid json override for e2e-commitments", "err", err)
os.Exit(1)
}
}
func() {
defer func() {
if r := recover(); r != nil {
slog.Error("e2e check failed", "reason", r)
os.Exit(1)
}
}()
commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
}()
return
}
}
Expand Down Expand Up @@ -549,14 +567,34 @@ func main() {
os.Exit(1)
}

crControllerConf := commitmentsConfig.CommittedResourceController
crControllerConf.ApplyDefaults()
if err := (&commitments.CommittedResourceController{
Client: multiclusterClient,
Scheme: mgr.GetScheme(),
Conf: commitmentsConfig.CommittedResourceController,
Conf: crControllerConf,
}).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "CommittedResource")
os.Exit(1)
}

usageReconcilerMonitor := commitments.NewUsageReconcilerMonitor()
metrics.Registry.MustRegister(&usageReconcilerMonitor)
if commitmentsUsageDB == nil {
setupLog.Error(nil, "UsageReconciler requires a datasource but commitments.datasourceName is not configured — skipping")
} else {
usageReconcilerConf := commitmentsConfig.UsageReconciler
usageReconcilerConf.ApplyDefaults()
if err := (&commitments.UsageReconciler{
Client: multiclusterClient,
Conf: usageReconcilerConf,
UsageDB: commitmentsUsageDB,
Monitor: usageReconcilerMonitor,
}).SetupWithManager(mgr, multiclusterClient); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "CommittedResourceUsage")
os.Exit(1)
}
}
}
if slices.Contains(mainConfig.EnabledControllers, "datasource-controllers") {
setupLog.Info("enabling controller", "controller", "datasource-controllers")
Expand Down
24 changes: 24 additions & 0 deletions helm/bundles/cortex-nova/templates/datasources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,30 @@ spec:
---
# Datasource "nova-images": an openstack datasource of kind nova/images in the
# "nova" scheduling domain. It references the cortex-nova-postgres database secret
# and the cortex-nova-openstack-keystone secret, both in the release namespace.
apiVersion: cortex.cloud/v1alpha1
kind: Datasource
metadata:
name: nova-images
spec:
schedulingDomain: nova
databaseSecretRef:
name: cortex-nova-postgres
namespace: {{ .Release.Namespace }}
# The ssoSecretRef below is only rendered when .Values.openstack.sso.enabled is set.
{{- if .Values.openstack.sso.enabled }}
ssoSecretRef:
name: cortex-nova-openstack-sso
namespace: {{ .Release.Namespace }}
{{- end }}
type: openstack
openstack:
# 3600s = one hour. NOTE(review): presumably the period between syncs of this
# datasource; confirm against the datasource controller.
syncInterval: 3600s
secretRef:
name: cortex-nova-openstack-keystone
namespace: {{ .Release.Namespace }}
type: nova
nova:
# Must match the NovaDatasourceType value "images" declared in api/v1alpha1/datasource_types.go.
type: images
---
apiVersion: cortex.cloud/v1alpha1
kind: Datasource
metadata:
name: limes-project-commitments
spec:
Expand Down
9 changes: 8 additions & 1 deletion helm/bundles/cortex-nova/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,10 @@ cortex-scheduling-controllers:
# URL of the nova external scheduler API for placement decisions
schedulerURL: "http://localhost:8080/scheduler/nova/external"
committedResourceController:
# Back-off interval while CommittedResource placement is pending or failed
# Back-off interval while CommittedResource placement is pending or failed (base for exponential backoff)
requeueIntervalRetry: "1m"
# Upper bound for the exponential retry delay; the back-off interval never exceeds this value
maxRequeueInterval: "30m"
committedResourceAPI:
# Timeout for watching CommittedResource CRDs before rolling back
watchTimeout: "10s"
Expand Down Expand Up @@ -196,6 +198,11 @@ cortex-scheduling-controllers:
handlesCommitments: false
hasCapacity: true
hasQuota: false
committedResourceUsageReconciler:
# Minimum time between usage reconcile runs for the same CommittedResource.
# Also acts as the periodic fallback interval: a successful reconcile schedules
# the next run after this duration, so this is also the maximum status staleness.
cooldownInterval: "5m"
# OvercommitMappings is a list of mappings that map hypervisor traits to
# overcommit ratios. Note that this list is applied in order, so if there
# are multiple mappings applying to the same hypervisors, the last mapping
Expand Down
Loading