diff --git a/.claude/agents/release-bump-charts.md b/.claude/agents/release-bump-charts.md
new file mode 100644
index 000000000..dccccaaa0
--- /dev/null
+++ b/.claude/agents/release-bump-charts.md
@@ -0,0 +1,76 @@
+---
+name: release-bump-charts
+description: Takes a release digest with breaking change info, bumps helm chart versions accordingly, and opens or updates a bump PR for a given release PR number.
+tools: Bash, Read, Write, Edit, Agent
+model: inherit
+---
+
+# Release Bump Charts Agent
+
+You receive a release digest (with breaking change info) and the release PR number. Your job is to bump the helm chart versions and open/update a PR.
+
+---
+
+## Input
+
+The caller provides:
+1. The release PR number (e.g. `123`)
+2. The release digest containing `### Changed Charts` and `### Breaking Changes` sections
+
+## Step 1: Parse the digest
+
+From the digest, identify:
+- Which library charts changed (from `### Changed Charts`)
+- Whether any breaking changes exist (from `### Breaking Changes`)
+
+## Step 2: Bump versions
+
+For each changed library chart listed in the digest, update `helm/library/<chart>/Chart.yaml`:
+- If there are breaking changes for that chart: **minor-bump** the `version` (e.g. `0.5.14` → `0.6.0`)
+- If no breaking changes: **patch-bump** the `version` (e.g. `0.5.14` → `0.5.15`)
+
+Do NOT touch `appVersion`.
+
+Then update the matching `dependencies[].version` entry in every `helm/bundles/*/Chart.yaml` that references the bumped library chart.
+
+## Step 3: Check for existing bump PR
+
+```
+gh pr list --head release/bump-charts-<pr-number> --state open --json number,url
+```
+
+## Step 4a: If a PR already exists
+
+1. Check out the existing `release/bump-charts-<pr-number>` branch
+2. Reset it to `main`: `git reset --hard origin/main`
+3. Apply the version bumps on top
+4. Force-push the branch
+5. Update the existing PR title and body with `gh pr edit`
+
+## Step 4b: If no PR exists
+
+1. Create branch `release/bump-charts-<pr-number>` from `main`
+2. Apply the version bumps
+3. Commit changes with message: `Bump chart versions for release PR #<pr-number>`
+4. Push the branch
+5. Use the **pull-request-creator** agent to open a PR. Provide the motivation:
+   - Which charts were bumped and to which versions
+   - Note that this PR should be merged before the release PR #<pr-number>
+
+## Step 5: Report
+
+Return a structured report:
+
+```
+## Bump Charts Result
+
+### PR
+- PR #XXX: <url> (opened/updated)
+
+### Bumped Charts
+- cortex: 0.0.47 → 0.0.48
+- cortex-postgres: 0.5.14 → 0.5.15
+
+### Updated Bundles
+- cortex-nova/Chart.yaml: cortex-postgres 0.5.14 → 0.5.15, cortex 0.0.47 → 0.0.48
+```
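As a reading aid for Step 2 above: the bump rule reduces to simple semver arithmetic. A minimal Go sketch, assuming plain `MAJOR.MINOR.PATCH` chart versions; the function name is illustrative and not part of the repository:

```go
package release

import (
	"fmt"
	"strconv"
	"strings"
)

// bumpChartVersion applies the Step 2 rule: a chart with breaking changes is
// minor-bumped (0.5.14 → 0.6.0), otherwise it is patch-bumped (0.5.14 → 0.5.15).
// appVersion is deliberately left alone.
func bumpChartVersion(version string, breaking bool) (string, error) {
	parts := strings.Split(version, ".")
	if len(parts) != 3 {
		return "", fmt.Errorf("unexpected chart version %q", version)
	}
	nums := make([]int, 3)
	for i, p := range parts {
		n, err := strconv.Atoi(p)
		if err != nil {
			return "", fmt.Errorf("unexpected chart version %q: %w", version, err)
		}
		nums[i] = n
	}
	if breaking {
		return fmt.Sprintf("%d.%d.0", nums[0], nums[1]+1), nil
	}
	return fmt.Sprintf("%d.%d.%d", nums[0], nums[1], nums[2]+1), nil
}
```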
diff --git a/.claude/agents/release-changelog.md b/.claude/agents/release-changelog.md
new file mode 100644
index 000000000..0839c22f6
--- /dev/null
+++ b/.claude/agents/release-changelog.md
@@ -0,0 +1,100 @@
+---
+name: release-changelog
+description: Takes a release digest with bumped chart versions, generates a changelog entry, prepends it to CHANGELOG.md, and opens or updates a changelog PR.
+tools: Bash, Read, Write, Edit, Agent
+model: inherit
+---
+
+# Release Changelog Agent
+
+You receive the release PR number, a release digest (with commit details and breaking changes), and the bumped chart versions. Your job is to generate the changelog entry, prepend it to CHANGELOG.md, and open/update a PR.
+
+---
+
+## Input
+
+The caller provides:
+1. The release PR number (e.g. `123`)
+2. The release digest (with commits by component, breaking changes, and changed charts)
+3. The bumped chart versions (e.g. `cortex: 0.0.47 → 0.0.48, cortex-postgres: 0.5.14 → 0.6.0`)
+
+## Step 1: Generate the changelog entry
+
+Using the digest and bumped versions, generate a changelog following this template:
+
+```markdown
+## YYYY-MM-DD — [#NNN](https://github.com/cobaltcore-dev/cortex/pull/NNN)
+
+### <chart> v<version> (<sha>)
+
+Breaking changes:
+- <one line per change>
+
+Non-breaking changes:
+- <one line per change>
+
+... repeat for each changed chart ...
+
+### General
+
+Breaking changes:
+- <one line per change>
+
+Non-breaking changes:
+- <one line per change>
+```
+
+Rules:
+- Use the NEW bumped version numbers (provided in input), NOT the pre-bump versions.
+- One `###` section per changed chart only.
+- For bundle charts, list which library versions they include, then any bundle-specific changes.
+- Omit `### General` if empty.
+- No commit SHAs, one line per bullet.
+- Omit `Breaking changes:` subsection if there are none for that chart.
+- Omit `Non-breaking changes:` subsection if there are none for that chart.
+
+## Step 2: Update CHANGELOG.md
+
+1. If `CHANGELOG.md` does not exist, create it with a `# Changelog` header.
+2. Read the existing `CHANGELOG.md`.
+3. Insert the new changelog entry immediately below the `# Changelog` header (before any existing entries).
+
+## Step 3: Check for existing changelog PR
+
+```
+gh pr list --head release/changelog-<pr-number> --state open --json number,url
+```
+
+## Step 4a: If a PR already exists
+
+1. Check out the existing `release/changelog-<pr-number>` branch
+2. Reset it to `main`: `git reset --hard origin/main`
+3. Apply the changelog update on top
+4. Force-push the branch
+5. Update the existing PR title and body with `gh pr edit`
+
+## Step 4b: If no PR exists
+
+1. Create branch `release/changelog-<pr-number>` from `main`
+2. Apply the changelog update
+3. Commit with message: `Update changelog for release PR #<pr-number>`
+4. Push the branch
+5. Use the **pull-request-creator** agent to open a PR with:
+   - Title: `Update changelog for release PR #<pr-number>`
+   - Motivation: This PR adds the changelog entry for release PR #<pr-number>. It should be merged after the release PR.
+
+## Step 5: Report
+
+Return a structured report:
+
+```
+## Changelog PR Result
+
+### PR
+- PR #YYY: <url> (opened/updated)
+
+### Changelog Entry
+<the full changelog entry>
+```
+
+Important: Include the full changelog entry text in your report — the orchestrator needs it for the next step.
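The prepend in Step 2 above is order-sensitive: new entries go directly under the header, above older ones. A small Go sketch of that logic, assuming a top-level `# Changelog` header; the function and package names are illustrative, since the agent performs the same steps with its Read/Write tools rather than Go:

```go
package release

import (
	"os"
	"strings"
)

// prependChangelogEntry creates CHANGELOG.md with a "# Changelog" header if
// it is missing, then inserts the new entry directly below the header so
// newer entries always come first.
func prependChangelogEntry(path, entry string) error {
	data, err := os.ReadFile(path)
	if os.IsNotExist(err) {
		data = []byte("# Changelog\n")
	} else if err != nil {
		return err
	}
	_, rest, found := strings.Cut(string(data), "# Changelog\n")
	if !found {
		rest = string(data) // no header yet: keep existing content below
	}
	updated := "# Changelog\n\n" + strings.TrimLeft(entry, "\n") + "\n" + rest
	return os.WriteFile(path, []byte(updated), 0o644)
}
```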
+
+---
+
+## Step 1: Fetch PR metadata
+
+```
+gh pr view <pr-number> --json number,title,body,commits,files
+```
+
+## Step 2: Classify commits
+
+For each commit SHA in the PR, inspect the changed files:
+
+```
+git show <sha> --name-only --format="%H %s"
+```
+
+Classify each commit to a component:
+- **Cortex shim**: code touching `internal/shim` or `cmd/shim`
+- **Cortex postgres**: code touching the postgres docker image (`postgres/`), or its helm chart (`helm/library/cortex-postgres`)
+- **Cortex core**: core code touching anything else — the manager or external scheduler logic of cortex
+- **General**: CI, tooling, docs, or other non-code changes
+
+## Step 3: Check helm charts for updated appVersions
+
+Read through the cortex helm charts in the `helm/library/` folder. Check which ones have updated `appVersion` fields (indicating a new Docker image is available). Compare the appVersion in the current branch to what's on `main`:
+
+```
+git diff main...HEAD -- helm/library/*/Chart.yaml
+```
+
+## Step 4: Determine breaking changes
+
+Read the actual diff for each commit that touches code. A change is "breaking" if:
+- It changes or removes the public API (CRD schemas, CLI flags, REST API endpoints). Additions are NOT breaking.
+- It requires a config format change (renaming/removing a values.yaml key, changing expected format).
+
+## Step 5: Produce the release digest
+
+Output in this exact format:
+
+```
+## Release Digest — PR #NNN "{title}"
+
+### Changed Charts
+- cortex v<version> (sha-xxxxxxxx)
+- cortex-postgres v<version> (sha-xxxxxxxx)
+- cortex-nova v<version> — includes cortex v<version>, cortex-postgres v<version>
+
+### Commits by Component
+
+#### cortex core
+- <commit subject>
+
+#### cortex postgres
+- <commit subject>
+
+#### cortex shim
+- <commit subject>
+
+#### General
+- <commit subject>
+
+### Breaking Changes
+- [component] <description>
+(or "None" if no breaking changes)
+```
+
+Note: The versions in `### Changed Charts` are the CURRENT versions from Chart.yaml (pre-bump). The bump agent will determine the new versions. Include only charts whose `appVersion` actually changed.
+
+Return ONLY the digest. Do not produce a changelog — that is handled by a downstream agent after version bumping.
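Step 2's classification is path-based. A hedged Go sketch of one way to implement it, using the prefixes named in the spec; the `.github/` and `docs/` prefixes for the General bucket are my assumption, since the spec only says "CI, tooling, docs, or other non-code changes":

```go
package release

import "strings"

// classifyFile maps a changed file to a release component following the
// Step 2 rules. The "General" prefixes are illustrative; the agent decides
// that bucket from the commit as a whole.
func classifyFile(path string) string {
	switch {
	case strings.HasPrefix(path, "internal/shim/"), strings.HasPrefix(path, "cmd/shim/"):
		return "cortex shim"
	case strings.HasPrefix(path, "postgres/"), strings.HasPrefix(path, "helm/library/cortex-postgres/"):
		return "cortex postgres"
	case strings.HasPrefix(path, ".github/"), strings.HasPrefix(path, "docs/"):
		return "General"
	default:
		return "cortex core"
	}
}
```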
diff --git a/.claude/agents/release-update-description.md b/.claude/agents/release-update-description.md
new file mode 100644
index 000000000..cc334c6bc
--- /dev/null
+++ b/.claude/agents/release-update-description.md
@@ -0,0 +1,53 @@
+---
+name: release-update-description
+description: Takes a changelog entry, bump PR reference, and changelog PR reference, and updates the release PR description using gh pr edit.
+tools: Bash, Read
+model: inherit
+---
+
+# Release Update Description Agent
+
+You receive the release PR number, the formatted changelog, the bump PR reference, and the changelog PR reference. Your job is to update the release PR description.
+
+---
+
+## Input
+
+The caller provides:
+1. The release PR number (e.g. `123`)
+2. The formatted changelog entry text
+3. The bump PR number and URL (e.g. `#456 https://github.com/...`)
+4. The changelog PR number and URL (e.g. `#457 https://github.com/...`)
+
+## Step 1: Build the PR description body
+
+Construct the PR description using this structure:
+
+```markdown
+## Changelog
+
+<changelog entry>
+
+## Dependencies
+
+- Bump PR: #<number> (must be merged before this PR)
+- Changelog PR: #<number> (merge after this PR)
+```
+
+## Step 2: Update the PR
+
+```
+gh pr edit <pr-number> --body "<body>"
+```
+
+Use a heredoc or temp file to pass the body to avoid shell quoting issues.
+
+## Step 3: Report
+
+Return:
+
+```
+## PR Description Updated
+
+PR #<pr-number> description updated with changelog and references to bump PR #<number> and changelog PR #<number>.
+```
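One way to satisfy the quoting note in Step 2 is to avoid the shell entirely and stream the body over stdin. A sketch in Go, assuming a gh version whose `--body-file` flag accepts `-` to read the body from standard input:

```go
package release

import (
	"os/exec"
	"strings"
)

// updateDescription sets the PR description without interpolating the body
// into a command line, sidestepping shell quoting issues entirely.
func updateDescription(prNumber, body string) error {
	cmd := exec.Command("gh", "pr", "edit", prNumber, "--body-file", "-")
	cmd.Stdin = strings.NewReader(body)
	return cmd.Run()
}
```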
diff --git a/.claude/commands/release.md b/.claude/commands/release.md
index d770d7e24..588d40b0b 100644
--- a/.claude/commands/release.md
+++ b/.claude/commands/release.md
@@ -1,184 +1,113 @@
 ---
-allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch, Agent
+allowed-tools: Read, Bash(*), Agent
 description: Release orchestrator — builds a digest of what changed in a release PR, opens a changelog PR, and references the bump PR. Usage: /release PR_NUMBER
 ---
 
 # Release Orchestrator
 
-Your job is to orchestrate the release process for a given PR. This involves analyzing the PR's commits and changed files to build a structured digest of what changed, determining if there are any breaking changes, preparing a changelog, opening a PR to bump chart versions if needed, and updating the original PR description with the changelog and references to the new PRs.
+You orchestrate the release process for a given PR. You MUST complete all three deliverables in order:
+1. A bump PR for helm chart versions
+2. A changelog PR with the release notes (using the bumped versions)
+3. The release PR description updated with the changelog and references to both PRs
+
+You achieve this by dispatching focused subagents **sequentially**. Each step depends on the output of the previous one. Do NOT try to do the detailed work yourself — you are a dispatcher.
 
 ---
 
-## Phase 1: Collect — Build the release digest
-
-1. Fetch PR metadata:
-   ```
-   gh pr view $ARGUMENTS --json number,title,body,commits,files
-   ```
-
-2. For each commit SHA in the PR, inspect the changed files:
-   ```
-   git show <sha> --name-only --format="%H %s"
-   ```
-
-3. Classify each commit to a component:
-   - Cortex shim: code touching the shim layer (internal/shim and cmd/shim)
-   - Cortex postgres: code touching the postgres docker image, or its helm chart
-   - Cortex core: core code touching anything else: the manager or external scheduler logic of cortex
-   - General: CI, tooling, docs, or other non-code changes
-
-4. Finally, read through the cortex helm charts in the helm/ folder, and check which ones have updated appVersions, indicating a new Docker image is available and that the chart should be included in the release notes.
-
-Produce a structured digest in this exact format — the subagents depend on it:
-
-```
-## Release Digest — PR #NNN "{title}"
-
-### Changed Charts
-- cortex v1.2.3 (sha-xxxxxxxx)
-- cortex-postgres v1.2.3 (sha-xxxxxxxx)
-- cortex-nova v1.2.3 — includes cortex v1.2.3, cortex-postgres v1.2.3
-
-### Commits by Component
-
-#### cortex core
-- <commit subject>
-
-#### cortex postgres
-- <commit subject>
-
-#### cortex shim
-- <commit subject>
-
-#### General
-- <commit subject>
-```
-
-**Important**: Do NOT skip or shallow this phase. Read actual file diffs. The subagents depend entirely on the quality of this digest.
+## Phase 1: Collect the release digest
+
+Dispatch the **release-digest** agent.
+
+Prompt: `Produce a release digest for PR #<pr-number>.`
+
+Wait for it to return. Save its full output as the **digest**.
+
+---
+
+## Phase 2: Bump chart versions
+
+Dispatch the **release-bump-charts** agent. Pass it the PR number and the full digest.
+
+Prompt:
+```
+Release PR number: <pr-number>
+
+<digest>
+
+Bump the helm chart versions and open/update a bump PR.
+```
+
+Wait for it to return. From its report, extract:
+- The bump PR number and URL
+- The list of bumped chart versions (e.g. `cortex: 0.0.47 → 0.0.48`)
+
+---
+
-## Phase 2: Determine Breaking Changes and Prepare a Changelog
-
-Reason for each change by looking at the commit's diff, if it is a breaking change that requires special attention.
-
-**Important**: Do NOT skip or shallow this phase. Read actual file diffs. The PR reviewers depend entirely on the quality of this analysis to know what to focus on in their review.
-
-### When is a change "breaking"?
-
-A change should be classified as "breaking" if it meets any of the following criteria:
-
-- It changes or removes the public API of any component (e.g., CRD schemas, CLI flags, or REST API endpoints). Note: additions to the public API are not breaking.
-- It requires a config format change (e.g., renaming or removing a values.yaml key, changing the expected format of a value, etc)
-
-Once the digest is complete, read each agent file, then dispatch all three **in parallel** using the Agent tool in a single message. Each subagent operates independently — do not wait for one before starting the others.
-
-### Prepare the changelog
-
-Generate a changelog following this template:
-
-```markdown
-# Changelog
-
-## YYYY-MM-DD — [#NNN](<pr-url>)
-
-### <chart> v<version> (<sha>)
-
-Breaking changes:
-- <one line per change>
-
-Non-breaking changes:
-- <one line per change>
-
-... repeat for each changed chart ...
-
-### General
-
-Breaking changes:
-- <one line per change>
-
-Non-breaking changes:
-- <one line per change>
-```
-
-One `###` section per changed chart only. For bundle sections, list which library versions they include, then any bundle-specific changes (values.yaml keys, template/CRD changes). Omit `### General` if empty. No commit SHAs, one line per bullet.
-
-Example:
-```markdown
-# Changelog
-
-## 2026-04-24 — [#123](https://github.com/cobaltcore-dev/cortex/pull/123)
-
-### cortex v0.0.43 (sha-xxxxxxxx)
-
-Breaking changes:
-- Check hypervisor resources against reservations
-
-Non-breaking changes:
-- Commitments usage API uses postgres database instead of calling nova
-
-### cortex-postgres v0.5.14 (sha-xxxxxxxx)
-
-Non-breaking changes:
-- Add commitments table migration
-
-### cortex-nova v0.0.56 (sha-xxxxxxxx)
-
-Includes updated charts cortex v0.0.43 and cortex-postgres v0.5.14.
-
-Non-breaking changes:
-- values.yaml: added `reservations.enabled` (default: false)
-
-### General
-
-Non-breaking changes:
-- Update golangci-lint to v2.1.0
-```
+## Phase 3: Create the changelog PR
+
+Dispatch the **release-changelog** agent. Pass it the PR number, the full digest, and the bumped versions from Phase 2.
+
+Prompt:
+```
+Release PR number: <pr-number>
+
+Bumped chart versions:
+<bumped versions>
+
+Release digest:
+<digest>
+
+Generate the changelog entry using the NEW bumped versions, prepend it to CHANGELOG.md, and open/update a changelog PR.
+```
+
-## Phase 3: Bump Chart Versions
-
-Prepare chart version bumps so GitHub pushes bumped charts to the registry immediately after the release PR is merged.
-
-For each changed library chart, patch-bump its `version` in `helm/library/<chart>/Chart.yaml` (e.g. `0.0.43` → `0.1.0`), if there was no breaking change, otherwise minor-bump it. Do not touch `appVersion`. Then update the matching `dependencies[].version` entry in every `helm/bundles/*/Chart.yaml` that references it.
+Wait for it to return. From its report, extract:
+- The changelog PR number and URL
+- The full changelog entry text
+
+---
+
-### Check for existing bump PR
-
-Before creating a new PR, check if one already exists for this release:
+## Phase 4: Update the release PR description
+
+Dispatch the **release-update-description** agent. Pass it the PR number, changelog entry, bump PR reference, and changelog PR reference.
+
+Prompt:
+```
-gh pr list --head release/bump-charts-<pr-number> --state open --json number,url
-```
+Release PR number: <pr-number>
+
-- **If a PR already exists**: check out the existing `release/bump-charts-<pr-number>` branch, reset it to `main` (`git reset --hard origin/main`), apply the version bumps on top, force-push the branch. Then update the existing PR title and body with `gh pr edit` to reflect the latest changes.
-- **If no PR exists**: create branch `release/bump-charts-<pr-number>` from `main`, apply the bumps, and open a new PR noting in the body that it should be merged before the release PR. Use the pull-request-creator agent for this subtask, and include the chart changes in the motivation so they are included in the PR description.
+Changelog entry:
+<changelog entry>
+
-## Phase 4: Update the PR Description
+Bump PR: #<number> (<url>)
+Changelog PR: #<number> (<url>)
+
-Use `gh pr edit` with `--body` to update the PR description with the changelog. It is fine for release pull request descriptions to utilize markdown formatting. Reference the opened bump PR in the description as well as a dependency.
+Update the release PR description with the changelog and references to both PRs.
+```
+
-## Phase 5: Create a Changelog PR
+Wait for it to return.
+
-If the CHANGELOG.md does not exist, create it with a `# Changelog` header. Then prepend the new changelog entry below the header.
+---
+
-### Check for existing changelog PR
+## Phase 5: Summarize
+
-Before creating a new PR, check if one already exists for this release:
+After all agents have completed, produce a short summary:
+
+```
-gh pr list --head release/changelog-<pr-number> --state open --json number,url
-```
+## Release #NNN Post-Open Summary
+
-- **If a PR already exists**: check out the existing `release/changelog-<pr-number>` branch, reset it to `main` (`git reset --hard origin/main`), apply the changelog update on top, force-push the branch. Then update the existing PR title and body with `gh pr edit` to reflect the latest changes.
-- **If no PR exists**: create branch `release/changelog-<pr-number>` from `main`, apply the changelog, and open a new PR to `main` with title `Update changelog for release PR #<pr-number>` and a body noting it should be merged after the release PR. Use the pull-request-creator agent for this subtask.
+- Bump PR: #XXX opened/updated
+- Changelog PR: #YYY opened/updated
+- PR #NNN description: updated with changelog and PR references
+```
+
-## Phase 6: Summarize — Report what happened
-
-After all subagents return, produce a short summary:
-
-```
-## Release #NNN Post-Open Summary
-
-- PR description updated with changelog and bump PR reference
-- Bump PR #XXX opened/updated to update chart versions
-- Changelog PR #YYY opened/updated to update CHANGELOG.md
-```
+If any agent reports a failure, include that in the summary and suggest next steps.
+
+---
+
+## Critical rules
+
+- Execute phases 1 → 2 → 3 → 4 **strictly in order**. Each depends on the previous.
+- You MUST complete ALL FOUR dispatch phases. Never skip one.
+- Do NOT read code yourself — the release-digest agent handles that.
+- Do NOT generate changelog text yourself — the release-changelog agent handles that.
+- Keep your own context minimal — you are a dispatcher, not an analyst.
+- Pass data between phases by extracting the relevant pieces from each agent's report and including them verbatim in the next agent's prompt.
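Read end to end, the new command is a four-stage pipeline with strict data dependencies. A compact Go sketch of that flow, purely illustrative: the real orchestrator dispatches subagents and passes report text, not typed values, and all names here are made up:

```go
package release

// Phase outputs, as extracted from each agent's report.
type digest string // Phase 1: the full digest text

type bumpReport struct {
	prRef    string // e.g. "#456 https://github.com/..."
	versions string // e.g. "cortex: 0.0.47 → 0.0.48"
}

type changelogReport struct {
	prRef string
	entry string // full changelog entry, required again in Phase 4
}

// orchestrate mirrors the strict 1 → 2 → 3 → 4 ordering: every phase
// consumes the output of the one before it, so none can run in parallel.
func orchestrate(prNumber string) {
	d := runReleaseDigest(prNumber)             // Phase 1
	b := runReleaseBumpCharts(prNumber, d)      // Phase 2
	c := runReleaseChangelog(prNumber, d, b)    // Phase 3
	runReleaseUpdateDescription(prNumber, c, b) // Phase 4
}

// Stubs standing in for the subagent dispatches.
func runReleaseDigest(pr string) digest                                      { return "" }
func runReleaseBumpCharts(pr string, d digest) bumpReport                    { return bumpReport{} }
func runReleaseChangelog(pr string, d digest, b bumpReport) changelogReport  { return changelogReport{} }
func runReleaseUpdateDescription(pr string, c changelogReport, b bumpReport) {}
```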
diff --git a/.github/actions/start-litellm-proxy/action.yml b/.github/actions/start-litellm-proxy/action.yml index 5b1431b2c..e5ae105f1 100644 --- a/.github/actions/start-litellm-proxy/action.yml +++ b/.github/actions/start-litellm-proxy/action.yml @@ -5,7 +5,7 @@ inputs: litellm_model: description: "LiteLLM model identifier" required: false - default: "sap/anthropic--claude-4.6-sonnet" + default: "sap/anthropic--claude-4.6-opus" runs: using: composite diff --git a/.github/patches/claude-code-action/litellm-proxy.patch b/.github/patches/claude-code-action/litellm-proxy.patch index e05cc6283..e3e208c81 100644 --- a/.github/patches/claude-code-action/litellm-proxy.patch +++ b/.github/patches/claude-code-action/litellm-proxy.patch @@ -11,10 +11,10 @@ index b6d0f05..054e0b8 100644 + required: false + default: "false" + litellm_model: -+ description: "LiteLLM model identifier (e.g., 'sap/anthropic--claude-4.6-sonnet')" ++ description: "LiteLLM model identifier (e.g., 'sap/anthropic--claude-4.6-opus')" + required: false -+ default: "sap/anthropic--claude-4.6-sonnet" - ++ default: "sap/anthropic--claude-4.6-opus" + claude_args: description: "Additional arguments to pass directly to Claude CLI" @@ -173,10 +181,11 @@ runs: @@ -30,12 +30,12 @@ index b6d0f05..054e0b8 100644 + # Install Bun via official install script (avoids github.com API token issues on GHES) + curl -fsSL https://bun.sh/install | bash -s "bun-v1.3.13" + echo "$HOME/.bun/bin" >> "$GITHUB_PATH" - + - name: Setup Custom Bun Path if: inputs.path_to_bun_executable != '' @@ -292,13 +301,14 @@ runs: NODE_VERSION: ${{ env.NODE_VERSION }} - + # Provider configuration - ANTHROPIC_API_KEY: ${{ inputs.anthropic_api_key }} + ANTHROPIC_API_KEY: ${{ inputs.use_litellm == 'true' && env.LITELLM_PROXY_API_KEY || inputs.anthropic_api_key }} @@ -46,13 +46,13 @@ index b6d0f05..054e0b8 100644 CLAUDE_CODE_USE_VERTEX: ${{ inputs.use_vertex == 'true' && '1' || '' }} CLAUDE_CODE_USE_FOUNDRY: ${{ inputs.use_foundry == 'true' && '1' || '' }} + CLAUDE_CODE_USE_LITELLM: ${{ inputs.use_litellm == 'true' && '1' || '' }} - + # AWS configuration AWS_REGION: ${{ env.AWS_REGION }} @@ -335,6 +345,10 @@ runs: MCP_TOOL_TIMEOUT: ${{ env.MCP_TOOL_TIMEOUT }} MAX_MCP_OUTPUT_TOKENS: ${{ env.MAX_MCP_OUTPUT_TOKENS }} - + + # SAP compliance configuration + DISABLE_TELEMETRY: "1" + DISABLE_ERROR_REPORTING: "1" @@ -77,9 +77,9 @@ index 1f28da3..4eae3f2 100644 + const useLiteLLM = process.env.CLAUDE_CODE_USE_LITELLM === "1"; const anthropicApiKey = process.env.ANTHROPIC_API_KEY; const claudeCodeOAuthToken = process.env.CLAUDE_CODE_OAUTH_TOKEN; - + const errors: string[] = []; - + // Check for mutual exclusivity between providers - const activeProviders = [useBedrock, useVertex, useFoundry].filter(Boolean); + const activeProviders = [useBedrock, useVertex, useFoundry, useLiteLLM].filter(Boolean); @@ -89,7 +89,7 @@ index 1f28da3..4eae3f2 100644 + "Cannot use multiple providers simultaneously. 
Please set only one of: CLAUDE_CODE_USE_BEDROCK, CLAUDE_CODE_USE_VERTEX, CLAUDE_CODE_USE_FOUNDRY, or CLAUDE_CODE_USE_LITELLM.", ); } - + - if (!useBedrock && !useVertex && !useFoundry) { + if (!useBedrock && !useVertex && !useFoundry && !useLiteLLM) { if (!anthropicApiKey && !claudeCodeOAuthToken) { @@ -114,5 +114,5 @@ index 1f28da3..4eae3f2 100644 + ); + } } - + if (errors.length > 0) { diff --git a/.github/renovate.json b/.github/renovate.json index b864ff7c3..d991d871d 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -99,6 +99,15 @@ "config/manager/kustomization.yaml" ], "enabled": false + }, + { + "matchPackageNames": [ + "python" + ], + "matchFileNames": [ + ".github/actions/setup-claude-code-action/**" + ], + "enabled": false } ], "prHourlyLimit": 0, diff --git a/.github/scripts/litellm-proxy.py b/.github/scripts/litellm-proxy.py index 2e5ce47ce..71affc3db 100644 --- a/.github/scripts/litellm-proxy.py +++ b/.github/scripts/litellm-proxy.py @@ -27,8 +27,8 @@ app = FastAPI() -# The LiteLLM model to route requests to (e.g., "sap/anthropic--claude-4.6-sonnet") -LITELLM_MODEL = os.environ.get("LITELLM_MODEL", "sap/anthropic--claude-4.6-sonnet") +# The LiteLLM model to route requests to (e.g., "sap/anthropic--claude-4.6-opus") +LITELLM_MODEL = os.environ.get("LITELLM_MODEL", "sap/anthropic--claude-4.6-opus") @app.get("/health/readiness") diff --git a/.github/workflows/claude-assistant.yaml b/.github/workflows/claude-assistant.yaml index 1ce4b1234..20d939630 100644 --- a/.github/workflows/claude-assistant.yaml +++ b/.github/workflows/claude-assistant.yaml @@ -46,7 +46,7 @@ jobs: go-version-file: 'go.mod' - name: Generate GitHub App token id: app-token - uses: actions/create-github-app-token@v1 + uses: actions/create-github-app-token@v3 with: app-id: ${{ secrets.CORTEX_AI_AGENTS_APP_ID }} private-key: ${{ secrets.CORTEX_AI_AGENTS_CLIENT_PKEY }} @@ -63,13 +63,14 @@ jobs: claude_args: | --max-turns 1000 --permission-mode auto - --effort-level max --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch" trigger_phrase: "@claude" include_comments_by_actor: "auhlig,umswmayj,juliusclausnitzer,mblos,PhilippMatthes,Varsius,henrichter,SoWieMarkus,*[bot]" use_litellm: "true" litellm_model: "sap/anthropic--claude-4.6-opus" github_token: ${{ steps.app-token.outputs.token }} + bot_id: "279748396" + bot_name: "cortex-ai-agents[bot]" show_full_output: "true" - uses: ./.github/actions/stop-litellm-proxy if: always() diff --git a/.github/workflows/claude-weekly.yaml b/.github/workflows/claude-weekly.yaml index de4e940df..2d81a6a6d 100644 --- a/.github/workflows/claude-weekly.yaml +++ b/.github/workflows/claude-weekly.yaml @@ -21,7 +21,7 @@ jobs: go-version-file: 'go.mod' - name: Generate GitHub App token id: app-token - uses: actions/create-github-app-token@v1 + uses: actions/create-github-app-token@v3 with: app-id: ${{ secrets.CORTEX_AI_AGENTS_APP_ID }} private-key: ${{ secrets.CORTEX_AI_AGENTS_CLIENT_PKEY }} @@ -39,11 +39,12 @@ jobs: claude_args: | --max-turns 1000 --permission-mode auto - --effort-level max --allowedTools "Read,Write,Edit,Bash(*),WebSearch,WebFetch" use_litellm: "true" litellm_model: "sap/anthropic--claude-4.6-opus" github_token: ${{ steps.app-token.outputs.token }} + bot_id: "279748396" + bot_name: "cortex-ai-agents[bot]" show_full_output: "true" - uses: ./.github/actions/stop-litellm-proxy if: always() diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 612360fdb..3a9815541 100644 --- a/.github/workflows/codeql.yaml +++ 
b/.github/workflows/codeql.yaml @@ -24,7 +24,7 @@ permissions: jobs: analyze: name: CodeQL - runs-on: ubuntu-latest + runs-on: large_runner_16core_64gb steps: - name: Check out code uses: actions/checkout@v6 diff --git a/.golangci.yaml b/.golangci.yaml index c857e9c96..c2311c4aa 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -126,7 +126,7 @@ linters: require-specific: true staticcheck: dot-import-whitelist: - - github.com/majewsky/gg/option + - go.xyrillian.de/gg/option - github.com/onsi/ginkgo/v2 - github.com/onsi/gomega usestdlibvars: diff --git a/Makefile b/Makefile index be7bb84f1..e940d6453 100644 --- a/Makefile +++ b/Makefile @@ -58,8 +58,8 @@ CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen GOLANGCI_LINT = $(LOCALBIN)/golangci-lint GOTESTSUM = $(LOCALBIN)/gotestsum -CONTROLLER_TOOLS_VERSION ?= v0.20.1 -GOLANGCI_LINT_VERSION ?= v2.11.4 +CONTROLLER_TOOLS_VERSION ?= v0.21.0 +GOLANGCI_LINT_VERSION ?= v2.12.2 GOTESTSUM_VERSION ?= v1.13.0 .PHONY: controller-gen diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index a7339dce2..80596256e 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -4,6 +4,7 @@ package v1alpha1 import ( + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -56,6 +57,10 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:validation:Optional CommittedCapacity int64 `json:"committedCapacity,omitempty"` + // TotalCapacity is the total capacity of all eligible hosts in an empty-datacenter scenario. + // +kubebuilder:validation:Optional + TotalCapacity map[string]resource.Quantity `json:"totalCapacity,omitempty"` + // TotalInstances is the total number of VM instances running on hypervisors in this AZ, // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). // +kubebuilder:validation:Optional diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go index fecac57cc..092155d3e 100644 --- a/api/v1alpha1/project_quota_types.go +++ b/api/v1alpha1/project_quota_types.go @@ -7,34 +7,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// ResourceQuota holds the quota for a single resource with per-AZ breakdown. -// Maps to liquid.ResourceQuotaRequest from the LIQUID API. -// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest -type ResourceQuota struct { - // Quota is the total quota across all AZs (for compatibility). - // Corresponds to liquid.ResourceQuotaRequest.Quota. - // +kubebuilder:validation:Required - Quota int64 `json:"quota"` - - // PerAZ holds the per-availability-zone quota breakdown. - // Key: availability zone name, Value: quota for that AZ. - // Only populated for AZSeparatedTopology resources. - // Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. - // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest - // +kubebuilder:validation:Optional - PerAZ map[string]int64 `json:"perAZ,omitempty"` -} - -// ResourceQuotaUsage holds per-AZ PAYG usage for a single resource. -type ResourceQuotaUsage struct { - // PerAZ holds per-availability-zone PAYG usage values. - // Key: availability zone name, Value: PAYG usage in that AZ. - // +kubebuilder:validation:Optional - PerAZ map[string]int64 `json:"perAZ,omitempty"` -} - // ProjectQuotaSpec defines the desired state of ProjectQuota. // Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). 
+// Each ProjectQuota CRD represents quota for ONE project in ONE availability zone. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest type ProjectQuotaSpec struct { // ProjectID of the OpenStack project this quota belongs to. @@ -57,12 +32,18 @@ type ProjectQuotaSpec struct { // +kubebuilder:validation:Optional DomainName string `json:"domainName,omitempty"` - // Quota maps LIQUID resource names to their per-AZ quota. + // AvailabilityZone is the AZ this quota CRD covers (e.g. "qa-de-1a"). + // In a multi-cluster setup, this determines which cluster the CRD is routed to. + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + AvailabilityZone string `json:"availabilityZone"` + + // Quota maps LIQUID resource names to their quota value for THIS availability zone. // Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") - // Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. - // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + // Value: per-AZ quota from liquid.AZResourceQuotaRequest.Quota + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest // +kubebuilder:validation:Optional - Quota map[string]ResourceQuota `json:"quota,omitempty"` + Quota map[string]int64 `json:"quota,omitempty"` } // ProjectQuotaStatus defines the observed state of ProjectQuota. @@ -75,17 +56,17 @@ type ProjectQuotaStatus struct { // +kubebuilder:validation:Optional ObservedGeneration int64 `json:"observedGeneration,omitempty"` - // TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + // TotalUsage tracks per-resource total resource consumption in this AZ (all VMs in this project+AZ). // Persisted by the quota controller; updated by full reconcile and HV instance diffs. // Key: liquid.ResourceName // +kubebuilder:validation:Optional - TotalUsage map[string]ResourceQuotaUsage `json:"totalUsage,omitempty"` + TotalUsage map[string]int64 `json:"totalUsage,omitempty"` - // PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + // PaygUsage tracks per-resource pay-as-you-go usage in this AZ. // Derived as TotalUsage - CRUsage (clamped >= 0). // Key: liquid.ResourceName // +kubebuilder:validation:Optional - PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` + PaygUsage map[string]int64 `json:"paygUsage,omitempty"` // LastReconcileAt is when the controller last reconciled this project's quota (any path). // +kubebuilder:validation:Optional @@ -106,6 +87,7 @@ type ProjectQuotaStatus struct { // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster // +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.projectID" +// +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" // +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.domainID" // +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" @@ -113,6 +95,8 @@ type ProjectQuotaStatus struct { // ProjectQuota is the Schema for the projectquotas API. // It persists quota values pushed by Limes via the LIQUID quota endpoint // (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). +// Each CRD stores quota for one project in one availability zone. 
+// In a multi-cluster setup, it is routed to the cluster serving that AZ. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest type ProjectQuota struct { metav1.TypeMeta `json:",inline"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 06f075e1e..a98f0bfff 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -849,6 +849,13 @@ func (in *FlavorGroupCapacityStatus) DeepCopyInto(out *FlavorGroupCapacityStatus *out = make([]FlavorCapacityStatus, len(*in)) copy(*out, *in) } + if in.TotalCapacity != nil { + in, out := &in.TotalCapacity, &out.TotalCapacity + *out = make(map[string]resource.Quantity, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } in.LastReconcileAt.DeepCopyInto(&out.LastReconcileAt) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions @@ -1604,9 +1611,9 @@ func (in *ProjectQuotaSpec) DeepCopyInto(out *ProjectQuotaSpec) { *out = *in if in.Quota != nil { in, out := &in.Quota, &out.Quota - *out = make(map[string]ResourceQuota, len(*in)) + *out = make(map[string]int64, len(*in)) for key, val := range *in { - (*out)[key] = *val.DeepCopy() + (*out)[key] = val } } } @@ -1626,16 +1633,16 @@ func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { *out = *in if in.TotalUsage != nil { in, out := &in.TotalUsage, &out.TotalUsage - *out = make(map[string]ResourceQuotaUsage, len(*in)) + *out = make(map[string]int64, len(*in)) for key, val := range *in { - (*out)[key] = *val.DeepCopy() + (*out)[key] = val } } if in.PaygUsage != nil { in, out := &in.PaygUsage, &out.PaygUsage - *out = make(map[string]ResourceQuotaUsage, len(*in)) + *out = make(map[string]int64, len(*in)) for key, val := range *in { - (*out)[key] = *val.DeepCopy() + (*out)[key] = val } } if in.LastReconcileAt != nil { @@ -1815,50 +1822,6 @@ func (in *ReservationStatus) DeepCopy() *ReservationStatus { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ResourceQuota) DeepCopyInto(out *ResourceQuota) { - *out = *in - if in.PerAZ != nil { - in, out := &in.PerAZ, &out.PerAZ - *out = make(map[string]int64, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuota. -func (in *ResourceQuota) DeepCopy() *ResourceQuota { - if in == nil { - return nil - } - out := new(ResourceQuota) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ResourceQuotaUsage) DeepCopyInto(out *ResourceQuotaUsage) { - *out = *in - if in.PerAZ != nil { - in, out := &in.PerAZ, &out.PerAZ - *out = make(map[string]int64, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuotaUsage. -func (in *ResourceQuotaUsage) DeepCopy() *ResourceQuotaUsage { - if in == nil { - return nil - } - out := new(ResourceQuotaUsage) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SchedulingHistoryEntry) DeepCopyInto(out *SchedulingHistoryEntry) { *out = *in diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 6c1096512..f49fb8a9f 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -343,15 +343,19 @@ func main() { reservationGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"} historyGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"} committedResourceGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "CommittedResource"} + flavorGroupCapacityGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "FlavorGroupCapacity"} + projectQuotaGVK := schema.GroupVersionKind{Group: "cortex.cloud", Version: "v1alpha1", Kind: "ProjectQuota"} multiclusterClient := &multicluster.Client{ HomeCluster: homeCluster, HomeRestConfig: restConfig, HomeScheme: scheme, ResourceRouters: map[schema.GroupVersionKind]multicluster.ResourceRouter{ - hvGVK: multicluster.HypervisorResourceRouter{}, - reservationGVK: multicluster.ReservationsResourceRouter{}, - historyGVK: multicluster.HistoryResourceRouter{}, - committedResourceGVK: multicluster.CommittedResourceRouter{}, + hvGVK: multicluster.HypervisorResourceRouter{}, + reservationGVK: multicluster.ReservationsResourceRouter{}, + historyGVK: multicluster.HistoryResourceRouter{}, + committedResourceGVK: multicluster.CommittedResourceRouter{}, + flavorGroupCapacityGVK: multicluster.FlavorGroupCapacityResourceRouter{}, + projectQuotaGVK: multicluster.ProjectQuotaResourceRouter{}, }, } multiclusterClientConfig := conf.GetConfigOrDie[multicluster.ClientConfig]() diff --git a/docs/apis.md b/docs/apis.md index d3a2d9416..9160f8816 100644 --- a/docs/apis.md +++ b/docs/apis.md @@ -20,10 +20,14 @@ graph LR; decision(Decision CRD) reservation(Reservation CRD) committedresource(CommittedResource CRD) + projectquota(ProjectQuota CRD) + flavorgroupcapacity(FlavorGroupCapacity CRD) pipeline --> descheduling pipeline --> decision pipeline --> reservation committedresource --> reservation + committedresource --> projectquota + flavorgroupcapacity --> committedresource end prometheus(Prometheus) @@ -114,6 +118,22 @@ The status tracks the accepted amount, usage information (assigned VMs and used For more details on how committed resources interact with reservations, see [committed resource reservations](reservations/committed-resource-reservations.md). +### FlavorGroupCapacity + +```bash +kubectl get flavorgroupcapacities +``` + +FlavorGroupCapacity caches pre-computed capacity data for one flavor group in one availability zone. One CRD exists per (flavor group × AZ) pair, maintained by the capacity controller on a fixed interval. The spec identifies the flavor group and AZ; the status holds per-flavor slot counts (`PlaceableVMs`, `PlaceableHosts`, `TotalCapacityVMSlots`, `TotalCapacityHosts`), aggregate fields (`CommittedCapacity`, `TotalCapacity`, `TotalInstances`), and a `LastReconcileAt` timestamp. The capacity API reads these CRDs instead of probing the scheduler on each request. + +### ProjectQuota + +```bash +kubectl get projectquotas +``` + +ProjectQuota stores quota allocations and computed usage for one project in one availability zone. One CRD exists per (project × AZ) pair, named `quota-{projectID}-{az}`. The spec holds the project identity, availability zone, and a flat `Quota map[string]int64` mapping LIQUID resource names to their per-AZ quota value. 
The status holds `TotalUsage` and `PaygUsage` as flat `map[string]int64` fields tracking per-resource consumption in that AZ, maintained by the quota controller through periodic full reconciles, incremental Hypervisor diffs, and PaygUsage-only recomputes triggered by CommittedResource status changes. + ### Deschedulings ```bash diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md index cccd55cbf..4d96d43a6 100644 --- a/docs/reservations/committed-resource-reservations.md +++ b/docs/reservations/committed-resource-reservations.md @@ -2,27 +2,30 @@ Cortex reserves hypervisor capacity for customers who pre-commit resources (committed resources, CRs), and exposes usage and capacity data via APIs. - - [Committed Resource Reservation System](#committed-resource-reservation-system) - [Configuration and Observability](#configuration-and-observability) - [Lifecycle Management](#lifecycle-management) - [State (CRDs)](#state-crds) - [CR Commitment Lifecycle](#cr-commitment-lifecycle) + - [Resource types](#resource-types) - [CommittedResource Controller](#committedresource-controller) - [Reservation Lifecycle](#reservation-lifecycle) - [VM Lifecycle](#vm-lifecycle) - [Capacity Blocking](#capacity-blocking) - [Reservation Controller](#reservation-controller) + - [Info API](#info-api) - [Change-Commitments API](#change-commitments-api) + - [Quota API](#quota-api) + - [Report-Usage API](#report-usage-api) + - [Report-Capacity API](#report-capacity-api) - [Syncer Task](#syncer-task) - - [Usage API](#usage-api) The CR reservation implementation is located in `internal/scheduling/reservations/commitments/`. Key components include: -- `CommittedResource` controller (`committed_resource_controller.go`) — acceptance, rejection, child Reservation CRUD -- `Reservation` controller (`reservation_controller.go`) — placement, VM allocation verification +- `CommittedResource` controller — acceptance, rejection, child Reservation CRUD (memory) or arithmetic headroom check (cores) +- `Reservation` controller — placement, VM allocation verification - API endpoints (`api/`) -- Capacity and usage calculation logic (`capacity.go`, `usage.go`) -- Syncer for periodic state sync (`syncer.go`) +- Capacity and usage calculation logic +- Syncer for periodic state sync ## Configuration and Observability @@ -30,6 +33,7 @@ The CR reservation implementation is located in `internal/scheduling/reservation - API endpoint toggles (change-commitments, report-usage, report-capacity) — each endpoint can be disabled independently - Reconciliation intervals (grace period, active monitoring) - Scheduling pipeline selection per flavor group +- Per-flavor-group resource flags (`handlesCommitments`, `hasCapacity`, `hasQuota`) controlling which resource types are active for each group **Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/alerts/nova.alerts.yaml` with prefixes: - `cortex_committed_resource_change_api_*` @@ -45,22 +49,26 @@ flowchart LR subgraph State CR[(CommittedResource CRDs)] Res[(Reservation CRDs)] + PQ[(ProjectQuota CRDs)] + FGCap[(FlavorGroupCapacity CRDs)] end Syncer[Syncer Task] ChangeAPI[Change API] + QuotaAPI[Quota API] CapacityAPI[Capacity API] CRCtrl[CommittedResource Controller] ResCtrl[Reservation Controller] - UsageAPI[Usage API] + UsageAPI[Report-Usage API] Scheduler[Scheduler API] ChangeAPI -->|upsert + poll status| CR + QuotaAPI -->|write| PQ Syncer -->|upsert| CR UsageAPI -->|read| CR UsageAPI -->|read| Res - CapacityAPI -->|read| Res - 
CapacityAPI -->|capacity request| Scheduler + UsageAPI -->|read| PQ + CapacityAPI -->|read| FGCap CR -->|watch| CRCtrl CRCtrl -->|CRUD child Reservation slots| Res CRCtrl -->|update status| CR @@ -72,9 +80,13 @@ flowchart LR ### State (CRDs) -**`CommittedResource` CRD** (`committed_resource_types.go`) — primary source of truth for a commitment accepted by Cortex. One CRD per commitment UUID. Spec holds the commitment identity (project, flavor group, ...). Status holds the acceptance outcome (`Ready` condition with reason `Planned`/`Reserving`/`Rejected`) and the accepted amount. +**`CommittedResource` CRD** — primary source of truth for a commitment accepted by Cortex. One CRD per commitment UUID. Spec holds the commitment identity (project, flavor group, resource type, amount, ...). Status holds the acceptance outcome (`Ready` condition with reason `Planned`/`Reserving`/`Rejected`/`Accepted`), the accepted amount, and usage fields populated by the usage reconciler: `AssignedInstances` (VM UUIDs deterministically assigned to this CR), `UsedResources` (total resource consumption of assigned VMs), `LastUsageReconcileAt`, and `UsageObservedGeneration`. + +**`Reservation` CRD** — a single reservation slot on a hypervisor, owned by a `CommittedResource`. One `CommittedResource` may drive multiple `Reservation` CRDs (one per flavor-sized slot). Only memory commitments create Reservation CRDs; cores commitments do not. See [./failover-reservations.md](./failover-reservations.md) for the failover reservation type. -**`Reservation` CRD** (`reservation_types.go`) — a single reservation slot on a hypervisor, owned by a `CommittedResource`. One `CommittedResource` typically drives multiple `Reservation` CRDs (one per flavor-sized slot). See [./failover-reservations.md](./failover-reservations.md) for the failover reservation type. +**`ProjectQuota` CRD** — per-project, per-AZ quota store. One CRD exists per (project × availability zone) pair, named `quota-{projectID}-{az}`. Written by the Quota API when Limes pushes quota (one CRD is created for each AZ in the request). The quota controller reconciles usage into the status: `TotalUsage` and `PaygUsage` are flat `map[string]int64` fields tracking per-resource consumption in that AZ. The controller watches CommittedResource and Hypervisor CRDs to maintain these values via periodic full reconciles, incremental HV diffs, and PaygUsage-only recomputes triggered by CommittedResource status changes. + +**`FlavorGroupCapacity` CRD** — per-flavor-group, per-AZ capacity snapshot maintained by the capacity controller (outside this subsystem). The Report-Capacity endpoint reads these to compute available capacity. 
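The PaygUsage derivation mentioned for the `ProjectQuota` status above is simple per-resource arithmetic. A minimal sketch, assuming the flat `map[string]int64` status fields, with `crUsage` standing in for the per-resource usage covered by active commitments (the name is illustrative, not a field in the CRD):

```go
package quota

// derivePaygUsage computes PaygUsage = TotalUsage - CRUsage, clamped at zero
// per resource, as described for the ProjectQuota status.
func derivePaygUsage(totalUsage, crUsage map[string]int64) map[string]int64 {
	payg := make(map[string]int64, len(totalUsage))
	for resource, total := range totalUsage {
		p := total - crUsage[resource]
		if p < 0 {
			p = 0 // clamp: commitments may exceed what is actually running
		}
		payg[resource] = p
	}
	return payg
}
```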
### CR Commitment Lifecycle @@ -84,44 +96,30 @@ The CR commitment lifecycle covers everything from a commitment being accepted b | Limes State | Meaning | Cortex action | |---|---|---| -| `planned` | Future start, no guarantee yet | No Reservations — capacity not blocked | -| `pending` | Limes asking for a yes/no decision now | One-shot attempt — accept or reject; no retry | -| `guaranteed` / `confirmed` | Capacity must be honoured | Place Reservations and keep them in sync; see failure handling below | -| `superseded` / `expired` | Commitment no longer active | Remove all child Reservations | +| `planned` | Future start, no guarantee yet | No capacity reserved | +| `pending` | Limes asking for a yes/no decision now | One-shot acceptance attempt — accept or reject; no retry | +| `guaranteed` / `confirmed` | Capacity must be honoured | Accept and keep in sync; see failure handling below | +| `superseded` / `expired` | Commitment no longer active | Release all held capacity | -**CommittedResource status conditions (Cortex-side):** +#### Resource types -```mermaid -stateDiagram-v2 - direction LR - state "Planned (Ready=False)" as Planned - state "Reserving (Ready=False)" as Reserving - state "Active (Ready=True)" as Active - state "Rejected (Ready=False)" as Rejected +Cortex handles two resource types for committed resources, with different acceptance mechanisms: - [*] --> Planned : state=planned - [*] --> Reserving : state=pending / guaranteed / confirmed - Planned --> Reserving : state changes to pending/guaranteed/confirmed - Reserving --> Active : placement succeeded - Reserving --> Rejected : placement failed — pending, or AllowRejection=true - Reserving --> Reserving : placement failed — retrying (AllowRejection=false) - Active --> Reserving : spec changed (e.g. resize) - Active --> [*] : state=superseded / expired - Rejected --> [*] : deleted - Planned --> [*] : deleted -``` +**Memory (`_ram`)** — Cortex creates and manages `Reservation` CRDs on specific hypervisors. Acceptance means Cortex can place the required number of reservation slots via the scheduling pipeline. If placement is impossible (no hosts with enough free memory), the commitment is rejected or retried depending on the commitment state and `AllowRejection` flag. -#### CommittedResource Controller +**CPU cores (`_cores`)** — No `Reservation` CRDs are created. Cortex checks whether sufficient CPU headroom exists by comparing the requested cores against the total CPU capacity for the flavor group and AZ (as reported by the `FlavorGroupCapacity` CRD) minus cores already committed by other active CRs. This is a lightweight arithmetic check that does not interact with the scheduling pipeline. + +The two types share the same lifecycle states and the same acceptance/rejection semantics — they differ only in how capacity is verified and held. -The controller's job is to keep child `Reservation` CRDs in sync with the desired state expressed in `Spec.Amount`. The key rules: +#### CommittedResource Controller -- **`pending`**: Cortex is being asked for a yes/no decision. If placement fails for any reason, child Reservations are removed and the CR is marked Rejected. The caller (e.g. the change-commitments API) reads the outcome and reports back to Limes. No retry. +The controller accepts or rejects commitments and keeps the allocated capacity in sync with what Limes expects. -- **`guaranteed` / `confirmed`**: Cortex is expected to honour the commitment. 
The default is to keep retrying until placement succeeds (`Ready=False, Reason=Reserving`). Callers that can accept "no" as an answer set `Spec.AllowRejection=true` (the change-commitments API sets this for confirming requests — new commitments, resizes); the controller then rejects on failure instead of retrying. +**`pending`** — Cortex is being asked for a yes/no answer. A single acceptance attempt is made. On failure, the commitment is rejected and all held capacity is released. No retry. -- **On rejection**: rolls back child Reservations to the last successfully placed quantity (`Status.AcceptedAmount`). For a CR that was never accepted, this means removing all child Reservations. +**`guaranteed` / `confirmed`** — Cortex is expected to honour the commitment indefinitely. The default is to keep retrying on failure (`Ready=False, Reason=Reserving`). Callers that can tolerate rejection set `AllowRejection=true`; the controller then rejects on failure rather than retrying. -The controller communicates with the Reservation controller only through CRDs — no direct calls. +**On rejection** — any capacity held for this CR is rolled back to the last successfully accepted amount (or fully released if never accepted). **Reconcile trigger flow:** @@ -135,15 +133,45 @@ sequenceDiagram API->>CRCRD: write (create/update) CRCRD-->>CRCtrl: watch fires - CRCtrl->>ResCRD: create/update child slots + CRCtrl->>ResCRD: create/update child slots (memory only) ResCRD-->>ResCtrl: watch fires ResCtrl->>ResCRD: update (ObservedParentGeneration, Ready=True/False) ResCRD-->>CRCtrl: watch fires (Reservation→parent CR lookup) CRCtrl->>CRCRD: update status (Accepted / Reserving / Rejected) ``` +For cores commitments the middle steps (Reservation CRUD, Reservation controller) are skipped — the CR controller updates the `CommittedResource` status directly after the arithmetic check. + +**CommittedResource status states:** + +```mermaid +stateDiagram-v2 + direction LR + state "Planned (Ready=False)" as Planned + state "Reserving (Ready=False)" as Reserving + state "Active (Ready=True)" as Active + state "Rejected (Ready=False)" as Rejected + + [*] --> Planned : state=planned + [*] --> Reserving : pending/guaranteed/confirmed (memory) + [*] --> Active : pending/guaranteed/confirmed (cores, ok) + [*] --> Rejected : pending/guaranteed/confirmed (cores, fail) + Planned --> Reserving : state activates (memory) + Planned --> Active : state activates (cores, ok) + Reserving --> Active : placement succeeded + Reserving --> Rejected : failed - pending or AllowRejection=true + Reserving --> Reserving : failed - retrying (AllowRejection=false) + Active --> Reserving : spec changed (resize) - memory + Active --> Active : spec changed (resize) - cores + Active --> [*] : state=superseded / expired + Rejected --> [*] : deleted + Planned --> [*] : deleted +``` + ### Reservation Lifecycle +*Applies to memory commitments only. 
Cores commitments do not create Reservations.* + | Component | Event | Timing | Action | |-----------|-------|--------|--------| | **Reservation Controller** | `Reservation` created | Immediate (watch) | Find host via scheduler API, set `TargetHost` | @@ -178,7 +206,6 @@ flowchart LR **VM allocation state diagram**: The controller uses the **Hypervisor CRD** as the sole source of truth for VM allocation verification: -- **Hypervisor CRD** — used for all allocation checks; reflects the set of instances the hypervisor operator observes on the host ```mermaid stateDiagram-v2 @@ -234,7 +261,7 @@ When a reservation is being migrated to a new host, block the full `max(Spec.Res #### Reservation Controller -The `Reservation` controller (`CommitmentReservationController`) watches `Reservation` CRDs and `Hypervisor` CRDs. `MaxConcurrentReconciles=1` prevents overbooking during concurrent placements. +The `Reservation` controller watches `Reservation` CRDs and `Hypervisor` CRDs. `MaxConcurrentReconciles=1` prevents overbooking during concurrent placements. **Placement** — finds hosts for new reservations (calls scheduler API) @@ -245,6 +272,15 @@ The `Reservation` controller (`CommitmentReservationController`) watches `Reserv **Reservation migration is not supported yet.** +### Info API + +`GET /commitments/v1/info` — describes the full service to Limes: which flavor groups are active, what resource types each group exposes (ram, cores, instances), their units, LIQUID topologies, and whether each accepts commitments. + +- RAM resources with `HandlesCommitments=true` use `AZSeparatedTopology` — Limes treats quota as AZ-specific and sends per-AZ breakdowns in quota requests. +- All other resources (cores, instances, and RAM without commitments) use `AZAwareTopology` — no per-AZ quota. + +Limes calls this endpoint once on startup and whenever the service description changes. + ### Change-Commitments API The change-commitments API receives batched commitment changes from Limes and applies them using a **write-intent, watch-for-outcome** pattern: the handler creates or updates `CommittedResource` CRDs and polls their `Status.Conditions` until each reaches a terminal state — it does not interact with `Reservation` CRDs directly. @@ -256,17 +292,31 @@ The change-commitments API receives batched commitment changes from Limes and ap 2. Poll `CommittedResource.Status.Conditions[Ready]` until each reaches a terminal state: `Reason=Accepted` (success), `Reason=Planned` (deferred; accepted), or `Reason=Rejected` (failure) — only for confirming changes; non-confirming changes return immediately without polling 3. On any failure or timeout, restore all modified `CommittedResource` CRDs to their pre-request specs (or delete newly-created ones) -The `CommittedResource` controller handles all downstream `Reservation` CRUD. `AllowRejection=true` tells it to reject and roll back child Reservations on placement failure rather than retrying indefinitely. +The `CommittedResource` controller handles all downstream work. `AllowRejection=true` tells it to reject and roll back on failure rather than retrying indefinitely. -### Syncer Task +### Quota API + +`PUT /commitments/v1/projects/:project_id/quota` — receives the project's quota allocation from Limes and persists it as `ProjectQuota` CRDs, one per (project × availability zone) combination, named `quota-{projectID}-{az}`. 
For flavor groups with `HandlesCommitments=true`, Limes sends per-AZ quota breakdowns; each AZ gets its own CRD with a flat `Quota map[string]int64` holding per-resource quota values for that zone. The quota controller then reconciles usage into each CRD's status (`TotalUsage`, `PaygUsage`). Writes are idempotent; concurrent writes are resolved with retry-on-conflict. -The syncer task runs periodically and syncs local `CommittedResource` CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts. It writes `CommittedResource` CRDs only — Reservation CRUD is the controller's responsibility. +### Report-Usage API -### Usage API +`POST /commitments/v1/projects/:project_id/report-usage` — reports current resource usage for a project. For each flavor group `X` that accepts commitments, Cortex exposes three resource types: - `hw_version_X_ram` — RAM in units of the smallest flavor in the group (`HandlesCommitments=true`) -- `hw_version_X_cores` — CPU cores derived from RAM via fixed ratio (`HandlesCommitments=false`) +- `hw_version_X_cores` — CPU cores (`HandlesCommitments=false`; derived from RAM via fixed ratio where applicable) - `hw_version_X_instances` — instance count (`HandlesCommitments=false`) +For flavor groups with `HandlesCommitments=true`, the response includes per-AZ quota from the `ProjectQuota` CRDs (written by the Quota API). + +VM-to-commitment assignment is read from pre-computed `CommittedResource.Status` fields rather than being calculated inline at request time. A dedicated **usage reconciler** (in `internal/scheduling/reservations/commitments/usage_reconciler.go`) watches `CommittedResource` and `Hypervisor` CRDs and periodically runs the deterministic assignment algorithm, writing `AssignedInstances`, `UsedResources`, `LastUsageReconcileAt`, and `UsageObservedGeneration` into each CommittedResource's status. The Report-Usage endpoint reads these status fields to determine which VMs belong to which commitment. If a CR has not yet been reconciled, its VMs appear as PAYG until the first usage reconcile completes. + For each VM, the API reports whether it accounts to a specific commitment or PAYG. This assignment is deterministic and may differ from the actual Cortex internal assignment used for scheduling. + +### Report-Capacity API + +`POST /commitments/v1/report-capacity` — reports available hypervisor capacity per flavor group and AZ. Capacity data is pre-computed by the capacity controller and stored in `FlavorGroupCapacity` CRDs; the endpoint aggregates these per-AZ values into the response. If a `FlavorGroupCapacity` CRD is stale (controller behind), the endpoint reports total capacity without subtracting usage to avoid underreporting. + +### Syncer Task + +The syncer task runs periodically and syncs local `CommittedResource` CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts. It writes `CommittedResource` CRDs only — capacity management is the controller's responsibility. 
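The cores acceptance path described under "Resource types" reduces to a headroom comparison against pre-computed capacity, with no scheduling pipeline involved. A minimal sketch under that reading, assuming the totals come from the `FlavorGroupCapacity` CRD status; names are illustrative:

```go
package commitments

// canAcceptCores performs the arithmetic headroom check for a cores
// commitment: requested cores must fit into the flavor group's total CPU
// capacity in this AZ minus cores already committed by other active CRs.
func canAcceptCores(requested, totalCapacityCores, committedCores int64) bool {
	headroom := totalCapacityCores - committedCores
	return requested <= headroom
}
```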
diff --git a/go.mod b/go.mod index 52da7fef1..be314d40e 100644 --- a/go.mod +++ b/go.mod @@ -3,14 +3,14 @@ module github.com/cobaltcore-dev/cortex go 1.26.0 require ( - github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.0 + github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.2 github.com/go-gorp/gorp v2.2.0+incompatible github.com/gophercloud/gophercloud/v2 v2.12.0 github.com/ironcore-dev/ironcore v0.3.0 - github.com/majewsky/gg v1.6.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/sapcc/go-bits v0.0.0-20260423021225-fb5e4523b6c5 + github.com/sapcc/go-bits v0.0.0-20260507090738-58bd3afe1717 + go.xyrillian.de/gg v1.7.0 k8s.io/api v0.36.0 k8s.io/apimachinery v0.36.0 k8s.io/client-go v0.36.0 @@ -87,7 +87,7 @@ require ( github.com/poy/onpar v0.3.5 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.19.2 // indirect - github.com/sapcc/go-api-declarations v1.21.0 + github.com/sapcc/go-api-declarations v1.22.0 github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.2 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/go.sum b/go.sum index 638b047bd..0003315aa 100644 --- a/go.sum +++ b/go.sum @@ -20,8 +20,8 @@ github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.0 h1:XYVIKTC19dj4jck2uinYzTNXcoED5HNTvv+BJ75M2E0= -github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.0/go.mod h1:iuhqhW6ozxfYWbGlEeh9rW9xyTb/EgelkDJqzJXBclk= +github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.2 h1:qROHDCT/5iwbeUHoFSUeiHwEGaeNnOLYj0OGIlFBu5o= +github.com/cobaltcore-dev/openstack-hypervisor-operator v1.2.2/go.mod h1:vEKwzkDzZwnSd0VRnG+Q1bEzLKe0SWW1ugBAUVqrkY8= github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4= github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= @@ -119,8 +119,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc= -github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg= +github.com/google/pprof v0.0.0-20260402051712-545e8a4df936/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gophercloud/gophercloud/v2 v2.12.0 h1:Gxmc/Bog1UDKkxTcQW7MSPTDviJXpLeEgVeN5KrxoCo= @@ -153,8 +153,6 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod 
h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= -github.com/majewsky/gg v1.6.0 h1:QyUP+a1YHlCRmcvAlyVhOnqdpeDQogmAygQaeGU0VPc= -github.com/majewsky/gg v1.6.0/go.mod h1:KC7qUlln1VBY90OE0jXMNjXW2b9B4jJ1heYQ08OzeAg= github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8= github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= @@ -174,10 +172,10 @@ github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7P github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo v1.16.4 h1:29JGrr5oVBm5ulCWet69zQkzWipVXIol6ygQUe/EzNc= -github.com/onsi/ginkgo/v2 v2.28.2 h1:DTrMfpqxiNUyQ3Y0zhn1n3cOO2euFgQPYIpkWwxVFps= -github.com/onsi/ginkgo/v2 v2.28.2/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= -github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= -github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/onsi/ginkgo/v2 v2.28.3 h1:4JvMdwtFU0imd8fHx25OJXoDMRexnf8v5NHKYSTTji4= +github.com/onsi/ginkgo/v2 v2.28.3/go.mod h1:+aXOY+vzZ5mu2iI2HpTZUPmM//oQfsNFX6gU9kNcA44= +github.com/onsi/gomega v1.40.0 h1:Vtol0e1MghCD2ZVIilPDIg44XSL9l2QAn8ZNaljWcJc= +github.com/onsi/gomega v1.40.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= @@ -204,10 +202,10 @@ github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05Zp github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/sapcc/go-api-declarations v1.21.0 h1:Ag6GXgJLTFdBDKmrJU4QFllQbgGSenSGeHpLuvuxeDk= -github.com/sapcc/go-api-declarations v1.21.0/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc= -github.com/sapcc/go-bits v0.0.0-20260423021225-fb5e4523b6c5 h1:Ynw89El0yMVQnKOaBVwgSoB7FZVNbwxmuH3DQqofNWI= -github.com/sapcc/go-bits v0.0.0-20260423021225-fb5e4523b6c5/go.mod h1:ISqYagZwY8jfyy5gwEk3CjAUi9J+KOsAdRiQxu3eglU= +github.com/sapcc/go-api-declarations v1.22.0 h1:nU/eJ6OO54Z9YSo1gWinD0A2etrfZObCwYdB9xA0VWE= +github.com/sapcc/go-api-declarations v1.22.0/go.mod h1:x3V8bzg7Y4kmbA+DeWWwKteFEdCCSiVQdwRXj4fGAYY= +github.com/sapcc/go-bits v0.0.0-20260507090738-58bd3afe1717 h1:6H8zcKP7+1nc9cpD/SY2pH3izRPojIsNza+Pp+d4FFk= +github.com/sapcc/go-bits v0.0.0-20260507090738-58bd3afe1717/go.mod h1:HlbUxBX5jObpd6owpG11w/RcnffZF0m0zLtsipqev48= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= @@ -261,6 +259,8 @@ go.uber.org/multierr v1.11.0 
h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= +go.xyrillian.de/gg v1.7.0 h1:IA0BJaX9TtBD7crH+CSoK4lYmBk5zi7nUQd0YRzPNf0= +go.xyrillian.de/gg v1.7.0/go.mod h1:dj+ZhCwC6JKWyFvImhVNXQAErrRcYMUkXu6vwWYNrzQ= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= @@ -269,8 +269,8 @@ go4.org/netipx v0.0.0-20231129151722-fdeea329fbba h1:0b9z3AuHCjxk0x/opv64kcgZLBs go4.org/netipx v0.0.0-20231129151722-fdeea329fbba/go.mod h1:PLyyIXexvUFg3Owu6p/WfdlivPbZJsZdgWZlrGope/Y= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= -golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI= -golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY= +golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= +golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= @@ -287,8 +287,8 @@ golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= -golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= -golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= +golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= +golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 0ae840f15..c859282c8 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. 
type: application -version: 0.0.60 +version: 0.0.61 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.6.0 + version: 0.6.1 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 633fe00e7..7a4672b1f 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.60 +version: 0.0.61 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 571e1a243..cbb2574ea 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.60 +version: 0.0.61 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 93ab402d2..849523e20 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.60 +version: 0.0.61 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.6.0 + version: 0.6.1 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index feff1ae83..ebf4fdba9 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. 
type: application -version: 0.0.60 +version: 0.0.61 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.6.0 + version: 0.6.1 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/templates/pipelines.yaml b/helm/bundles/cortex-nova/templates/pipelines.yaml index e1abb1969..0c2dcd274 100644 --- a/helm/bundles/cortex-nova/templates/pipelines.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines.yaml @@ -75,42 +75,4 @@ spec: type: filter-weigher createHistory: false filters: [] - weighers: - - name: vmware_binpack - params: - - {key: resourceWeights, floatMapValue: {"memory": 1.0}} - description: | - This step implements a binpacking weigher for workloads on vmware hypervisors. - It pulls the requested vm into the smallest gaps possible, to ensure - other hosts with less allocation stay free for bigger vms. - In this pipeline, the binpacking will focus on hana virtual machines. - - name: vmware_avoid_long_term_contended_hosts - description: | - This step avoids placing vms on vmware hosts with a high CPU contention over - a longer period of time, based on vrops contention metrics. In particular, - this step looks at a longer time window of 4 weeks to identify hosts that - are consistently contended. - params: - - {key: avgCPUContentionLowerBound, floatValue: 0} # pct - - {key: avgCPUContentionUpperBound, floatValue: 10} # pct - - {key: avgCPUContentionActivationLowerBound, floatValue: 0.0} - - {key: avgCPUContentionActivationUpperBound, floatValue: -0.75} - - {key: maxCPUContentionLowerBound, floatValue: 0} # pct - - {key: maxCPUContentionUpperBound, floatValue: 10} # pct - - {key: maxCPUContentionActivationLowerBound, floatValue: 0.0} - - {key: maxCPUContentionActivationUpperBound, floatValue: -0.25} - - name: vmware_avoid_short_term_contended_hosts - description: | - This step avoids placing vms on vmware hosts with a high CPU contention over - a shorter period of time, based on vrops contention metrics. In particular, - this step looks at a shorter time window of 20 minutes to identify hosts that - are currently contended. - params: - - {key: avgCPUContentionLowerBound, floatValue: 0} # pct - - {key: avgCPUContentionUpperBound, floatValue: 10} # pct - - {key: avgCPUContentionActivationLowerBound, floatValue: 0.0} - - {key: avgCPUContentionActivationUpperBound, floatValue: -0.75} - - {key: maxCPUContentionLowerBound, floatValue: 0} # pct - - {key: maxCPUContentionUpperBound, floatValue: 10} # pct - - {key: maxCPUContentionActivationLowerBound, floatValue: 0.0} - - {key: maxCPUContentionActivationUpperBound, floatValue: -0.25} \ No newline at end of file + weighers: [] diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index b952d4322..5c998ec5b 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. 
type: application -version: 0.0.60 +version: 0.0.61 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.47 + version: 0.0.48 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/dev/cortex-prometheus-operator/Chart.yaml b/helm/dev/cortex-prometheus-operator/Chart.yaml index fb2ea1e8d..05ec29eeb 100644 --- a/helm/dev/cortex-prometheus-operator/Chart.yaml +++ b/helm/dev/cortex-prometheus-operator/Chart.yaml @@ -10,4 +10,4 @@ dependencies: # CRDs of the prometheus operator, such as PrometheusRule, ServiceMonitor, etc. - name: kube-prometheus-stack repository: oci://ghcr.io/prometheus-community/charts - version: 84.4.0 + version: 84.5.0 diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml index 2e0ab6abb..c9984af54 100644 --- a/helm/library/cortex-postgres/Chart.yaml +++ b/helm/library/cortex-postgres/Chart.yaml @@ -5,5 +5,5 @@ apiVersion: v2 name: cortex-postgres description: Postgres setup for Cortex. type: application -version: 0.6.0 -appVersion: "sha-88f03a41" +version: 0.6.1 +appVersion: "sha-24773a48" diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 1e1e9a4fa..f82fe0576 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. type: application -version: 0.0.47 -appVersion: "sha-f9c27d07" +version: 0.0.48 +appVersion: "sha-f3c2ce54" icon: "https://example.com/icon.png" dependencies: [] diff --git a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml index bf63aea12..d431d9dea 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_committedresources.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: committedresources.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_datasources.yaml b/helm/library/cortex/files/crds/cortex.cloud_datasources.yaml index 1b0815fef..9a2d32bbc 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_datasources.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_datasources.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: datasources.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_decisions.yaml b/helm/library/cortex/files/crds/cortex.cloud_decisions.yaml index 7bb677ba4..1d7c38ea1 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_decisions.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_decisions.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: decisions.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_deschedulings.yaml 
b/helm/library/cortex/files/crds/cortex.cloud_deschedulings.yaml index 8fab9aa72..b01e8d2f4 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_deschedulings.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_deschedulings.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: deschedulings.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml index 5f475689e..73a009ba4 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: flavorgroupcapacities.cortex.cloud spec: group: cortex.cloud @@ -174,6 +174,16 @@ spec: reconcile. format: date-time type: string + totalCapacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: TotalCapacity is the total capacity of all eligible hosts + in an empty-datacenter scenario. + type: object totalInstances: description: |- TotalInstances is the total number of VM instances running on hypervisors in this AZ, diff --git a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml index 2b6049797..693f776e5 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: histories.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml index 83866b0a9..5a4cba037 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: knowledges.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_kpis.yaml b/helm/library/cortex/files/crds/cortex.cloud_kpis.yaml index 5789ccf17..b7637fa24 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_kpis.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_kpis.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: kpis.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml b/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml index a79532d4b..80c5497aa 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml +++ 
b/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: pipelines.cortex.cloud spec: group: cortex.cloud diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml index c9183638b..57354cb44 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: projectquotas.cortex.cloud spec: group: cortex.cloud @@ -18,6 +18,9 @@ spec: - jsonPath: .spec.projectID name: Project type: string + - jsonPath: .spec.availabilityZone + name: AZ + type: string - jsonPath: .spec.domainID name: Domain type: string @@ -34,6 +37,8 @@ spec: ProjectQuota is the Schema for the projectquotas API. It persists quota values pushed by Limes via the LIQUID quota endpoint (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). + Each CRD stores quota for one project in one availability zone. + In a multi-cluster setup, it is routed to the cluster serving that AZ. See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest properties: apiVersion: @@ -57,8 +62,15 @@ spec: description: |- ProjectQuotaSpec defines the desired state of ProjectQuota. Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). + Each ProjectQuota CRD represents quota for ONE project in ONE availability zone. See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest properties: + availabilityZone: + description: |- + AvailabilityZone is the AZ this quota CRD covers (e.g. "qa-de-1a"). + In a multi-cluster setup, this determines which cluster the CRD is routed to. + minLength: 1 + type: string domainID: description: |- DomainID of the OpenStack domain this project belongs to. @@ -81,38 +93,16 @@ spec: type: string quota: additionalProperties: - description: |- - ResourceQuota holds the quota for a single resource with per-AZ breakdown. - Maps to liquid.ResourceQuotaRequest from the LIQUID API. - See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest - properties: - perAZ: - additionalProperties: - format: int64 - type: integer - description: |- - PerAZ holds the per-availability-zone quota breakdown. - Key: availability zone name, Value: quota for that AZ. - Only populated for AZSeparatedTopology resources. - Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. - See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest - type: object - quota: - description: |- - Quota is the total quota across all AZs (for compatibility). - Corresponds to liquid.ResourceQuotaRequest.Quota. - format: int64 - type: integer - required: - - quota - type: object + format: int64 + type: integer description: |- - Quota maps LIQUID resource names to their per-AZ quota. + Quota maps LIQUID resource names to their quota value for THIS availability zone. Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") - Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. 
- See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + Value: per-AZ quota from liquid.AZResourceQuotaRequest.Quota + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest type: object required: + - availabilityZone - domainID - projectID type: object @@ -200,39 +190,19 @@ spec: type: integer paygUsage: additionalProperties: - description: ResourceQuotaUsage holds per-AZ PAYG usage for a single - resource. - properties: - perAZ: - additionalProperties: - format: int64 - type: integer - description: |- - PerAZ holds per-availability-zone PAYG usage values. - Key: availability zone name, Value: PAYG usage in that AZ. - type: object - type: object + format: int64 + type: integer description: |- - PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + PaygUsage tracks per-resource pay-as-you-go usage in this AZ. Derived as TotalUsage - CRUsage (clamped >= 0). Key: liquid.ResourceName type: object totalUsage: additionalProperties: - description: ResourceQuotaUsage holds per-AZ PAYG usage for a single - resource. - properties: - perAZ: - additionalProperties: - format: int64 - type: integer - description: |- - PerAZ holds per-availability-zone PAYG usage values. - Key: availability zone name, Value: PAYG usage in that AZ. - type: object - type: object + format: int64 + type: integer description: |- - TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + TotalUsage tracks per-resource total resource consumption in this AZ (all VMs in this project+AZ). Persisted by the quota controller; updated by full reconcile and HV instance diffs. Key: liquid.ResourceName type: object diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index a30b7d221..b4c5bfa6f 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.20.1 + controller-gen.kubebuilder.io/version: v0.21.0 name: reservations.cortex.cloud spec: group: cortex.cloud diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.go b/internal/knowledge/extractor/plugins/compute/flavor_groups.go index 1dacb9d38..fd5d31b48 100644 --- a/internal/knowledge/extractor/plugins/compute/flavor_groups.go +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.go @@ -35,7 +35,7 @@ type FlavorGroupFeature struct { // The largest flavor in the group (used for reservation slot sizing) LargestFlavor FlavorInGroup `json:"largestFlavor"` - // The smallest flavor in the group (used for CR size quantification) + // The smallest flavor in the group SmallestFlavor FlavorInGroup `json:"smallestFlavor"` // RAM-to-core ratio in MiB per vCPU (MemoryMB / VCPUs). 
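The RAM-to-core ratio noted in the `FlavorGroupFeature` comment above (MemoryMB / VCPUs, in MiB per vCPU) is what lets `hw_version_X_cores` be derived from RAM, as the design doc earlier in this diff describes. A minimal sketch under that assumption, with illustrative names rather than the actual Cortex helpers:

```go
package compute

// coresForRAM converts a RAM amount into the equivalent number of vCPUs
// using the flavor group's fixed ratio (MiB of RAM per vCPU).
func coresForRAM(ramMiB, ramPerCoreMiB int64) int64 {
	if ramPerCoreMiB <= 0 {
		return 0 // guard against malformed flavor data
	}
	return ramMiB / ramPerCoreMiB
}
```

For a group whose flavors all carry 8192 MiB per vCPU, 65536 MiB of committed RAM translates to 8 cores.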
diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go index 20f0ca32e..f80b9bd22 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization.go @@ -117,8 +117,8 @@ func (k *KVMProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) { hostLabels := host.getHostLabels() hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName, projectCapacityUsage.DomainID, projectCapacityUsage.DomainName) - ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...) - ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "memory")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "cpu")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "ram")...) ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalDiskGB*1024*1024*1024, append(hostLabels, "disk")...) } } diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go index 9b919c676..5ba039095 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_project_utilization_test.go @@ -489,8 +489,8 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedKVMMetric{ kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 4096*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, @@ -519,12 +519,12 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-2", 1), kvmInstanceMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "flavor-1", 1), // node001-bb01/project-1: 1*flavor-1 + 1*flavor-2 - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 6), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 12288*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 6), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 12288*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", 
"Project One", "domain-1", "Domain One", "disk", 3*1024*1024*1024), // node002-bb02/project-2: 1*flavor-1 - kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "vcpu", 2), - kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "memory", 4096*1024*1024), + kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "cpu", 2), + kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "ram", 4096*1024*1024), kvmCapacityMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, @@ -542,8 +542,8 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedKVMMetric{ kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 4096*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, @@ -566,8 +566,8 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedKVMMetric{ kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-3", 1), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 8), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 16384*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 8), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 16384*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 4*1024*1024*1024), }, }, @@ -589,11 +589,11 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { expectedMetrics: []collectedKVMMetric{ kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), kvmInstanceMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 4), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 2*4096*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), - kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), - kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + kvmCapacityMetric("node002-bb02", "az2", 
"project-1", "Project One", "domain-1", "Domain One", "cpu", 4), + kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "ram", 2*4096*1024*1024), kvmCapacityMetric("node002-bb02", "az2", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), }, }, @@ -622,8 +622,8 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { expectedMetrics: []collectedKVMMetric{ // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "flavor-1", 1), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "vcpu", 2), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "cpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "ram", 4096*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-unknown", "", "disk", 1*1024*1024*1024), }, }, @@ -640,8 +640,8 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedKVMMetric{ kvmInstanceMetric("node001-bb01", "az1", "project-1", "", "", "", "flavor-1", 1), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "vcpu", 2), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "memory", 4096*1024*1024), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "cpu", 2), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "ram", 4096*1024*1024), kvmCapacityMetric("node001-bb01", "az1", "project-1", "", "", "", "disk", 1*1024*1024*1024), }, }, @@ -658,8 +658,8 @@ func TestKVMProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedKVMMetric{ kvmInstanceMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-missing", 1), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 0), - kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 0), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 0), + kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 0), kvmCapacityMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 0), }, }, diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go index 14744b205..95f5765b6 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments.go @@ -43,12 +43,12 @@ func (k *VMwareProjectCommitmentsKPI) Init(dbConn *db.DB, c client.Client, opts } k.unusedGeneralPurposeCommitmentsPerProject = prometheus.NewDesc( - "cortex_vmware_commitments_general_purpose", + "cortex_vmware_unused_commitments_general_purpose", "Committed general purpose resources that are currently unused. 
CPU (resource=cpu) in vCPUs, memory (resource=ram) in bytes.", []string{"availability_zone", "resource", "project_id", "project_name", "domain_id", "domain_name"}, nil, ) k.unusedHanaCommittedResourcesPerProject = prometheus.NewDesc( - "cortex_vmware_commitments_hana_resources", + "cortex_vmware_unused_commitments_hana_resources", "Total committed HANA instances capacity that is currently unused, translated to resources. CPU in vCPUs, memory and disk in bytes.", []string{"availability_zone", "cpu_architecture", "resource", "project_id", "project_name", "domain_id", "domain_name"}, nil, ) diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go index 8978f76a9..b33cdbca7 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_commitments_test.go @@ -65,12 +65,12 @@ func collectProjectCommitmentsMetrics(t *testing.T, testDB *db.DB) map[string]fl // gpKey builds the expected map key for a general-purpose metric. // cpu_architecture is always empty because the GP metric descriptor omits that label. func gpKey(az, resource string, p projectWithDomain) string { - return "cortex_vmware_commitments_general_purpose|" + az + "||" + resource + "|" + p.ProjectID + "|" + p.ProjectName + "|" + p.DomainID + "|" + p.DomainName + return "cortex_vmware_unused_commitments_general_purpose|" + az + "||" + resource + "|" + p.ProjectID + "|" + p.ProjectName + "|" + p.DomainID + "|" + p.DomainName } // hKey builds the expected map key for a HANA metric. func hKey(az, cpuArch, resource string, p projectWithDomain) string { - return "cortex_vmware_commitments_hana_resources|" + az + "|" + cpuArch + "|" + resource + "|" + p.ProjectID + "|" + p.ProjectName + "|" + p.DomainID + "|" + p.DomainName + return "cortex_vmware_unused_commitments_hana_resources|" + az + "|" + cpuArch + "|" + resource + "|" + p.ProjectID + "|" + p.ProjectName + "|" + p.DomainID + "|" + p.DomainName } func TestVMwareProjectCommitmentsKPI_Init(t *testing.T) { diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go index 368ebd194..6a85756ff 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization.go @@ -119,8 +119,8 @@ func (k *VMwareProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) { hostLabels := host.getHostLabels() hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName, projectCapacityUsage.DomainID, projectCapacityUsage.DomainName) - ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...) - ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "memory")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "cpu")...) + ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "ram")...) 
ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalDiskGB*1024*1024*1024, append(hostLabels, "disk")...) } } diff --git a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go index 853b88d2c..15f18b894 100644 --- a/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go +++ b/internal/knowledge/kpis/plugins/infrastructure/vmware_project_utilization_test.go @@ -519,8 +519,8 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedVMwareMetric{ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 4096*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, @@ -549,12 +549,12 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-2", 1), instanceMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "flavor-1", 1), // nova-compute-1/project-1: 1*flavor-1 + 1*flavor-2 - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 6), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 12288*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 6), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 12288*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 3*1024*1024*1024), // nova-compute-2/project-2: 1*flavor-1 - capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "vcpu", 2), - capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "memory", 4096*1024*1024), + capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "cpu", 2), + capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "ram", 4096*1024*1024), capacityMetric("nova-compute-2", "az2", "project-2", "Project Two", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, @@ -573,8 +573,8 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedVMwareMetric{ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 2), + 
capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 4096*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 1*1024*1024*1024), }, }, @@ -597,8 +597,8 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedVMwareMetric{ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-3", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 8), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 16384*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 8), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 16384*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 4*1024*1024*1024), }, }, @@ -620,11 +620,11 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { expectedMetrics: []collectedVMwareMetric{ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), instanceMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "flavor-1", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 4), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 2*4096*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), - capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 4), - capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "memory", 2*4096*1024*1024), + capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "cpu", 4), + capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "ram", 2*4096*1024*1024), capacityMetric("nova-compute-2", "az2", "project-1", "Project One", "domain-1", "Domain One", "disk", 2*1024*1024*1024), }, }, @@ -642,8 +642,8 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { expectedMetrics: []collectedVMwareMetric{ // The domain_id is extracted from the project record, so it should be "domain-unknown" even though there is no matching domain entry instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "cpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "ram", 4096*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-unknown", "", "disk", 1*1024*1024*1024), }, }, @@ -660,8 +660,8 @@ func 
TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedVMwareMetric{ instanceMetric("nova-compute-1", "az1", "project-1", "", "", "", "flavor-1", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "vcpu", 2), - capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "memory", 4096*1024*1024), + capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "cpu", 2), + capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "ram", 4096*1024*1024), capacityMetric("nova-compute-1", "az1", "project-1", "", "", "", "disk", 1*1024*1024*1024), }, }, @@ -678,8 +678,8 @@ func TestVMwareProjectUtilizationKPI_Collect(t *testing.T) { }, expectedMetrics: []collectedVMwareMetric{ instanceMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "flavor-missing", 1), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "vcpu", 0), - capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "memory", 0), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "cpu", 0), + capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "ram", 0), capacityMetric("nova-compute-1", "az1", "project-1", "Project One", "domain-1", "Domain One", "disk", 0), }, }, diff --git a/internal/scheduling/nova/candidate_gatherer_test.go b/internal/scheduling/nova/candidate_gatherer_test.go index 551e8942b..9f2b9a3b4 100644 --- a/internal/scheduling/nova/candidate_gatherer_test.go +++ b/internal/scheduling/nova/candidate_gatherer_test.go @@ -11,12 +11,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestCandidateGatherer_MutateWithAllCandidates(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("failed to build scheme: %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_allowed_projects_test.go b/internal/scheduling/nova/plugins/filters/filter_allowed_projects_test.go index 070160e2e..53a4ac958 100644 --- a/internal/scheduling/nova/plugins/filters/filter_allowed_projects_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_allowed_projects_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestFilterAllowedProjectsStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_capabilities_test.go b/internal/scheduling/nova/plugins/filters/filter_capabilities_test.go index 9b5f111dc..76c2131f0 100644 --- a/internal/scheduling/nova/plugins/filters/filter_capabilities_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_capabilities_test.go @@ -10,6 +10,7 @@ import ( api 
"github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -134,7 +135,8 @@ func TestHvToNovaCapabilities(t *testing.T) { } func TestFilterCapabilitiesStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } @@ -585,7 +587,8 @@ func TestFilterCapabilitiesStep_Run(t *testing.T) { // because subsequent filters (like filter_has_requested_traits) also need to access // the full ExtraSpecs including non-capability keys like trait:*. func TestFilterCapabilitiesStep_DoesNotMutateExtraSpecs(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_correct_az_test.go b/internal/scheduling/nova/plugins/filters/filter_correct_az_test.go index d8389de9e..9b4df454e 100644 --- a/internal/scheduling/nova/plugins/filters/filter_correct_az_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_correct_az_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestFilterCorrectAZStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go b/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go index 05bdbc6f6..87b2f571c 100644 --- a/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_external_customer_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestFilterExternalCustomerStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_has_accelerators_test.go b/internal/scheduling/nova/plugins/filters/filter_has_accelerators_test.go index 1d1a06764..8610e3132 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_accelerators_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_accelerators_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func 
TestFilterHasAcceleratorsStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_has_requested_traits_test.go b/internal/scheduling/nova/plugins/filters/filter_has_requested_traits_test.go index 10a7c94aa..4b57737da 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_requested_traits_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_requested_traits_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestFilterHasRequestedTraits_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_instance_group_anti_affinity_test.go b/internal/scheduling/nova/plugins/filters/filter_instance_group_anti_affinity_test.go index 6eea6bc7f..931265b9b 100644 --- a/internal/scheduling/nova/plugins/filters/filter_instance_group_anti_affinity_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_instance_group_anti_affinity_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestFilterInstanceGroupAntiAffinityStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/nova/plugins/filters/filter_status_conditions_test.go b/internal/scheduling/nova/plugins/filters/filter_status_conditions_test.go index adbfc8c65..7b35de033 100644 --- a/internal/scheduling/nova/plugins/filters/filter_status_conditions_test.go +++ b/internal/scheduling/nova/plugins/filters/filter_status_conditions_test.go @@ -10,12 +10,14 @@ import ( api "github.com/cobaltcore-dev/cortex/api/external/nova" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestFilterStatusConditionsStep_Run(t *testing.T) { - scheme, err := hv1.SchemeBuilder.Build() + scheme := runtime.NewScheme() + err := hv1.AddToScheme(scheme) if err != nil { t.Fatalf("expected no error, got %v", err) } diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index eba8a9fec..76a8aaf9a 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -15,6 +15,7 @@ import ( "github.com/google/uuid" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ctrl 
"sigs.k8s.io/controller-runtime" @@ -172,10 +173,49 @@ func (c *Controller) reconcileOne( committedCapacity = 0 } + // Compute TotalCapacity: for each flavor multiply slot count by its RAM/CPU, + // then take the max across all flavors independently for each resource. + // This reveals the most capacity because the flavor best matching the host's + // resource ratio saturates more resources and produces a higher product. + flavorSpecByName := make(map[string]compute.FlavorInGroup, len(groupData.Flavors)) + for _, f := range groupData.Flavors { + flavorSpecByName[f.Name] = f + } + var maxMemBytes, maxCPUCores int64 + for _, f := range newFlavors { + spec, ok := flavorSpecByName[f.FlavorName] + if !ok || f.TotalCapacityVMSlots <= 0 { + continue + } + memBytes := f.TotalCapacityVMSlots * int64(spec.MemoryMB) * 1024 * 1024 //nolint:gosec + cpuCores := f.TotalCapacityVMSlots * int64(spec.VCPUs) //nolint:gosec + if memBytes > maxMemBytes { + maxMemBytes = memBytes + } + if cpuCores > maxCPUCores { + maxCPUCores = cpuCores + } + } + + // Only update TotalCapacity when all probes succeeded (allFresh=true). + // This preserves stale values across transient probe failures and ensures + // the CR controller can distinguish "not yet probed" (key absent) from + // "probed but zero capacity" (key present, value=0). + var totalCapacity map[string]resource.Quantity + if allFresh { + totalCapacity = map[string]resource.Quantity{ + string(v1alpha1.CommittedResourceTypeMemory): *resource.NewQuantity(maxMemBytes, resource.BinarySI), + string(v1alpha1.CommittedResourceTypeCores): *resource.NewQuantity(maxCPUCores, resource.DecimalSI), + } + } else { + totalCapacity = existing.Status.TotalCapacity + } + patch := client.MergeFrom(existing.DeepCopy()) existing.Status.Flavors = newFlavors existing.Status.TotalInstances = totalInstances existing.Status.CommittedCapacity = committedCapacity + existing.Status.TotalCapacity = totalCapacity existing.Status.LastReconcileAt = metav1.Now() freshCondition := metav1.Condition{ diff --git a/internal/scheduling/reservations/commitments/api/change_commitments.go b/internal/scheduling/reservations/commitments/api/change_commitments.go index fd821799b..25ccbff64 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments.go @@ -118,6 +118,13 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque return } + if req.AZ == "" { + statusCode = http.StatusBadRequest + http.Error(w, "availability zone is required", statusCode) + api.recordMetrics(req, resp, statusCode, startTime) + return + } + if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil { if strings.Contains(err.Error(), "version mismatch") { statusCode = http.StatusConflict @@ -183,22 +190,29 @@ ProcessLoop: for _, resourceName := range sortedKeys(projectChanges.ByResource) { resourceChanges := projectChanges.ByResource[resourceName] - flavorGroupName, err := commitments.GetFlavorGroupNameFromResource(string(resourceName)) + flavorGroupName, resourceType, err := commitments.GetFlavorGroupAndTypeFromResource(string(resourceName)) if err != nil { failedReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err) rollback = true break ProcessLoop } - flavorGroup, ok := flavorGroups[flavorGroupName] - if !ok { + if _, ok := flavorGroups[flavorGroupName]; !ok { failedReason = "flavor group not found: " + flavorGroupName rollback = true break ProcessLoop } - 
if !api.config.ResourceConfigForGroup(flavorGroupName).RAM.HandlesCommitments { - failedReason = fmt.Sprintf("flavor group %q is not configured to handle commitments", flavorGroupName) + groupResourceConf := api.config.ResourceConfigForGroup(flavorGroupName) + var handlesCommitments bool + switch resourceType { + case v1alpha1.CommittedResourceTypeCores: + handlesCommitments = groupResourceConf.Cores.HandlesCommitments + default: + handlesCommitments = groupResourceConf.RAM.HandlesCommitments + } + if !handlesCommitments { + failedReason = fmt.Sprintf("flavor group %q is not configured to handle %s commitments", flavorGroupName, resourceType) rollback = true break ProcessLoop } @@ -249,25 +263,30 @@ ProcessLoop: } stateDesired, err := commitments.FromChangeCommitmentTargetState( - commitment, string(projectID), domainID, flavorGroupName, flavorGroup, string(req.AZ)) + commitment, string(projectID), domainID, flavorGroupName, resourceType, string(req.AZ)) if err != nil { failedReason = fmt.Sprintf("commitment %s: %s", commitment.UUID, err) rollback = true break ProcessLoop } - cr := &v1alpha1.CommittedResource{} - cr.Name = crName - if _, err := controllerutil.CreateOrUpdate(ctx, api.client, cr, func() error { - if cr.Spec.AvailabilityZone != "" && cr.Spec.AvailabilityZone != stateDesired.AvailabilityZone { - return fmt.Errorf("cannot change availability zone of commitment %s: current=%q requested=%q", - commitment.UUID, cr.Spec.AvailabilityZone, stateDesired.AvailabilityZone) - } - applyCRSpec(cr, stateDesired, allowRejection) - if cr.Annotations == nil { - cr.Annotations = make(map[string]string) + // RetryOnConflict handles the race where the CommittedResource controller reconciles + // the CRD (bumping resourceVersion) between the Get and Update inside CreateOrUpdate. 
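+ // A minimal sketch of the interleaving this guards against (hypothetical timing):
+ //   1. CreateOrUpdate gets the CR at resourceVersion=5
+ //   2. the CommittedResource controller patches status, bumping it to resourceVersion=6
+ //   3. CreateOrUpdate's Update sends resourceVersion=5 and receives 409 Conflict
+ // RetryOnConflict then re-runs the closure, whose Get re-fetches the fresh object.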
+ var crGeneration int64 + if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + cr := &v1alpha1.CommittedResource{} + cr.Name = crName + if _, err := controllerutil.CreateOrUpdate(ctx, api.client, cr, func() error { + applyCRSpec(cr, stateDesired, allowRejection) + if cr.Annotations == nil { + cr.Annotations = make(map[string]string) + } + cr.Annotations[v1alpha1.AnnotationCreatorRequestID] = reservations.GlobalRequestIDFromContext(ctx) + return nil + }); err != nil { + return err } - cr.Annotations[v1alpha1.AnnotationCreatorRequestID] = reservations.GlobalRequestIDFromContext(ctx) + crGeneration = cr.Generation return nil }); err != nil { failedReason = fmt.Sprintf("commitment %s: failed to write CommittedResource CRD: %v", commitment.UUID, err) @@ -275,7 +294,7 @@ ProcessLoop: break ProcessLoop } - toWatch = append(toWatch, crWatch{name: crName, generation: cr.Generation}) + toWatch = append(toWatch, crWatch{name: crName, generation: crGeneration}) snapshots = append(snapshots, snap) logger.V(1).Info("upserted CommittedResource CRD", "name", crName) } @@ -491,8 +510,13 @@ func applyCRSpec(cr *v1alpha1.CommittedResource, state *commitments.CommitmentSt cr.Spec.CommitmentUUID = state.CommitmentUUID cr.Spec.SchedulingDomain = v1alpha1.SchedulingDomainNova cr.Spec.FlavorGroupName = state.FlavorGroupName - cr.Spec.ResourceType = v1alpha1.CommittedResourceTypeMemory - cr.Spec.Amount = *resource.NewQuantity(state.TotalMemoryBytes, resource.BinarySI) + cr.Spec.ResourceType = state.ResourceType + switch state.ResourceType { + case v1alpha1.CommittedResourceTypeCores: + cr.Spec.Amount = *resource.NewQuantity(state.TotalCores, resource.DecimalSI) + default: + cr.Spec.Amount = *resource.NewQuantity(state.TotalMemoryBytes, resource.BinarySI) + } cr.Spec.AvailabilityZone = state.AvailabilityZone cr.Spec.ProjectID = state.ProjectID cr.Spec.DomainID = state.DomainID diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_test.go index 999ddd240..c843751ec 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_test.go @@ -22,9 +22,9 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" - . "github.com/majewsky/gg/option" "github.com/prometheus/client_golang/prometheus" "github.com/sapcc/go-api-declarations/liquid" + . "go.xyrillian.de/gg/option" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" @@ -130,22 +130,13 @@ func TestHandleChangeCommitments(t *testing.T) { ExpectedAPIResponse: newAPIResponse("uuid-b: not sufficient capacity"), ExpectedDeletedCRs: []string{"commitment-uuid-a", "commitment-uuid-b"}, }, - // --- AZ immutability --- + // --- AZ validation --- { - // AZ is immutable once set on a CommittedResource. Attempting to change it via - // change-commitments must be rejected immediately, before any polling or controller - // interaction, and the CR must remain at its original spec. 
- Name: "AZ change on existing CR: must be rejected", + Name: "Empty AZ: rejected with 400", Flavors: []*TestFlavor{m1Small}, - ExistingCRs: []*TestCR{ - {CommitmentUUID: "uuid-az-stale", State: v1alpha1.CommitmentStatusConfirmed, - AmountMiB: 1024, ProjectID: "project-A", AZ: "az-old", ReadyCondition: true}, - }, - CommitmentRequest: newCommitmentRequest("az-new", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-az-stale", "confirmed", 2)), - ExpectedAPIResponse: newAPIResponse("cannot change availability zone"), - // CR spec must not have changed. - ExpectedCRSpecs: map[string]int64{"commitment-uuid-az-stale": 1024 * 1024 * 1024}, + CommitmentRequest: newCommitmentRequest("", false, 1234, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-noaz", "confirmed", 2)), + ExpectedAPIResponse: APIResponseExpectation{StatusCode: http.StatusBadRequest}, }, // --- Timeout --- { diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go index c9576008f..d8459c50b 100644 --- a/internal/scheduling/reservations/commitments/api/info.go +++ b/internal/scheduling/reservations/commitments/api/info.go @@ -6,7 +6,6 @@ package api import ( "context" "encoding/json" - "errors" "fmt" "net/http" "strconv" @@ -20,9 +19,6 @@ import ( liquid "github.com/sapcc/go-api-declarations/liquid" ) -// errInternalServiceInfo indicates an internal error while building service info (e.g., invalid unit configuration) -var errInternalServiceInfo = errors.New("internal error building service info") - // handles GET /commitments/v1/info requests from Limes: // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid @@ -54,16 +50,9 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { // Build info response info, err := api.buildServiceInfo(ctx, logger) if err != nil { - if errors.Is(err, errInternalServiceInfo) { - logger.Error(err, "internal error building service info") - statusCode = http.StatusInternalServerError - http.Error(w, "Internal server error: "+err.Error(), statusCode) - } else { - // Use Info level for expected conditions like knowledge not being ready yet - logger.Info("service info not available yet", "error", err.Error()) - statusCode = http.StatusServiceUnavailable - http.Error(w, "Service temporarily unavailable: "+err.Error(), statusCode) - } + logger.Info("service info not available yet", "error", err.Error()) + statusCode = http.StatusServiceUnavailable + http.Error(w, "Service temporarily unavailable: "+err.Error(), statusCode) api.recordInfoMetrics(statusCode, startTime) return } @@ -133,20 +122,8 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l attrsJSON = nil } - // Validate memory is positive to avoid panic in MultiplyBy (which panics on factor=0) - if groupData.SmallestFlavor.MemoryMB == 0 { - return liquid.ServiceInfo{}, fmt.Errorf("%w: flavor group %q has invalid smallest flavor with memoryMB=0", - errInternalServiceInfo, groupName) - } - // === 1. 
RAM Resource === ramResourceName := liquid.ResourceName(commitments.ResourceNameRAM(groupName)) - ramUnit, err := liquid.UnitMebibytes.MultiplyBy(groupData.SmallestFlavor.MemoryMB) - if err != nil { - // Note: This error only occurs on uint64 overflow, which is unrealistic for memory values - return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w", - errInternalServiceInfo, groupName, err) - } // Determine topology: AZSeparatedTopology only for groups that accept commitments // (AZSeparatedTopology means quota is also AZ-aware, required when HasQuota=true) ramTopology := liquid.AZAwareTopology @@ -155,11 +132,10 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l } resources[ramResourceName] = liquid.ResourceInfo{ DisplayName: fmt.Sprintf( - "multiples of %d MiB (usable by: %s)", - groupData.SmallestFlavor.MemoryMB, + "GiB of RAM (usable by: %s)", flavorListStr, ), - Unit: ramUnit, + Unit: liquid.UnitGibibytes, Topology: ramTopology, NeedsResourceDemand: false, HasCapacity: resCfg.RAM.HasCapacity, @@ -205,8 +181,6 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l "ramResource", ramResourceName, "coresResource", coresResourceName, "instancesResource", instancesResourceName, - "smallestFlavor", groupData.SmallestFlavor.Name, - "smallestRamMB", groupData.SmallestFlavor.MemoryMB, "ramCoreRatio", groupData.RamCoreRatio) } diff --git a/internal/scheduling/reservations/commitments/api/info_test.go b/internal/scheduling/reservations/commitments/api/info_test.go index 514ebc752..e74964925 100644 --- a/internal/scheduling/reservations/commitments/api/info_test.go +++ b/internal/scheduling/reservations/commitments/api/info_test.go @@ -78,16 +78,15 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) { } func TestHandleInfo_InvalidFlavorMemory(t *testing.T) { - // Test that a 500 Internal Server Error is returned when a flavor group has invalid data. - // - // A flavor with memoryMB=0 is invalid and should trigger an HTTP 500 error. - // Such data could occur from a bug in the flavor groups extractor. + // Test that the info endpoint succeeds even when a flavor group has memoryMB=0. + // With the fixed GiB unit, we no longer reject zero-memory flavors at the info level; + // they result in zero capacity at the capacity reporting level instead. 
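+ // For intuition (mirroring the capacity controller's math, not asserted here):
+ // memBytes = TotalCapacityVMSlots * MemoryMB * 1024 * 1024, so MemoryMB=0 makes
+ // every product zero and the group reports zero capacity rather than erroring.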
scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("failed to add scheme: %v", err) } - // Create flavor group with memoryMB=0 (invalid data that could come from a buggy extractor) + // Create flavor group with memoryMB=0 (edge case from a buggy extractor) features := []map[string]interface{}{ { "name": "invalid_group", @@ -132,9 +131,9 @@ func TestHandleInfo_InvalidFlavorMemory(t *testing.T) { resp := w.Result() defer resp.Body.Close() - // Should return 500 Internal Server Error when unit creation fails - if resp.StatusCode != http.StatusInternalServerError { - t.Errorf("expected status code %d (Internal Server Error), got %d", http.StatusInternalServerError, resp.StatusCode) + // Should return 200 OK — zero-memory flavor no longer causes an error + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) } } diff --git a/internal/scheduling/reservations/commitments/api/quota.go b/internal/scheduling/reservations/commitments/api/quota.go index 9c34e879c..4d6109a7b 100644 --- a/internal/scheduling/reservations/commitments/api/quota.go +++ b/internal/scheduling/reservations/commitments/api/quota.go @@ -16,21 +16,26 @@ import ( "github.com/sapcc/go-api-declarations/liquid" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" ) -// projectQuotaCRDName returns the CRD object name for a given project UUID. -// Convention: "quota-" -func projectQuotaCRDName(projectID string) string { - return "quota-" + projectID +// idxProjectQuotaByProjectID is the field index key used to look up ProjectQuota CRDs by project ID. +// Must match the index registered in field_index.go. +const idxProjectQuotaByProjectID = "spec.projectID" + +// projectQuotaCRDName returns the CRD object name for a given project UUID and AZ. +// Convention: "quota--" +func projectQuotaCRDName(projectID, az string) string { + return "quota-" + projectID + "-" + az } // HandleQuota implements PUT /commitments/v1/projects/:project_id/quota from Limes LIQUID API. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // // This endpoint receives quota requests from Limes and persists them as ProjectQuota CRDs. -// One CRD per project, named "quota-". +// One CRD per project per availability zone, named "quota--". func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { startTime := time.Now() @@ -89,88 +94,108 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { return } - // Build the spec quota map from the liquid request. + // Build per-AZ quota maps from the liquid request. // liquid API uses uint64; our CRD uses int64 (K8s convention). // Guard against overflow: uint64 values > MaxInt64 would wrap to negative. 
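// For illustration: int64(uint64(math.MaxInt64) + 1) wraps to math.MinInt64
// (-9223372036854775808), which is exactly the corruption this guard rejects up front.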
- specQuota := make(map[string]v1alpha1.ResourceQuota, len(req.Resources)) + // quotaByAZ[az][resourceName] = quota value for that AZ + quotaByAZ := make(map[string]map[string]int64) for resourceName, resQuota := range req.Resources { - if resQuota.Quota > math.MaxInt64 { - api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q exceeds int64 max", resourceName), startTime) - return - } - rq := v1alpha1.ResourceQuota{ - Quota: int64(resQuota.Quota), - } - if len(resQuota.PerAZ) > 0 { - rq.PerAZ = make(map[string]int64, len(resQuota.PerAZ)) - for az, azQuota := range resQuota.PerAZ { - if azQuota.Quota > math.MaxInt64 { - api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q in AZ %q exceeds int64 max", resourceName, az), startTime) - return - } - rq.PerAZ[string(az)] = int64(azQuota.Quota) + for az, azQuota := range resQuota.PerAZ { + if azQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q in AZ %q exceeds int64 max", resourceName, az), startTime) + return + } + azStr := string(az) + if quotaByAZ[azStr] == nil { + quotaByAZ[azStr] = make(map[string]int64) } + quotaByAZ[azStr][string(resourceName)] = int64(azQuota.Quota) } - specQuota[string(resourceName)] = rq } - // Create or update ProjectQuota CRD with retry-on-conflict to handle - // concurrent status updates from the quota controller. - crdName := projectQuotaCRDName(projectID) ctx := r.Context() - err = retry.RetryOnConflict(retry.DefaultRetry, func() error { - var existing v1alpha1.ProjectQuota - getErr := api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing) - if getErr != nil { - if !apierrors.IsNotFound(getErr) { - return getErr - } - // Not found -- create new - pq := &v1alpha1.ProjectQuota{ - ObjectMeta: metav1.ObjectMeta{ - Name: crdName, - }, - Spec: v1alpha1.ProjectQuotaSpec{ - ProjectID: projectID, - ProjectName: projectName, - DomainID: domainID, - DomainName: domainName, - Quota: specQuota, - }, - } - if createErr := api.client.Create(ctx, pq); createErr != nil { - // If another request just created it, retry will fetch and update - if apierrors.IsAlreadyExists(createErr) { + // Create or update one ProjectQuota CRD per AZ with retry-on-conflict to handle + // concurrent status updates from the quota controller. + activeAZs := make(map[string]bool, len(quotaByAZ)) + for az, azQuota := range quotaByAZ { + activeAZs[az] = true + crdName := projectQuotaCRDName(projectID, az) + + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + var existing v1alpha1.ProjectQuota + getErr := api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing) + if getErr != nil { + if !apierrors.IsNotFound(getErr) { + return getErr + } + // Not found -- create new + pq := &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + }, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: projectID, + ProjectName: projectName, + DomainID: domainID, + DomainName: domainName, + AvailabilityZone: az, + Quota: azQuota, + }, + } + if createErr := api.client.Create(ctx, pq); createErr != nil { + // If another request just created it, surface as a conflict so + // RetryOnConflict re-runs the closure and falls into the update branch. 
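+ // Hypothetical race being handled here: two concurrent quota pushes for the same
+ // project/AZ both miss on Get and both call Create; the loser sees AlreadyExists
+ // and must re-enter the closure as an update instead of failing the request.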
+ if apierrors.IsAlreadyExists(createErr) { + return apierrors.NewConflict( + schema.GroupResource{Group: "cortex.cloud", Resource: "projectquotas"}, + crdName, createErr, + ) + } return createErr } - return createErr + log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "az", az, "resources", len(azQuota)) + return nil + } + + // Update existing (re-fetched on each retry to get fresh resourceVersion) + existing.Spec.Quota = azQuota + if projectName != "" { + existing.Spec.ProjectName = projectName + } + if domainID != "" { + existing.Spec.DomainID = domainID + } + if domainName != "" { + existing.Spec.DomainName = domainName + } + if updateErr := api.client.Update(ctx, &existing); updateErr != nil { + return updateErr } - log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + log.V(1).Info("updated ProjectQuota", "name", crdName, "projectID", projectID, "az", az, "resources", len(azQuota)) return nil + }) + if err != nil { + log.Error(err, "failed to create/update ProjectQuota", "name", crdName, "az", az) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to persist quota for AZ %s: %v", az, err), startTime) + return } + } - // Update existing (re-fetched on each retry to get fresh resourceVersion) - existing.Spec.Quota = specQuota - if projectName != "" { - existing.Spec.ProjectName = projectName - } - if domainID != "" { - existing.Spec.DomainID = domainID - } - if domainName != "" { - existing.Spec.DomainName = domainName - } - if updateErr := api.client.Update(ctx, &existing); updateErr != nil { - return updateErr + // Delete orphan CRDs for AZs no longer present in the quota push. + var pqList v1alpha1.ProjectQuotaList + if err := api.client.List(ctx, &pqList, client.MatchingFields{idxProjectQuotaByProjectID: projectID}); err == nil { + for i := range pqList.Items { + pq := &pqList.Items[i] + if !activeAZs[pq.Spec.AvailabilityZone] { + if delErr := api.client.Delete(ctx, pq); delErr != nil && !apierrors.IsNotFound(delErr) { + log.Error(delErr, "failed to delete orphan ProjectQuota", "name", pq.Name, "az", pq.Spec.AvailabilityZone) + // Non-fatal: orphan will be cleaned up on next push + } else { + log.V(1).Info("deleted orphan ProjectQuota", "name", pq.Name, "az", pq.Spec.AvailabilityZone) + } + } } - log.V(1).Info("updated ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) - return nil - }) - if err != nil { - log.Error(err, "failed to create/update ProjectQuota", "name", crdName) - api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to persist quota: %v", err), startTime) - return } // Return 204 No Content as expected by the LIQUID API diff --git a/internal/scheduling/reservations/commitments/api/quota_test.go b/internal/scheduling/reservations/commitments/api/quota_test.go index 8632d2af4..11ec744a4 100644 --- a/internal/scheduling/reservations/commitments/api/quota_test.go +++ b/internal/scheduling/reservations/commitments/api/quota_test.go @@ -13,8 +13,9 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" - "github.com/majewsky/gg/option" "github.com/sapcc/go-api-declarations/liquid" + "go.xyrillian.de/gg/option" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -148,15 +149,14 @@ func 
TestHandleQuota_ErrorCases(t *testing.T) { func TestHandleQuota_CreateAndUpdate(t *testing.T) { tests := []struct { name string - // existing is a pre-existing CRD to seed (nil = create, non-nil = update) - existing *v1alpha1.ProjectQuota + // existing is a set of pre-existing per-AZ CRDs to seed (nil = create, non-nil = update) + existing []*v1alpha1.ProjectQuota projectID string resources map[liquid.ResourceName]liquid.ResourceQuotaRequest metadata *liquid.ProjectMetadata - expectQuota map[string]int64 // resource name → expected total quota - expectPerAZ map[string]map[string]int64 // resource name → az → expected quota + expectPerAZ map[string]map[string]int64 // az → resource name → expected quota expectName string - expectDomain string + expectDom string expectDomName string }{ { @@ -164,7 +164,6 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { projectID: "project-abc-123", resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ "hw_version_hana_1_ram": { - Quota: 100, PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ "az-a": {Quota: 60}, "az-b": {Quota: 40}, @@ -175,28 +174,21 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { UUID: "project-abc-123", Domain: liquid.DomainMetadata{UUID: "domain-1"}, }, - expectQuota: map[string]int64{"hw_version_hana_1_ram": 100}, expectPerAZ: map[string]map[string]int64{ - "hw_version_hana_1_ram": {"az-a": 60, "az-b": 40}, + "az-a": {"hw_version_hana_1_ram": 60}, + "az-b": {"hw_version_hana_1_ram": 40}, }, - expectDomain: "domain-1", - }, - { - name: "Create_EmptyResources", - projectID: "project-empty", - resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, - metadata: &liquid.ProjectMetadata{ - UUID: "project-empty", - Domain: liquid.DomainMetadata{UUID: "domain-1"}, - }, - expectQuota: map[string]int64{}, - expectDomain: "domain-1", + expectDom: "domain-1", }, { name: "Create_WithMetadata", projectID: "project-meta-test", resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ - "hw_version_hana_1_ram": {Quota: 50}, + "hw_version_hana_1_ram": { + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 50}, + }, + }, }, metadata: &liquid.ProjectMetadata{ UUID: "project-meta-test", @@ -206,80 +198,123 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { Name: "my-domain-name", }, }, - expectQuota: map[string]int64{"hw_version_hana_1_ram": 50}, + expectPerAZ: map[string]map[string]int64{ + "az-a": {"hw_version_hana_1_ram": 50}, + }, expectName: "my-project-name", - expectDomain: "domain-uuid-456", + expectDom: "domain-uuid-456", expectDomName: "my-domain-name", }, { - name: "Update_QuotaValues", - existing: &v1alpha1.ProjectQuota{ - Spec: v1alpha1.ProjectQuotaSpec{ - ProjectID: "project-xyz", - DomainID: "original-domain", - DomainName: "original-domain-name", - ProjectName: "original-project-name", - Quota: map[string]v1alpha1.ResourceQuota{ - "hw_version_hana_1_ram": {Quota: 50, PerAZ: map[string]int64{"az-a": 50}}, + name: "Create_EmptyResources", + projectID: "project-empty", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, + metadata: &liquid.ProjectMetadata{ + UUID: "project-empty", + Domain: liquid.DomainMetadata{UUID: "domain-1"}, + }, + // No AZs in request means no per-AZ CRDs are created. + // expectPerAZ is empty — we just verify no error and 204 response. 
+ expectPerAZ: map[string]map[string]int64{}, + expectDom: "domain-1", + }, + { + name: "Update_WithNewMetadata", + existing: []*v1alpha1.ProjectQuota{ + { + ObjectMeta: metav1.ObjectMeta{Name: "quota-project-update-meta-az-a"}, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-update-meta", + DomainID: "old-domain", + DomainName: "old-domain-name", + ProjectName: "old-project-name", + AvailabilityZone: "az-a", + Quota: map[string]int64{"hw_version_hana_1_ram": 10}, + }, + }, + }, + projectID: "project-update-meta", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 99}, }, }, }, - projectID: "project-xyz", metadata: &liquid.ProjectMetadata{ - UUID: "project-xyz", - Name: "original-project-name", + UUID: "project-update-meta", + Name: "new-project-name", Domain: liquid.DomainMetadata{ - UUID: "original-domain", - Name: "original-domain-name", + UUID: "new-domain", + Name: "new-domain-name", }, }, + expectPerAZ: map[string]map[string]int64{ + "az-a": {"hw_version_hana_1_ram": 99}, + }, + expectName: "new-project-name", + expectDom: "new-domain", + expectDomName: "new-domain-name", + }, + { + name: "Create_PartialAZ_OnlyOneAZ", + projectID: "project-partial", resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ "hw_version_hana_1_ram": { - Quota: 200, PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ - "az-a": {Quota: 120}, - "az-b": {Quota: 80}, + "az-a": {Quota: 100}, + // az-b intentionally missing }, }, }, - expectQuota: map[string]int64{"hw_version_hana_1_ram": 200}, + metadata: &liquid.ProjectMetadata{ + UUID: "project-partial", + Domain: liquid.DomainMetadata{UUID: "domain-1"}, + }, + // Only az-a should get a CRD expectPerAZ: map[string]map[string]int64{ - "hw_version_hana_1_ram": {"az-a": 120, "az-b": 80}, + "az-a": {"hw_version_hana_1_ram": 100}, }, - // Metadata should be preserved when not provided in update - expectDomain: "original-domain", - expectDomName: "original-domain-name", - expectName: "original-project-name", + expectDom: "domain-1", }, { - name: "Update_WithNewMetadata", - existing: &v1alpha1.ProjectQuota{ - Spec: v1alpha1.ProjectQuotaSpec{ - ProjectID: "project-update-meta", - DomainID: "old-domain", - DomainName: "old-domain-name", - ProjectName: "old-project-name", - Quota: map[string]v1alpha1.ResourceQuota{ - "hw_version_hana_1_ram": {Quota: 10}, + name: "Update_QuotaValues", + existing: []*v1alpha1.ProjectQuota{ + { + ObjectMeta: metav1.ObjectMeta{Name: "quota-project-xyz-az-a"}, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-xyz", + DomainID: "original-domain", + DomainName: "original-domain-name", + ProjectName: "original-project-name", + AvailabilityZone: "az-a", + Quota: map[string]int64{"hw_version_hana_1_ram": 50}, }, }, }, - projectID: "project-update-meta", - resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ - "hw_version_hana_1_ram": {Quota: 99}, - }, + projectID: "project-xyz", metadata: &liquid.ProjectMetadata{ - UUID: "project-update-meta", - Name: "new-project-name", + UUID: "project-xyz", + Name: "original-project-name", Domain: liquid.DomainMetadata{ - UUID: "new-domain", - Name: "new-domain-name", + UUID: "original-domain", + Name: "original-domain-name", }, }, - expectQuota: map[string]int64{"hw_version_hana_1_ram": 99}, - expectName: "new-project-name", - expectDomain: "new-domain", - expectDomName: "new-domain-name", + resources: 
map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 120}, + "az-b": {Quota: 80}, + }, + }, + }, + expectPerAZ: map[string]map[string]int64{ + "az-a": {"hw_version_hana_1_ram": 120}, + "az-b": {"hw_version_hana_1_ram": 80}, + }, + expectDom: "original-domain", + expectName: "original-project-name", }, } @@ -289,8 +324,11 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { builder := fake.NewClientBuilder().WithScheme(scheme) if tc.existing != nil { - tc.existing.Name = projectQuotaCRDName(tc.projectID) - builder = builder.WithObjects(tc.existing) + objs := make([]client.Object, len(tc.existing)) + for i := range tc.existing { + objs[i] = tc.existing[i] + } + builder = builder.WithObjects(objs...) } k8sClient := builder.Build() httpAPI := NewAPI(k8sClient) @@ -316,52 +354,43 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { t.Fatalf("expected status %d (No Content), got %d", http.StatusNoContent, resp.StatusCode) } - // Verify the ProjectQuota CRD - var pq v1alpha1.ProjectQuota - crdName := projectQuotaCRDName(tc.projectID) - if err := k8sClient.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { - t.Fatalf("failed to get ProjectQuota CRD %q: %v", crdName, err) - } - - if pq.Spec.ProjectID != tc.projectID { - t.Errorf("expected ProjectID %q, got %q", tc.projectID, pq.Spec.ProjectID) - } + // Verify per-AZ ProjectQuota CRDs were created/updated + for az, expectedQuota := range tc.expectPerAZ { + crdName := projectQuotaCRDName(tc.projectID, az) + var pq v1alpha1.ProjectQuota + if err := k8sClient.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + t.Fatalf("failed to get ProjectQuota CRD %q: %v", crdName, err) + } - // Verify quota totals - for resName, expectedTotal := range tc.expectQuota { - actual, ok := pq.Spec.Quota[resName] - if !ok { - t.Errorf("expected resource %q in quota spec", resName) - continue + if pq.Spec.ProjectID != tc.projectID { + t.Errorf("CRD %q: expected ProjectID %q, got %q", crdName, tc.projectID, pq.Spec.ProjectID) } - if actual.Quota != expectedTotal { - t.Errorf("resource %q: expected quota %d, got %d", resName, expectedTotal, actual.Quota) + if pq.Spec.AvailabilityZone != az { + t.Errorf("CRD %q: expected AZ %q, got %q", crdName, az, pq.Spec.AvailabilityZone) } - } - // Verify per-AZ quotas - for resName, azMap := range tc.expectPerAZ { - actual, ok := pq.Spec.Quota[resName] - if !ok { - t.Errorf("expected resource %q in quota spec for per-AZ check", resName) - continue - } - for az, expectedAZ := range azMap { - if actual.PerAZ[az] != expectedAZ { - t.Errorf("resource %q AZ %q: expected %d, got %d", resName, az, expectedAZ, actual.PerAZ[az]) + // Verify quota values + for resName, expectedVal := range expectedQuota { + actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("CRD %q: expected resource %q in quota spec", crdName, resName) + continue + } + if actual != expectedVal { + t.Errorf("CRD %q resource %q: expected %d, got %d", crdName, resName, expectedVal, actual) } } - } - // Verify metadata - if tc.expectName != "" && pq.Spec.ProjectName != tc.expectName { - t.Errorf("expected ProjectName %q, got %q", tc.expectName, pq.Spec.ProjectName) - } - if tc.expectDomain != "" && pq.Spec.DomainID != tc.expectDomain { - t.Errorf("expected DomainID %q, got %q", tc.expectDomain, pq.Spec.DomainID) - } - if tc.expectDomName != "" && pq.Spec.DomainName != tc.expectDomName { - 
t.Errorf("expected DomainName %q, got %q", tc.expectDomName, pq.Spec.DomainName) + // Verify metadata + if tc.expectName != "" && pq.Spec.ProjectName != tc.expectName { + t.Errorf("CRD %q: expected ProjectName %q, got %q", crdName, tc.expectName, pq.Spec.ProjectName) + } + if tc.expectDom != "" && pq.Spec.DomainID != tc.expectDom { + t.Errorf("CRD %q: expected DomainID %q, got %q", crdName, tc.expectDom, pq.Spec.DomainID) + } + if tc.expectDomName != "" && pq.Spec.DomainName != tc.expectDomName { + t.Errorf("CRD %q: expected DomainName %q, got %q", crdName, tc.expectDomName, pq.Spec.DomainName) + } } }) } diff --git a/internal/scheduling/reservations/commitments/api/report_capacity.go b/internal/scheduling/reservations/commitments/api/report_capacity.go index 9f0966cce..ec537607a 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity.go @@ -59,7 +59,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) } // Calculate capacity - calculator := commitments.NewCapacityCalculator(api.client) + calculator := commitments.NewCapacityCalculator(api.client, api.config) report, err := calculator.CalculateCapacity(ctx, req) if err != nil { logger.Error(err, "failed to calculate capacity") diff --git a/internal/scheduling/reservations/commitments/api/report_capacity_test.go b/internal/scheduling/reservations/commitments/api/report_capacity_test.go index 9972c6681..638c060db 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity_test.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity_test.go @@ -29,6 +29,18 @@ func TestHandleReportCapacity(t *testing.T) { t.Fatal(err) } + // testCapacityConfig enables capacity reporting for all groups via "*" catch-all. + testCapacityConfig := commitments.APIConfig{ + EnableReportCapacity: true, + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "*": { + RAM: commitments.ResourceTypeConfig{HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + }, + }, + } + // Create empty flavor groups knowledge so capacity calculation doesn't fail emptyKnowledge := createEmptyFlavorGroupKnowledge() @@ -37,7 +49,7 @@ func TestHandleReportCapacity(t *testing.T) { WithObjects(emptyKnowledge). Build() - api := NewAPI(fakeClient) + api := NewAPIWithConfig(fakeClient, testCapacityConfig, nil) tests := []struct { name string @@ -131,12 +143,22 @@ func TestCapacityCalculator(t *testing.T) { t.Fatal(err) } + testCapacityConfig := commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "*": { + RAM: commitments.ResourceTypeConfig{HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + }, + }, + } + t.Run("CalculateCapacity returns error when no flavor groups knowledge exists", func(t *testing.T) { fakeClient := fake.NewClientBuilder(). WithScheme(scheme). Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{ AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}, } @@ -158,7 +180,7 @@ func TestCapacityCalculator(t *testing.T) { WithObjects(emptyKnowledge). 
Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{ AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}, } @@ -183,7 +205,7 @@ func TestCapacityCalculator(t *testing.T) { WithObjects(flavorGroupKnowledge). Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{ AllAZs: []liquid.AvailabilityZone{"qa-de-1a", "qa-de-1b", "qa-de-1d"}, } @@ -209,7 +231,7 @@ func TestCapacityCalculator(t *testing.T) { WithObjects(flavorGroupKnowledge). Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{}} report, err := calculator.CalculateCapacity(context.Background(), req) if err != nil { @@ -234,7 +256,7 @@ func TestCapacityCalculator(t *testing.T) { WithObjects(flavorGroupKnowledge). Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req1 := liquid.ServiceCapacityRequest{ AllAZs: []liquid.AvailabilityZone{"eu-de-1a", "eu-de-1b"}, @@ -270,7 +292,7 @@ func TestCapacityCalculator(t *testing.T) { WithStatusSubresource(crd). Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}} report, err := calculator.CalculateCapacity(context.Background(), req) if err != nil { @@ -285,15 +307,15 @@ func TestCapacityCalculator(t *testing.T) { if azReport == nil { t.Fatal("expected az-one entry") } - if azReport.Capacity != 1000 { - t.Errorf("expected capacity=1000, got %d", azReport.Capacity) + if azReport.Capacity != 32000 { + t.Errorf("expected capacity=32000, got %d", azReport.Capacity) } if !azReport.Usage.IsSome() { t.Fatal("expected usage to be set for Ready CRD") } - // usage = capacity - placeable = 1000 - 800 = 200 - if usage := azReport.Usage.UnwrapOr(0); usage != 200 { - t.Errorf("expected usage=200 (1000-800), got %d", usage) + // usage = (capacity - placeable) * 32 GiB/slot = (1000 - 800) * 32 = 6400 + if usage := azReport.Usage.UnwrapOr(0); usage != 6400 { + t.Errorf("expected usage=6400 (200*32), got %d", usage) } }) @@ -307,7 +329,7 @@ func TestCapacityCalculator(t *testing.T) { WithStatusSubresource(crd). Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}} report, err := calculator.CalculateCapacity(context.Background(), req) if err != nil { @@ -336,7 +358,7 @@ func TestCapacityCalculator(t *testing.T) { WithStatusSubresource(crd). 
Build() - calculator := commitments.NewCapacityCalculator(fakeClient) + calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}} report, err := calculator.CalculateCapacity(context.Background(), req) if err != nil { @@ -351,9 +373,9 @@ func TestCapacityCalculator(t *testing.T) { if azReport == nil { t.Fatal("expected az-one entry") } - // Stale CRD: last-known capacity is still reported - if azReport.Capacity != 1000 { - t.Errorf("expected last-known capacity=1000 for stale CRD, got %d", azReport.Capacity) + // Stale CRD: last-known capacity is still reported (1000 slots * 32 GiB/slot) + if azReport.Capacity != 32000 { + t.Errorf("expected last-known capacity=32000 for stale CRD, got %d", azReport.Capacity) } // Stale CRD: usage must be absent (None) if azReport.Usage.IsSome() { @@ -463,20 +485,20 @@ func createTestFlavorGroupKnowledge(t *testing.T) *v1alpha1.Knowledge { { "name": "test_c8_m32", "vcpus": 8, - "memoryMB": 32768, + "memoryMB": 32752, "diskGB": 50, }, }, "largestFlavor": map[string]interface{}{ "name": "test_c8_m32", "vcpus": 8, - "memoryMB": 32768, + "memoryMB": 32752, "diskGB": 50, }, "smallestFlavor": map[string]interface{}{ "name": "test_c8_m32", "vcpus": 8, - "memoryMB": 32768, + "memoryMB": 32752, "diskGB": 50, }, // Fixed RAM/core ratio (4096 MiB per vCPU) - required for group to accept commitments diff --git a/internal/scheduling/reservations/commitments/api/report_usage.go b/internal/scheduling/reservations/commitments/api/report_usage.go index bf48dfe00..7d5b35b38 100644 --- a/internal/scheduling/reservations/commitments/api/report_usage.go +++ b/internal/scheduling/reservations/commitments/api/report_usage.go @@ -73,7 +73,7 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) { } // Use UsageCalculator to build usage report - calculator := commitments.NewUsageCalculator(api.client, api.usageDB) + calculator := commitments.NewUsageCalculator(api.client, api.usageDB, api.config) report, err := calculator.CalculateUsage(r.Context(), log, projectID, req.AllAZs) if err != nil { log.Error(err, "failed to calculate usage report", "projectID", projectID) diff --git a/internal/scheduling/reservations/commitments/api/report_usage_test.go b/internal/scheduling/reservations/commitments/api/report_usage_test.go index a867122ff..eb21f7b61 100644 --- a/internal/scheduling/reservations/commitments/api/report_usage_test.go +++ b/internal/scheduling/reservations/commitments/api/report_usage_test.go @@ -44,9 +44,9 @@ func TestReportUsageIntegration(t *testing.T) { m1Large := &TestFlavor{Name: "m1.large", Group: "hana_1", MemoryMB: 4096, VCPUs: 16} // 4 units m1XL := &TestFlavor{Name: "m1.xl", Group: "hana_1", MemoryMB: 8192, VCPUs: 32} // 8 units - // gp_1 group: smallest = 512 MB, so 1 unit = 0.5 GB - gpSmall := &TestFlavor{Name: "gp.small", Group: "gp_1", MemoryMB: 512, VCPUs: 1} // 1 unit - gpMedium := &TestFlavor{Name: "gp.medium", Group: "gp_1", MemoryMB: 2048, VCPUs: 4} // 4 units + // gp_1 group: smallest = 1024 MB = 1 GiB, so 1 unit = 1 GiB + gpSmall := &TestFlavor{Name: "gp.small", Group: "gp_1", MemoryMB: 1024, VCPUs: 1} // 1 unit + gpMedium := &TestFlavor{Name: "gp.medium", Group: "gp_1", MemoryMB: 2048, VCPUs: 4} // 2 units baseTime := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) @@ -290,7 +290,7 @@ func TestReportUsageIntegration(t *testing.T) { "hw_version_gp_1_ram": { PerAZ: map[string]ExpectedAZUsage{ "az-a": { - Usage: 4, // 2048 MB / 
512 MB = 4 units + Usage: 2, // 2048 MB / 1024 MB = 2 units VMs: []ExpectedVMUsage{ {UUID: "vm-gp", CommitmentID: "commit-gp", MemoryMB: 2048}, }, @@ -354,11 +354,11 @@ func TestReportUsageIntegration(t *testing.T) { ExpectedStatusCode: http.StatusMethodNotAllowed, }, { - Name: "VM with empty AZ - normalized to unknown", + Name: "VM with empty AZ - dropped from report", ProjectID: "project-empty-az", Flavors: []*TestFlavor{m1Small, m1Large}, VMs: []*TestVMUsage{ - // VM with empty AZ (e.g., ERROR or BUILDING state VM not yet scheduled) + // VM with empty AZ (e.g., ERROR or BUILDING state) — normalized to "unknown", excluded. newTestVMUsageWithEmptyAZ("vm-error", m1Large, "project-empty-az", "host-1", baseTime), // Normal VM with valid AZ newTestVMUsage("vm-ok", m1Large, "project-empty-az", "az-a", "host-2", baseTime.Add(1*time.Hour)), @@ -377,10 +377,106 @@ func TestReportUsageIntegration(t *testing.T) { {UUID: "vm-ok", CommitmentID: "commit-1", MemoryMB: 4096}, }, }, - "unknown": { - Usage: 4, // VM with empty AZ normalized to "unknown" + // "unknown" AZ is excluded — VMs without a valid AZ are dropped. + }, + }, + }, + }, + { + // hana_1 has a fixed RAM/core ratio (all flavors: 256 MiB/vCPU), so _ram + // is AZSeparatedTopology and carries per-AZ quota. This test verifies that + // the per-AZ quota value is read from the ProjectQuota CRD when present. + Name: "Fixed-ratio group with ProjectQuota CRD - quota reported per AZ", + ProjectID: "project-quota", + Flavors: []*TestFlavor{m1Small, m1Large}, + Config: &commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "hana_1": {RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasQuota: true}}, + }, + }, + VMs: []*TestVMUsage{ + newTestVMUsage("vm-001", m1Large, "project-quota", "az-a", "host-1", baseTime), + }, + Reservations: []*UsageTestReservation{ + {CommitmentID: "commit-1", Flavor: m1Small, ProjectID: "project-quota", AZ: "az-a", Count: 4}, + }, + ProjectQuota: &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-project-quota-az-a"}, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-quota", + DomainID: "test-domain", + AvailabilityZone: "az-a", + Quota: map[string]int64{ + "hw_version_hana_1_ram": 16, + }, + }, + }, + AllAZs: []string{"az-a"}, + Expected: map[string]ExpectedResourceUsage{ + "hw_version_hana_1_ram": { + PerAZ: map[string]ExpectedAZUsage{ + "az-a": { + Usage: 4, + Quota: func() *int64 { v := int64(16); return &v }(), VMs: []ExpectedVMUsage{ - {UUID: "vm-error", CommitmentID: "", MemoryMB: 4096}, // PAYG - no commitment in "unknown" AZ + {UUID: "vm-001", CommitmentID: "commit-1", MemoryMB: 4096}, + }, + }, + }, + }, + }, + }, + { + // When no ProjectQuota CRD exists for a project, quota defaults to -1 (infinite). 
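+ // The -1 is pinned by the expectation below via ExpectedAZUsage.Quota; treat
+ // "-1 means unlimited" as this suite's convention rather than a liquid guarantee.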
+ Name: "Fixed-ratio group with no ProjectQuota CRD - infinite quota", + ProjectID: "project-no-quota", + Flavors: []*TestFlavor{m1Small, m1Large}, + Config: &commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "hana_1": {RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasQuota: true}}, + }, + }, + VMs: []*TestVMUsage{ + newTestVMUsage("vm-001", m1Large, "project-no-quota", "az-a", "host-1", baseTime), + }, + Reservations: []*UsageTestReservation{ + {CommitmentID: "commit-1", Flavor: m1Small, ProjectID: "project-no-quota", AZ: "az-a", Count: 4}, + }, + AllAZs: []string{"az-a"}, + Expected: map[string]ExpectedResourceUsage{ + "hw_version_hana_1_ram": { + PerAZ: map[string]ExpectedAZUsage{ + "az-a": { + Usage: 4, + Quota: func() *int64 { v := int64(-1); return &v }(), + VMs: []ExpectedVMUsage{ + {UUID: "vm-001", CommitmentID: "commit-1", MemoryMB: 4096}, + }, + }, + }, + }, + }, + }, + { + // gp_1 has a variable RAM/core ratio (gpSmall=1024 MiB/vCPU, gpMedium=512 MiB/vCPU), + // so _ram is NOT AZSeparatedTopology and must carry no quota field. + Name: "Variable-ratio group - no quota field on _ram resource", + ProjectID: "project-variable", + Flavors: []*TestFlavor{gpSmall, gpMedium}, + VMs: []*TestVMUsage{ + newTestVMUsage("vm-001", gpMedium, "project-variable", "az-a", "host-1", baseTime), + }, + Reservations: []*UsageTestReservation{}, + AllAZs: []string{"az-a"}, + Expected: map[string]ExpectedResourceUsage{ + "hw_version_gp_1_ram": { + PerAZ: map[string]ExpectedAZUsage{ + // AssertNoQuota: quota field must be absent for variable-ratio groups. + "az-a": { + Usage: 2, + AssertNoQuota: true, + VMs: []ExpectedVMUsage{ + {UUID: "vm-001", CommitmentID: "", MemoryMB: 2048}, }, }, }, @@ -407,6 +503,8 @@ type UsageReportTestCase struct { Flavors []*TestFlavor VMs []*TestVMUsage Reservations []*UsageTestReservation + ProjectQuota *v1alpha1.ProjectQuota // optional; nil means no quota CRD present + Config *commitments.APIConfig // optional; nil means DefaultAPIConfig AllAZs []string Expected map[string]ExpectedResourceUsage ExpectedStatusCode int // 0 means expect 200 OK @@ -459,8 +557,10 @@ type ExpectedResourceUsage struct { } type ExpectedAZUsage struct { - Usage uint64 // Usage in multiples of smallest flavor - VMs []ExpectedVMUsage + Usage uint64 // Usage in multiples of smallest flavor + Quota *int64 // non-nil: verify this exact value (-1 = infinite) + AssertNoQuota bool // true: verify quota field is absent + VMs []ExpectedVMUsage } type ExpectedVMUsage struct { @@ -541,6 +641,8 @@ func newUsageTestEnv( vms []*TestVMUsage, reservations []*UsageTestReservation, flavorGroups FlavorGroupsKnowledge, + projectQuota *v1alpha1.ProjectQuota, + config *commitments.APIConfig, ) *UsageTestEnv { t.Helper() @@ -584,9 +686,14 @@ func newUsageTestEnv( } k8sReservations = append(k8sReservations, crObjects...) + + builderObjs := k8sReservations + if projectQuota != nil { + builderObjs = append(builderObjs, projectQuota) + } k8sClient := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(k8sReservations...). + WithObjects(builderObjs...). WithStatusSubresource(&v1alpha1.Reservation{}). WithStatusSubresource(&v1alpha1.Knowledge{}). WithStatusSubresource(&v1alpha1.CommittedResource{}). @@ -608,6 +715,13 @@ func newUsageTestEnv( } return []string{cr.Spec.ProjectID} }). 
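+ // Register the same "spec.projectID" field index the quota handler depends on
+ // (idxProjectQuotaByProjectID); the controller-runtime fake client can only
+ // serve List(..., client.MatchingFields{...}) for indexes registered here.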
+ WithIndex(&v1alpha1.ProjectQuota{}, "spec.projectID", func(obj client.Object) []string { + pq, ok := obj.(*v1alpha1.ProjectQuota) + if !ok || pq.Spec.ProjectID == "" { + return nil + } + return []string{pq.Spec.ProjectID} + }). Build() // Create mock DB client with VMs @@ -636,7 +750,11 @@ func newUsageTestEnv( } // Create API with mock DB client - api := NewAPIWithConfig(k8sClient, commitments.DefaultAPIConfig(), dbClient) + apiConfig := commitments.DefaultAPIConfig() + if config != nil && config.FlavorGroupResourceConfig != nil { + apiConfig.FlavorGroupResourceConfig = config.FlavorGroupResourceConfig + } + api := NewAPIWithConfig(k8sClient, apiConfig, dbClient) mux := http.NewServeMux() registry := prometheus.NewRegistry() api.Init(mux, registry, log.Log) @@ -726,7 +844,7 @@ func runUsageReportTest(t *testing.T, tc UsageReportTestCase) { }.ToFlavorGroupsKnowledge() // Create test environment - env := newUsageTestEnv(t, tc.VMs, tc.Reservations, flavorGroups) + env := newUsageTestEnv(t, tc.VMs, tc.Reservations, flavorGroups, tc.ProjectQuota, tc.Config) defer env.Close() // Call API @@ -785,6 +903,25 @@ func verifyUsageReport(t *testing.T, tc UsageReportTestCase, actual liquid.Servi resourceName, azName, expectedAZ.Usage, actualAZ.Usage) } + // Verify per-AZ quota when the test case specifies it. + if expectedAZ.Quota != nil { + actualQuota, hasQuota := actualAZ.Quota.Unpack() + if !hasQuota { + t.Errorf("Resource %s AZ %s: expected quota %d but quota field is absent", + resourceName, azName, *expectedAZ.Quota) + } else if actualQuota != *expectedAZ.Quota { + t.Errorf("Resource %s AZ %s: expected quota %d, got %d", + resourceName, azName, *expectedAZ.Quota, actualQuota) + } + } + if expectedAZ.AssertNoQuota { + if actualAZ.Quota.IsSome() { + v, _ := actualAZ.Quota.Unpack() + t.Errorf("Resource %s AZ %s: expected no quota field, got %d", + resourceName, azName, v) + } + } + // VM subresources are on the _instances resource, not _ram if actualInstancesResource == nil { t.Errorf("Instances resource %s not found", instancesResourceName) diff --git a/internal/scheduling/reservations/commitments/api/usage_test.go b/internal/scheduling/reservations/commitments/api/usage_test.go index d15967d16..86cc3f8ba 100644 --- a/internal/scheduling/reservations/commitments/api/usage_test.go +++ b/internal/scheduling/reservations/commitments/api/usage_test.go @@ -31,6 +31,16 @@ import ( // Unit Tests for UsageCalculator // ============================================================================ +// testUsageConfig is shared across UsageCalculator tests. +// Uses "*" catch-all so all flavor groups (hana_1, etc.) have HandlesCommitments=true for RAM. 
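+// A group-specific entry (e.g. "hana_1") would shadow the "*" catch-all, assuming
+// ResourceConfigForGroup prefers exact keys over the wildcard (not verified here).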
+var testUsageConfig = commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "*": { + RAM: commitments.ResourceTypeConfig{HandlesCommitments: true, HasQuota: true}, + }, + }, +} + func TestUsageCalculator_CalculateUsage(t *testing.T) { log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true))) ctx := context.Background() @@ -138,7 +148,7 @@ func TestUsageCalculator_CalculateUsage(t *testing.T) { } // Create calculator and run - calc := commitments.NewUsageCalculator(k8sClient, dbClient) + calc := commitments.NewUsageCalculator(k8sClient, dbClient, testUsageConfig) logger := log.FromContext(ctx) report, err := calc.CalculateUsage(ctx, logger, tt.projectID, tt.allAZs) if err != nil { @@ -392,7 +402,7 @@ func TestUsageCalculator_ExpiredAndFutureCommitments(t *testing.T) { } } - calc := commitments.NewUsageCalculator(k8sClient, dbClient) + calc := commitments.NewUsageCalculator(k8sClient, dbClient, testUsageConfig) logger := log.FromContext(ctx) report, err := calc.CalculateUsage(ctx, logger, tt.projectID, tt.allAZs) if err != nil { @@ -437,10 +447,9 @@ func TestUsageCalculator_ExpiredAndFutureCommitments(t *testing.T) { } // TestUsageMultipleCalculation_FloorDivision tests that RAM usage is calculated -// using floor division to handle Nova's memory overhead correctly. -// Nova flavors like "2 GiB" actually have 2032 MiB (not 2048) due to overhead. -// A "4 GiB" flavor has 4080 MiB, which is 2.007× the base unit. -// Floor division ensures 4080 / 2032 = 2 (not 3 from ceiling). +// by adding the 16 MiB video RAM reservation before dividing, matching actual flavor sizing. +// Nova flavors like "4 GiB" have 4080 MiB (4096 - 16 for hw_video:ram_max_mb=16). +// Adding 16 MiB restores the exact GiB multiple before integer division. func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true))) ctx := context.Background() @@ -461,7 +470,7 @@ func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { expectedInstances uint64 }{ { - name: "single smallest flavor - 1 unit", + name: "single smallest flavor - 2 units", vms: []commitments.VMRow{ { ID: "vm-001", Name: "vm-001", Status: "ACTIVE", @@ -470,12 +479,12 @@ func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { FlavorName: "g_k_c1_m2_v2", FlavorRAM: 2032, FlavorVCPUs: 1, }, }, - expectedRAM: 1, + expectedRAM: 2, expectedCores: 1, expectedInstances: 1, }, { - name: "2x flavor with overhead - floor(4080/2032) = 2 units, not 3", + name: "2x flavor with overhead - (4080+16)/1024 = 4 GiB", vms: []commitments.VMRow{ { ID: "vm-001", Name: "vm-001", Status: "ACTIVE", @@ -484,7 +493,7 @@ func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { FlavorName: "g_k_c2_m4_v2", FlavorRAM: 4080, FlavorVCPUs: 2, }, }, - expectedRAM: 2, // floor(4080/2032) = 2, NOT 3 (ceiling would give 3) + expectedRAM: 4, // (4080+16)/1024 = 4 expectedCores: 2, expectedInstances: 1, }, @@ -516,12 +525,10 @@ func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { FlavorName: "g_k_c16_m32_v2", FlavorRAM: 32752, FlavorVCPUs: 16, }, }, - // floor(2032/2032) + floor(4080/2032) + floor(16368/2032) + floor(32752/2032) - // = 1 + 2 + 8 + 16 = 27 (matches sum of vCPUs: 1+2+4+16=23... wait, that's not right) - // Actually cores = 1+2+4+16 = 23 - // RAM units = 1+2+8+16 = 27 - // These don't match because vCPUs and RAM have different ratios per flavor! 
- expectedRAM: 27, // 1 + 2 + 8 + 16 + // (2032+16)/1024 + (4080+16)/1024 + (16368+16)/1024 + (32752+16)/1024 + // = 2 + 4 + 16 + 32 = 54 + // Cores: 1 + 2 + 4 + 16 = 23 + expectedRAM: 54, // 2 + 4 + 16 + 32 expectedCores: 23, // 1 + 2 + 4 + 16 expectedInstances: 4, }, @@ -563,7 +570,7 @@ func TestUsageMultipleCalculation_FloorDivision(t *testing.T) { }, } - calc := commitments.NewUsageCalculator(k8sClient, dbClient) + calc := commitments.NewUsageCalculator(k8sClient, dbClient, testUsageConfig) logger := log.FromContext(ctx) report, err := calc.CalculateUsage(ctx, logger, "project-A", []liquid.AvailabilityZone{"az-a"}) if err != nil { diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go index 076428fa6..623493484 100644 --- a/internal/scheduling/reservations/commitments/capacity.go +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -7,8 +7,8 @@ import ( "context" "fmt" - . "github.com/majewsky/gg/option" "github.com/sapcc/go-api-declarations/liquid" + . "go.xyrillian.de/gg/option" apimeta "k8s.io/apimachinery/pkg/api/meta" "sigs.k8s.io/controller-runtime/pkg/client" @@ -19,15 +19,19 @@ import ( // CapacityCalculator computes capacity reports for Limes LIQUID API. type CapacityCalculator struct { client client.Client + conf APIConfig } -func NewCapacityCalculator(client client.Client) *CapacityCalculator { - return &CapacityCalculator{client: client} +func NewCapacityCalculator(client client.Client, conf APIConfig) *CapacityCalculator { + return &CapacityCalculator{client: client, conf: conf} } // CalculateCapacity computes per-AZ capacity for all flavor groups. // For each flavor group, three resources are reported: _ram, _cores, _instances. // Capacity and usage are read from FlavorGroupCapacity CRDs pre-computed by the capacity controller. +// Usage is approximated from slot counts (total − placeable of the smallest flavor); this may +// slightly under-report usage when larger flavors are running, showing more free capacity than +// reality — acceptable for capacity planning purposes. func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.ServiceCapacityRequest) (liquid.ServiceCapacityReport, error) { // Get all flavor groups from Knowledge CRDs (needed for smallest-flavor lookup). knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} @@ -62,20 +66,35 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.S logger := LoggerFromContext(ctx) for groupName, groupData := range flavorGroups { + resCfg := c.conf.ResourceConfigForGroup(groupName) + // Skip groups not configured for capacity reporting. + if !resCfg.RAM.HasCapacity && !resCfg.Cores.HasCapacity && !resCfg.Instances.HasCapacity { + continue + } + smallestFlavorName := groupData.SmallestFlavor.Name + // Add 16 MiB before dividing: flavors reserve 16 MiB for video RAM (hw_video:ram_max_mb=16), + // so a nominal "2 GiB" flavor has 2032 MiB. 
+	memoryMBPerSlot := groupData.SmallestFlavor.MemoryMB + 16
+	vcpusPerSlot := groupData.SmallestFlavor.VCPUs
+
+	ramAZCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs))
+	coresAZCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs))
+	instancesAZCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs))
-	azCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs))
 	for _, az := range req.AllAZs {
 		crd, ok := crdByKey[groupAZKey{groupName, string(az)}]
 		if !ok {
 			// No CRD for this (group, AZ) pair — report zero.
-			azCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
+			ramAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
+			coresAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
+			instancesAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
 			continue
 		}
 		// If the CRD data is stale, report last-known capacity but omit usage.
-		ready := apimeta.IsStatusConditionTrue(crd.Status.Conditions, v1alpha1.FlavorGroupCapacityConditionReady)
-		if !ready {
+		if !apimeta.IsStatusConditionTrue(crd.Status.Conditions, v1alpha1.FlavorGroupCapacityConditionReady) {
 			logger.Info("FlavorGroupCapacity CRD is stale, reporting capacity without usage",
 				"flavorGroup", groupName, "az", az)
 		}
@@ -89,50 +108,51 @@
 			}
 		}
 		if smallest == nil {
-			azCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
+			ramAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
+			coresAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
+			instancesAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0}
 			continue
 		}
-		capacity := uint64(smallest.TotalCapacityVMSlots) //nolint:gosec
-		azEntry := &liquid.AZResourceCapacityReport{Capacity: capacity}
-		if ready {
-			placeable := uint64(smallest.PlaceableVMs) //nolint:gosec
-			var usage uint64
-			if capacity > placeable {
-				usage = capacity - placeable
+		totalSlots := uint64(smallest.TotalCapacityVMSlots) //nolint:gosec // slot count from CRD, realistically bounded
+		ramEntry := &liquid.AZResourceCapacityReport{Capacity: totalSlots * memoryMBPerSlot / 1024}
+		coresEntry := &liquid.AZResourceCapacityReport{Capacity: totalSlots * vcpusPerSlot}
+		instancesEntry := &liquid.AZResourceCapacityReport{Capacity: totalSlots}
+
+		// Usage is approximated from slot counts. This may slightly under-report usage when
+		// larger flavors are running (safe direction: shows more free capacity than reality).
+		if apimeta.IsStatusConditionTrue(crd.Status.Conditions, v1alpha1.FlavorGroupCapacityConditionReady) {
+			placeableSlots := uint64(smallest.PlaceableVMs) //nolint:gosec // slot count from CRD, realistically bounded
+			var usedSlots uint64
+			if totalSlots > placeableSlots {
+				usedSlots = totalSlots - placeableSlots
 			}
-			azEntry.Usage = Some[uint64](usage)
+			ramEntry.Usage = Some[uint64](usedSlots * memoryMBPerSlot / 1024)
+			coresEntry.Usage = Some[uint64](usedSlots * vcpusPerSlot)
+			instancesEntry.Usage = Some[uint64](usedSlots)
 		}
-		azCapacity[az] = azEntry
+		ramAZCapacity[az] = ramEntry
+		coresAZCapacity[az] = coresEntry
+		instancesAZCapacity[az] = instancesEntry
 	}
-	// All three resources share the same capacity units (multiples of smallest flavor).
- report.Resources[liquid.ResourceName(ResourceNameRAM(groupName))] = &liquid.ResourceCapacityReport{ - PerAZ: azCapacity, + if resCfg.RAM.HasCapacity { + report.Resources[liquid.ResourceName(ResourceNameRAM(groupName))] = &liquid.ResourceCapacityReport{ + PerAZ: ramAZCapacity, + } } - report.Resources[liquid.ResourceName(ResourceNameCores(groupName))] = &liquid.ResourceCapacityReport{ - PerAZ: c.copyAZCapacity(azCapacity), + if resCfg.Cores.HasCapacity { + report.Resources[liquid.ResourceName(ResourceNameCores(groupName))] = &liquid.ResourceCapacityReport{ + PerAZ: coresAZCapacity, + } } - report.Resources[liquid.ResourceName(ResourceNameInstances(groupName))] = &liquid.ResourceCapacityReport{ - PerAZ: c.copyAZCapacity(azCapacity), + if resCfg.Instances.HasCapacity { + report.Resources[liquid.ResourceName(ResourceNameInstances(groupName))] = &liquid.ResourceCapacityReport{ + PerAZ: instancesAZCapacity, + } } } return report, nil } - -// copyAZCapacity creates a deep copy of the AZ capacity map. -// Each resource needs its own map instance. -func (c *CapacityCalculator) copyAZCapacity( - src map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, -) map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport { - - result := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(src)) - for az, report := range src { - result[az] = &liquid.AZResourceCapacityReport{ - Capacity: report.Capacity, - Usage: report.Usage, - } - } - return result -} diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go index a74c82f30..fde167152 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go @@ -91,6 +91,9 @@ func (r *CommittedResourceController) reconcilePending(ctx context.Context, logg "amount", cr.Spec.Amount.String(), "allowRejection", cr.Spec.AllowRejection, ) + if cr.Spec.ResourceType == v1alpha1.CommittedResourceTypeCores { + return r.reconcileCoresHeadroom(ctx, logger, cr) + } // If this spec generation was already rejected, don't re-apply. // Without this guard the controller oscillates: apply bad spec → delete reservations → // Reservation watch re-enqueues → apply bad spec again → loop. @@ -144,6 +147,9 @@ func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, lo "amount", cr.Spec.Amount.String(), "allowRejection", cr.Spec.AllowRejection, ) + if cr.Spec.ResourceType == v1alpha1.CommittedResourceTypeCores { + return r.reconcileCoresHeadroom(ctx, logger, cr) + } // If this spec generation was already rejected, maintain rollback state without re-applying. // Without this guard the controller oscillates: apply bad spec → rollback → // Reservation watch re-enqueues → apply bad spec again → loop. @@ -195,6 +201,95 @@ func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, lo return ctrl.Result{}, r.setAccepted(ctx, cr) } +// reconcileCoresHeadroom handles acceptance of CPU core commitments. +// No Reservation CRDs are created; instead it reads the flavor group's total CPU capacity +// from the FlavorGroupCapacity CRD and sums already-accepted CPU CRs for the same +// (flavorGroup, AZ) to check whether sufficient headroom exists. 
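Boiled down, the acceptance rule implemented below is pure integer arithmetic over the CRD inventory. A self-contained sketch of the decision (function and type names here are illustrative, not from the patch):

```go
package main

import "fmt"

// coresDecision distills the headroom rule: a request is accepted iff
// total - alreadyCommitted >= requested; otherwise AllowRejection decides
// between a terminal rejection and a retried requeue.
func coresDecision(total, committed, requested int64, allowRejection bool) string {
	headroom := total - committed
	switch {
	case headroom >= requested:
		return "accept"
	case allowRejection:
		return "reject" // terminal: no requeue
	default:
		return "retry" // requeue with backoff until capacity appears
	}
}

func main() {
	fmt.Println(coresDecision(16, 8, 8, false))  // accept: headroom exactly meets request
	fmt.Println(coresDecision(16, 8, 10, true))  // reject
	fmt.Println(coresDecision(16, 8, 10, false)) // retry
	fmt.Println(coresDecision(0, 0, 4, true))    // reject: zero capacity is authoritative
}
```

These four cases mirror the table-driven tests added further down in this diff.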
+func (r *CommittedResourceController) reconcileCoresHeadroom(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) { + if isRejectedForGeneration(cr) { + logger.V(1).Info("spec already rejected for current generation", "generation", cr.Generation) + return ctrl.Result{}, nil + } + + // Find the FlavorGroupCapacity CRD for this (flavorGroup, AZ). + var capacityList v1alpha1.FlavorGroupCapacityList + if err := r.List(ctx, &capacityList); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list FlavorGroupCapacity CRDs: %w", err) + } + var fgCap *v1alpha1.FlavorGroupCapacity + for i := range capacityList.Items { + c := &capacityList.Items[i] + if c.Spec.FlavorGroup == cr.Spec.FlavorGroupName && c.Spec.AvailabilityZone == cr.Spec.AvailabilityZone { + fgCap = c + break + } + } + if fgCap == nil { + // Capacity controller hasn't run yet for this group/AZ — retry. + delay := r.retryDelay(cr) + logger.Info("FlavorGroupCapacity CRD not found, will retry", "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, r.setNotReadyRetry(ctx, cr, "waiting for capacity data") + } + + totalCoresQty, ok := fgCap.Status.TotalCapacity[string(v1alpha1.CommittedResourceTypeCores)] + if !ok { + // Key absent means the capacity controller hasn't successfully probed yet. + delay := r.retryDelay(cr) + logger.Info("CPU capacity not yet populated in FlavorGroupCapacity CRD, will retry", "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, r.setNotReadyRetry(ctx, cr, "waiting for CPU capacity data") + } + // Key present (even if zero): the capacity controller has run and this is the actual capacity. + totalCores := totalCoresQty.Value() + + // Sum cores already accepted for this (flavorGroup, AZ), excluding this CR. 
+ var alreadyCommitted int64 + var crList v1alpha1.CommittedResourceList + if err := r.List(ctx, &crList); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list CommittedResources: %w", err) + } + for _, other := range crList.Items { + if other.Spec.CommitmentUUID == cr.Spec.CommitmentUUID { + continue + } + if other.Spec.ResourceType != v1alpha1.CommittedResourceTypeCores { + continue + } + if other.Spec.FlavorGroupName != cr.Spec.FlavorGroupName || other.Spec.AvailabilityZone != cr.Spec.AvailabilityZone { + continue + } + if other.Spec.State != v1alpha1.CommitmentStatusGuaranteed && other.Spec.State != v1alpha1.CommitmentStatusConfirmed { + continue + } + if other.Status.AcceptedSpec == nil { + continue + } + alreadyCommitted += other.Status.AcceptedSpec.Amount.Value() + } + + requestedCores := cr.Spec.Amount.Value() + headroom := totalCores - alreadyCommitted + logger.Info("cores headroom check", + "total", totalCores, + "committed", alreadyCommitted, + "requested", requestedCores, + "headroom", headroom, + ) + + if headroom < requestedCores { + reason := fmt.Sprintf("insufficient CPU cores: %d available, %d requested", headroom, requestedCores) + if cr.Spec.AllowRejection { + logger.Info("cores commitment rejected", "reason", reason) + return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, reason) + } + delay := r.retryDelay(cr) + logger.Info("cores headroom insufficient, will retry", "reason", reason, "requeueAfter", delay) + return ctrl.Result{RequeueAfter: delay}, r.setNotReadyRetry(ctx, cr, reason) + } + + logger.Info("cores commitment accepted", "generation", cr.Generation, "cores", requestedCores) + return ctrl.Result{}, r.setAccepted(ctx, cr) +} + func (r *CommittedResourceController) applyReservationState(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (*ApplyResult, error) { knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go index e5c243478..d0eb6433f 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go @@ -717,6 +717,229 @@ func TestCommittedResourceController_RetryBackoff(t *testing.T) { } } +// ============================================================================ +// Tests: reconcileCoresHeadroom +// ============================================================================ + +// newTestCoresCR creates a CommittedResource with ResourceType=cores. +func newTestCoresCR(name string, state v1alpha1.CommitmentStatus, cores int64, allowRejection bool) *v1alpha1.CommittedResource { + return &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "cores-uuid-1234", + FlavorGroupName: "test-group", + ResourceType: v1alpha1.CommittedResourceTypeCores, + Amount: *resource.NewQuantity(cores, resource.DecimalSI), + AvailabilityZone: "test-az", + ProjectID: "test-project", + DomainID: "test-domain", + State: state, + AllowRejection: allowRejection, + }, + } +} + +// newTestFlavorGroupCapacity creates a FlavorGroupCapacity CRD with the given total cores. 
+func newTestFlavorGroupCapacity(flavorGroup, az string, totalCores int64) *v1alpha1.FlavorGroupCapacity { + return &v1alpha1.FlavorGroupCapacity{ + ObjectMeta: metav1.ObjectMeta{Name: flavorGroup + "-" + az}, + Spec: v1alpha1.FlavorGroupCapacitySpec{ + FlavorGroup: flavorGroup, + AvailabilityZone: az, + }, + Status: v1alpha1.FlavorGroupCapacityStatus{ + TotalCapacity: map[string]resource.Quantity{ + string(v1alpha1.CommittedResourceTypeCores): *resource.NewQuantity(totalCores, resource.DecimalSI), + }, + }, + } +} + +func TestCommittedResourceController_CoresHeadroom(t *testing.T) { + tests := []struct { + name string + state v1alpha1.CommitmentStatus + requestedCores int64 + totalCores int64 + allowRejection bool + // other accepted CPU CRs consuming cores + existingCores int64 + // capacity CRD missing entirely + noCapacityCRD bool + // capacity CRD present but TotalCapacity["cores"] not set + noCoreCapacity bool + expectedStatus metav1.ConditionStatus + expectedReason string + expectRequeue bool + }{ + { + name: "accepted: sufficient headroom", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 4, + totalCores: 16, + existingCores: 0, + expectedStatus: metav1.ConditionTrue, + expectedReason: "Accepted", + }, + { + name: "accepted: headroom exactly meets request", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 8, + totalCores: 16, + existingCores: 8, + expectedStatus: metav1.ConditionTrue, + expectedReason: "Accepted", + }, + { + name: "rejected: insufficient headroom, AllowRejection=true", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 10, + totalCores: 16, + existingCores: 8, + allowRejection: true, + expectedStatus: metav1.ConditionFalse, + expectedReason: "Rejected", + expectRequeue: false, + }, + { + name: "retry: insufficient headroom, AllowRejection=false", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 10, + totalCores: 16, + existingCores: 8, + allowRejection: false, + expectedStatus: metav1.ConditionFalse, + expectedReason: "Reserving", + expectRequeue: true, + }, + { + name: "retry: FlavorGroupCapacity CRD not found", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 4, + noCapacityCRD: true, + allowRejection: true, + expectedStatus: metav1.ConditionFalse, + expectedReason: "Reserving", + expectRequeue: true, + }, + { + name: "retry: TotalCapacity[cores] not set", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 4, + noCoreCapacity: true, + allowRejection: true, + expectedStatus: metav1.ConditionFalse, + expectedReason: "Reserving", + expectRequeue: true, + }, + { + // TotalCapacity["cores"]=0 means the capacity controller probed and found no + // eligible hosts (e.g. HANA flavor groups in a QA cluster). This must reject + // immediately rather than retrying, to avoid API timeouts. 
+ name: "rejected immediately: zero CPU capacity, AllowRejection=true", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 4, + totalCores: 0, + allowRejection: true, + expectedStatus: metav1.ConditionFalse, + expectedReason: "Rejected", + expectRequeue: false, + }, + { + name: "stays rejected: already rejected for current generation", + state: v1alpha1.CommitmentStatusConfirmed, + requestedCores: 4, + totalCores: 16, + allowRejection: true, + expectedStatus: metav1.ConditionFalse, + expectedReason: "Rejected", + expectRequeue: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := newCRTestScheme(t) + cr := newTestCoresCR("test-cr", tt.state, tt.requestedCores, tt.allowRejection) + + objects := []client.Object{cr} + + if !tt.noCapacityCRD { + if tt.noCoreCapacity { + // Capacity CRD present but without the cores key. + fgc := &v1alpha1.FlavorGroupCapacity{ + ObjectMeta: metav1.ObjectMeta{Name: "test-group-test-az"}, + Spec: v1alpha1.FlavorGroupCapacitySpec{ + FlavorGroup: "test-group", + AvailabilityZone: "test-az", + }, + Status: v1alpha1.FlavorGroupCapacityStatus{ + TotalCapacity: map[string]resource.Quantity{}, + }, + } + objects = append(objects, fgc) + } else { + objects = append(objects, newTestFlavorGroupCapacity("test-group", "test-az", tt.totalCores)) + } + } + + if tt.existingCores > 0 { + // An already-accepted cores CR consuming existingCores. + otherCR := newTestCoresCR("other-cr", v1alpha1.CommitmentStatusConfirmed, tt.existingCores, false) + otherCR.Spec.CommitmentUUID = "other-uuid-5678" + otherSpec := otherCR.Spec + otherCR.Status.AcceptedSpec = &otherSpec + objects = append(objects, otherCR) + } + + k8sClient := newCRTestClient(scheme, objects...) + + // For the "stays rejected" test, pre-set Rejected condition at current generation. + if tt.name == "stays rejected: already rejected for current generation" { + var fetched v1alpha1.CommittedResource + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &fetched); err != nil { + t.Fatalf("get CR: %v", err) + } + fetched.Status.Conditions = []metav1.Condition{{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonRejected, + ObservedGeneration: fetched.Generation, + LastTransitionTime: metav1.Now(), + }} + if err := k8sClient.Status().Update(context.Background(), &fetched); err != nil { + t.Fatalf("set rejected status: %v", err) + } + } + + controller := &CommittedResourceController{ + Client: k8sClient, + Scheme: scheme, + Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 1 * time.Minute}}, + } + + result, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)) + if err != nil { + t.Fatalf("reconcile: %v", err) + } + + assertCondition(t, k8sClient, cr.Name, tt.expectedStatus, tt.expectedReason) + + if tt.expectRequeue && result.RequeueAfter == 0 { + t.Errorf("expected requeue, got none") + } + if !tt.expectRequeue && result.RequeueAfter != 0 { + t.Errorf("expected no requeue, got RequeueAfter=%v", result.RequeueAfter) + } + + // CPU CRs must never produce Reservation CRDs. 
+ if got := countChildReservations(t, k8sClient, cr.Spec.CommitmentUUID); got != 0 { + t.Errorf("expected 0 child reservations for cores CR, got %d", got) + } + }) + } +} + func TestRetryDelay(t *testing.T) { base := 30 * time.Second maxDelay := 30 * time.Minute diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go index dbdbef40e..758f5cf24 100644 --- a/internal/scheduling/reservations/commitments/e2e_checks.go +++ b/internal/scheduling/reservations/commitments/e2e_checks.go @@ -14,9 +14,9 @@ import ( "strings" "time" - . "github.com/majewsky/gg/option" liquid "github.com/sapcc/go-api-declarations/liquid" "github.com/sapcc/go-bits/must" + . "go.xyrillian.de/gg/option" ) const ( @@ -189,7 +189,7 @@ func e2eRoundTripResource( // Only capacity rejections (no hosts available) are expected in production clusters. // Any other reason (flavor group ineligible, config error, timeout) indicates a // regression and should surface as a failure. - if !strings.Contains(rejectionReason, "no hosts found") { + if !strings.Contains(rejectionReason, "no hosts found") && !strings.Contains(rejectionReason, "insufficient CPU cores") { panic(fmt.Sprintf("round-trip check: commitment rejected with unexpected reason for resource %s: %s", resourceName, rejectionReason)) } slog.Info("round-trip check: commitment rejected — no capacity, continuing", @@ -391,7 +391,7 @@ func e2eBatchFlavorGroupResource( "project", projectID, "az", az) if reason := e2eSendChangeCommitments(ctx, baseURL, req2); reason != "" { - if !strings.Contains(reason, "no hosts found") { + if !strings.Contains(reason, "no hosts found") && !strings.Contains(reason, "insufficient CPU cores") { panic(fmt.Sprintf("batch check: unexpected rejection for batch of %s: %s", resourceName, reason)) } slog.Info("batch check: batch rejected — no capacity for full amount, cleanup will remove pending", diff --git a/internal/scheduling/reservations/commitments/field_index.go b/internal/scheduling/reservations/commitments/field_index.go index 5bcbefdc3..1f3733615 100644 --- a/internal/scheduling/reservations/commitments/field_index.go +++ b/internal/scheduling/reservations/commitments/field_index.go @@ -17,6 +17,7 @@ import ( const idxCommittedResourceByUUID = "spec.commitmentUUID" const idxCommittedResourceByProjectID = "spec.projectID" const idxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID" +const idxProjectQuotaByProjectID = "spec.projectID" // once guards ensure each field index is registered exactly once. // Both CommittedResourceController and UsageReconciler call indexCommittedResourceByUUID; @@ -25,6 +26,7 @@ var ( onceIndexCRByUUID sync.Once onceIndexCRByProjectID sync.Once onceIndexReservationByUUID sync.Once + onceIndexPQByProjectID sync.Once ) // indexCommittedResourceByUUID registers the index used by UsageReconciler to look up @@ -101,3 +103,28 @@ func indexReservationByCommitmentUUID(ctx context.Context, mcl *multicluster.Cli }) return err } + +// indexProjectQuotaByProjectID registers the index used by UsageCalculator to look up +// a project's ProjectQuota CRD by its ProjectID without assuming a naming convention. 
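Callers consume the index through a `MatchingFields` selector. An illustrative helper (not part of the patch) showing the lookup that `UsageCalculator` performs against this index:

```go
// listProjectQuotas fetches all per-AZ ProjectQuota CRDs for one project via
// the field index registered below. Illustrative only; the real call site is
// the List call added to usage.go later in this diff.
func listProjectQuotas(ctx context.Context, c client.Client, projectID string) ([]v1alpha1.ProjectQuota, error) {
	var pqList v1alpha1.ProjectQuotaList
	if err := c.List(ctx, &pqList, client.MatchingFields{idxProjectQuotaByProjectID: projectID}); err != nil {
		return nil, err
	}
	return pqList.Items, nil
}
```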
+func indexProjectQuotaByProjectID(ctx context.Context, mcl *multicluster.Client) (err error) { + onceIndexPQByProjectID.Do(func() { + log := logf.FromContext(ctx) + err = mcl.IndexField(ctx, + &v1alpha1.ProjectQuota{}, + &v1alpha1.ProjectQuotaList{}, + idxProjectQuotaByProjectID, + func(obj client.Object) []string { + pq, ok := obj.(*v1alpha1.ProjectQuota) + if !ok { + log.Error(errors.New("unexpected type"), "expected ProjectQuota", "object", obj) + return nil + } + if pq.Spec.ProjectID == "" { + return nil + } + return []string{pq.Spec.ProjectID} + }, + ) + }) + return err +} diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index 149cdfc03..976334f9b 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -11,7 +11,6 @@ import ( "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "github.com/sapcc/go-api-declarations/liquid" ) @@ -73,6 +72,32 @@ func GetFlavorGroupNameFromResource(resourceName string) (string, error) { return name, nil } +// GetFlavorGroupAndTypeFromResource extracts the flavor group name and resource type from a +// LIQUID resource name. Accepts _ram (memory) and _cores (CPU) suffixes. +// _instances resources are not supported for commitments. +func GetFlavorGroupAndTypeFromResource(resourceName string) (string, v1alpha1.CommittedResourceType, error) { + if !strings.HasPrefix(resourceName, resourceNamePrefix) { + return "", "", fmt.Errorf("invalid resource name: %s (missing prefix)", resourceName) + } + name := strings.TrimPrefix(resourceName, resourceNamePrefix) + switch { + case strings.HasSuffix(name, ResourceSuffixRAM): + group := strings.TrimSuffix(name, ResourceSuffixRAM) + if group == "" { + return "", "", fmt.Errorf("invalid resource name: %s (empty group name)", resourceName) + } + return group, v1alpha1.CommittedResourceTypeMemory, nil + case strings.HasSuffix(name, ResourceSuffixCores): + group := strings.TrimSuffix(name, ResourceSuffixCores) + if group == "" { + return "", "", fmt.Errorf("invalid resource name: %s (empty group name)", resourceName) + } + return group, v1alpha1.CommittedResourceTypeCores, nil + default: + return "", "", fmt.Errorf("invalid resource name: %s (only _ram and _cores resources are supported for commitments)", resourceName) + } +} + // CommitmentState represents desired or current commitment resource allocation. type CommitmentState struct { // CommitmentUUID is the UUID of the commitment this state corresponds to. @@ -83,8 +108,12 @@ type CommitmentState struct { DomainID string // FlavorGroupName identifies the flavor group (e.g., "hana_medium_v2") FlavorGroupName string - // the total memory in bytes across all reservation slots + // ResourceType is the kind of resource committed: memory or cores. + ResourceType v1alpha1.CommittedResourceType + // TotalMemoryBytes is the total memory in bytes across all reservation slots (memory commitments only). TotalMemoryBytes int64 + // TotalCores is the number of committed CPU cores (cores commitments only). + TotalCores int64 // AvailabilityZone specifies the availability zone for this commitment AvailabilityZone string // StartTime is when the commitment becomes active @@ -109,7 +138,6 @@ type CommitmentState struct { // FromCommitment converts Limes commitment to CommitmentState. 
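To make the parser's behavior concrete before the diff moves on to `FromCommitment`: a runnable toy version of the suffix handling above (simplified, in that it skips the prefix and empty-group validation the real function performs):

```go
package main

import (
	"fmt"
	"strings"
)

// parse mirrors only the suffix logic of GetFlavorGroupAndTypeFromResource.
func parse(resourceName string) (group, resourceType string) {
	name := strings.TrimPrefix(resourceName, "hw_version_")
	switch {
	case strings.HasSuffix(name, "_ram"):
		return strings.TrimSuffix(name, "_ram"), "memory"
	case strings.HasSuffix(name, "_cores"):
		return strings.TrimSuffix(name, "_cores"), "cores"
	}
	return "", "" // _instances and anything else: unsupported for commitments
}

func main() {
	fmt.Println(parse("hw_version_hana_medium_v2_ram")) // hana_medium_v2 memory
	fmt.Println(parse("hw_version_2101_cores"))         // 2101 cores
}
```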
func FromCommitment( commitment Commitment, - flavorGroup compute.FlavorGroupFeature, ) (*CommitmentState, error) { // Validate commitment UUID format if !commitmentUUIDPattern.MatchString(commitment.UUID) { @@ -121,9 +149,9 @@ func FromCommitment( return nil, err } - // Calculate total memory from commitment amount (amount = multiples of smallest flavor) - smallestFlavorMemoryBytes := int64(flavorGroup.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded - totalMemoryBytes := int64(commitment.Amount) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota limits + // Calculate total memory from commitment amount (1 GiB per unit) + const gibInBytes = int64(1) << 30 + totalMemoryBytes := int64(commitment.Amount) * gibInBytes //nolint:gosec // commitment amount from Limes API, bounded by quota limits // Set start time: use ConfirmedAt if available, otherwise CreatedAt var startTime *time.Time @@ -161,7 +189,7 @@ func FromChangeCommitmentTargetState( projectID string, domainID string, flavorGroupName string, - flavorGroup compute.FlavorGroupFeature, + resourceType v1alpha1.CommittedResourceType, az string, ) (*CommitmentState, error) { // Validate commitment UUID format @@ -202,31 +230,37 @@ func FromChangeCommitmentTargetState( } } - // Flavors are sorted by size descending, so the last one is the smallest - smallestFlavor := flavorGroup.SmallestFlavor - smallestFlavorMemoryBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded - - // Amount represents multiples of the smallest flavor in the group - totalMemoryBytes := int64(amountMultiple) * smallestFlavorMemoryBytes - - return &CommitmentState{ + state := &CommitmentState{ CommitmentUUID: string(commitment.UUID), ProjectID: projectID, DomainID: domainID, FlavorGroupName: flavorGroupName, - TotalMemoryBytes: totalMemoryBytes, + ResourceType: resourceType, AvailabilityZone: az, StartTime: startTime, EndTime: endTime, State: v1alpha1.CommitmentStatus(commitment.NewStatus.UnwrapOr("")), - }, nil + } + + switch resourceType { + case v1alpha1.CommittedResourceTypeCores: + state.TotalCores = int64(amountMultiple) + default: + // Amount represents GiB of RAM (1 GiB per unit) + const gibInBytes = int64(1) << 30 + state.TotalMemoryBytes = int64(amountMultiple) * gibInBytes + } + + return state, nil } // FromCommittedResource reads CommitmentState from a CommittedResource CRD. -// Only memory commitments are supported; cores support is added in a follow-up. 
func FromCommittedResource(cr v1alpha1.CommittedResource) (*CommitmentState, error) { - if cr.Spec.ResourceType != v1alpha1.CommittedResourceTypeMemory { - return nil, fmt.Errorf("unsupported resource type %q: only memory commitments are supported", cr.Spec.ResourceType) + switch cr.Spec.ResourceType { + case v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommittedResourceTypeCores: + // supported + default: + return nil, fmt.Errorf("unsupported resource type %q", cr.Spec.ResourceType) } if !commitmentUUIDPattern.MatchString(cr.Spec.CommitmentUUID) { @@ -238,10 +272,17 @@ func FromCommittedResource(cr v1alpha1.CommittedResource) (*CommitmentState, err ProjectID: cr.Spec.ProjectID, DomainID: cr.Spec.DomainID, FlavorGroupName: cr.Spec.FlavorGroupName, - TotalMemoryBytes: cr.Spec.Amount.Value(), + ResourceType: cr.Spec.ResourceType, AvailabilityZone: cr.Spec.AvailabilityZone, } + switch cr.Spec.ResourceType { + case v1alpha1.CommittedResourceTypeCores: + state.TotalCores = cr.Spec.Amount.Value() + default: + state.TotalMemoryBytes = cr.Spec.Amount.Value() + } + if cr.Spec.StartTime != nil { t := cr.Spec.StartTime.Time state.StartTime = &t diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go index 3aba27b71..4fceccebe 100644 --- a/internal/scheduling/reservations/commitments/state_test.go +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -32,7 +32,6 @@ func testFlavorGroup() compute.FlavorGroupFeature { } func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { - flavorGroup := testFlavorGroup() commitment := Commitment{ UUID: "test-uuid", ProjectID: "project-1", @@ -40,7 +39,7 @@ func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { Amount: 5, // 5 multiples of smallest flavor } - state, err := FromCommitment(commitment, flavorGroup) + state, err := FromCommitment(commitment) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -56,15 +55,14 @@ func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName) } - // Verify memory calculation: 5 * 8192 MB = 40960 MB = 42949672960 bytes - expectedMemory := int64(5 * 8192 * 1024 * 1024) + // Verify memory calculation: 5 GiB = 5 * 1<<30 bytes + expectedMemory := int64(5) * (1 << 30) if state.TotalMemoryBytes != expectedMemory { t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes) } } func TestFromCommitment_InvalidResourceName(t *testing.T) { - flavorGroup := testFlavorGroup() commitment := Commitment{ UUID: "test-uuid", ProjectID: "project-1", @@ -72,7 +70,7 @@ func TestFromCommitment_InvalidResourceName(t *testing.T) { Amount: 1, } - _, err := FromCommitment(commitment, flavorGroup) + _, err := FromCommitment(commitment) if err == nil { t.Fatal("expected error for invalid resource name, got nil") } @@ -273,3 +271,71 @@ func TestResourceNameRoundTrip(t *testing.T) { } } } + +func TestGetFlavorGroupAndTypeFromResource_RAM(t *testing.T) { + group, rtype, err := GetFlavorGroupAndTypeFromResource("hw_version_hana_medium_v2_ram") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if group != "hana_medium_v2" { + t.Errorf("expected hana_medium_v2, got %s", group) + } + if rtype != v1alpha1.CommittedResourceTypeMemory { + t.Errorf("expected CommittedResourceTypeMemory, got %s", rtype) + } +} + +func TestGetFlavorGroupAndTypeFromResource_Cores(t *testing.T) { + group, rtype, err := 
GetFlavorGroupAndTypeFromResource("hw_version_2101_cores") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if group != "2101" { + t.Errorf("expected 2101, got %s", group) + } + if rtype != v1alpha1.CommittedResourceTypeCores { + t.Errorf("expected CommittedResourceTypeCores, got %s", rtype) + } +} + +func TestGetFlavorGroupAndTypeFromResource_Invalid(t *testing.T) { + invalidCases := []string{ + "invalid", // no prefix + "hw_version__ram", // empty group + "hw_version__cores", // empty group + "hw_version_2101_instances", // unsupported suffix + "hw_version_2101", // no suffix + } + for _, input := range invalidCases { + if _, _, err := GetFlavorGroupAndTypeFromResource(input); err == nil { + t.Errorf("expected error for %q, got nil", input) + } + } +} + +func TestFromCommittedResource_Cores(t *testing.T) { + cr := v1alpha1.CommittedResource{ + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "test-uuid-1234", + ProjectID: "project-1", + DomainID: "domain-1", + FlavorGroupName: "2101", + ResourceType: v1alpha1.CommittedResourceTypeCores, + Amount: *resource.NewQuantity(8, resource.DecimalSI), + AvailabilityZone: "test-az", + }, + } + state, err := FromCommittedResource(cr) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if state.ResourceType != v1alpha1.CommittedResourceTypeCores { + t.Errorf("expected ResourceType cores, got %s", state.ResourceType) + } + if state.TotalCores != 8 { + t.Errorf("expected TotalCores 8, got %d", state.TotalCores) + } + if state.TotalMemoryBytes != 0 { + t.Errorf("expected TotalMemoryBytes 0 for cores CR, got %d", state.TotalMemoryBytes) + } +} diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 7c96823d0..939d1cfc2 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -13,6 +13,7 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" + "github.com/sapcc/go-api-declarations/liquid" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -140,9 +141,8 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo continue } - // Validate unit matches between Limes commitment and Cortex flavor group - // Expected format: " MiB" e.g. "131072 MiB" for 128 GiB - expectedUnit := fmt.Sprintf("%d MiB", flavorGroup.SmallestFlavor.MemoryMB) + // Validate unit matches between Limes commitment and Cortex (1 GiB per unit) + expectedUnit := liquid.UnitGibibytes.String() // "GiB" if commitment.Unit != "" && commitment.Unit != expectedUnit { // Unit mismatch: Limes has not yet updated this commitment to the new unit. // Skip this commitment - trust what Cortex already has stored in CRDs. 
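The new unit scheme is worth pinning down with concrete numbers: one commitment unit is 1 GiB, Nova flavors under-report RAM by the 16 MiB video reservation, and the capacity side converts slots to GiB via the adjusted slot size. A self-contained sketch (helper names are illustrative):

```go
package main

import "fmt"

// flavorRAMToGiB converts a Nova flavor's reported RAM (MiB) to commitment
// units, compensating for the 16 MiB hw_video:ram_max_mb reservation.
func flavorRAMToGiB(flavorRAMMiB uint64) uint64 {
	return (flavorRAMMiB + 16) / 1024
}

// amountToBytes converts a Limes commitment amount (unit "GiB") to bytes.
func amountToBytes(amount int64) int64 {
	return amount * (1 << 30)
}

func main() {
	fmt.Println(flavorRAMToGiB(2032))  // 2: nominal "2 GiB" flavor
	fmt.Println(flavorRAMToGiB(4080))  // 4: nominal "4 GiB" flavor
	fmt.Println(flavorRAMToGiB(32752)) // 32: nominal "32 GiB" flavor
	fmt.Println(amountToBytes(5))      // 5368709120 bytes for a 5 GiB commitment
}
```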
@@ -174,7 +174,7 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo } // Convert commitment to state using FromCommitment - state, err := FromCommitment(commitment, flavorGroup) + state, err := FromCommitment(commitment) if err != nil { log.Error(err, "failed to convert commitment to state", "id", id, diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 28a464d1e..27e2ae6c1 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -412,8 +412,8 @@ func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) { WithObjects(flavorGroupsKnowledge). Build() - // Create mock commitment with a unit that doesn't match Cortex's understanding - // Limes says "2048 MiB" but Cortex's smallest flavor is 1024 MB + // Create mock commitment with a unit that doesn't match Cortex's expected "GiB" + // Limes says "2048 MiB" but Cortex expects "GiB" mockCommitments := []Commitment{ { ID: 1, @@ -422,7 +422,7 @@ func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) { ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 2, - Unit: "2048 MiB", // Mismatched unit - should be "1024 MiB" + Unit: "2048 MiB", // Mismatched unit - should be "GiB" Status: "confirmed", ProjectID: "test-project", DomainID: "test-domain", @@ -502,7 +502,7 @@ func TestSyncer_SyncReservations_UnitMatch(t *testing.T) { ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", Amount: 2, - Unit: "1024 MiB", // Correct unit matching smallest flavor + Unit: "GiB", // Correct unit matching Cortex's expected GiB Status: "confirmed", ProjectID: "test-project", DomainID: "test-domain", diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index a9333fdaa..9931cda9d 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ b/internal/scheduling/reservations/commitments/usage.go @@ -19,8 +19,8 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/external" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" - . "github.com/majewsky/gg/option" "github.com/sapcc/go-api-declarations/liquid" + . "go.xyrillian.de/gg/option" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -100,20 +100,22 @@ type VMUsageInfo struct { AZ string Hypervisor string CreatedAt time.Time - UsageMultiple uint64 // Memory in multiples of smallest flavor in the group + UsageMultiple uint64 // RAM in GiB } // UsageCalculator computes usage reports for Limes LIQUID API. type UsageCalculator struct { client client.Client usageDB UsageDBClient + config APIConfig } // NewUsageCalculator creates a new UsageCalculator instance. -func NewUsageCalculator(client client.Client, usageDB UsageDBClient) *UsageCalculator { +func NewUsageCalculator(client client.Client, usageDB UsageDBClient, config APIConfig) *UsageCalculator { return &UsageCalculator{ client: client, usageDB: usageDB, + config: config, } } @@ -144,12 +146,21 @@ func (c *UsageCalculator) CalculateUsage( return liquid.ServiceUsageReport{}, fmt.Errorf("failed to read VM assignments from CRD status: %w", err) } + // Fetch all per-AZ ProjectQuota CRDs for this project to read quota values. + // May be empty if Limes has not pushed quota yet — in that case quota defaults to infinite. 
+	// Each CRD holds quota for one AZ; we build a combined map[resourceName][az] = quota.
+	var pqList v1alpha1.ProjectQuotaList
+	var quotaByResourceAZ map[string]map[string]int64
+	if err := c.client.List(ctx, &pqList, client.MatchingFields{idxProjectQuotaByProjectID: projectID}); err != nil {
+		log.Error(err, "failed to list ProjectQuota CRDs; reporting default (infinite) quota", "projectID", projectID)
+	} else if len(pqList.Items) > 0 {
+		quotaByResourceAZ = buildCombinedQuotaMap(pqList.Items)
+	}
+
 	vms, err := getProjectVMs(ctx, c.usageDB, log, projectID, flavorGroups, allAZs)
 	if err != nil {
 		return liquid.ServiceUsageReport{}, fmt.Errorf("failed to get project VMs: %w", err)
 	}

-	report := c.buildUsageResponse(vms, vmAssignments, flavorGroups, allAZs, infoVersion)
+	report := c.buildUsageResponse(vms, vmAssignments, flavorGroups, allAZs, infoVersion, quotaByResourceAZ, c.config)

 	assignedToCommitments := 0
 	for _, vm := range vms {
@@ -276,17 +287,10 @@

 	// Build flavor name -> flavor group lookup
 	flavorToGroup := make(map[string]string)
-	flavorToSmallestMemory := make(map[string]uint64) // for calculating usage multiples
 	for groupName, group := range flavorGroups {
 		for _, flavor := range group.Flavors {
 			flavorToGroup[flavor.Name] = groupName
 		}
-		// Smallest flavor in group determines the usage unit
-		if group.SmallestFlavor.Name != "" {
-			for _, flavor := range group.Flavors {
-				flavorToSmallestMemory[flavor.Name] = group.SmallestFlavor.MemoryMB
-			}
-		}
 	}

 	var vms []VMUsageInfo
@@ -302,10 +306,12 @@

 		// Determine flavor group
 		flavorGroup := flavorToGroup[row.FlavorName]

-		// Calculate usage multiple (memory in units of smallest flavor)
+		// Calculate usage in GiB (FlavorRAM is in MiB).
+		// Add 16 MiB before dividing: flavors reserve 16 MiB for video RAM (hw_video:ram_max_mb=16),
+		// so a nominal "2 GiB" flavor has 2032 MiB. Without the adjustment, integer division truncates.
 		var usageMultiple uint64
-		if smallestMem := flavorToSmallestMemory[row.FlavorName]; smallestMem > 0 {
-			usageMultiple = row.FlavorRAM / smallestMem
+		if row.FlavorRAM > 0 {
+			usageMultiple = (row.FlavorRAM + 16) / 1024
 		}

 		// Normalize AZ
@@ -421,16 +427,35 @@ type azUsageData struct {
 	subresources []liquid.Subresource // VM details for subresource reporting
 }

+// buildCombinedQuotaMap aggregates per-AZ ProjectQuota CRDs into a combined lookup map.
+// Returns quotaByResourceAZ[resourceName][az] = quota value.
+func buildCombinedQuotaMap(pqs []v1alpha1.ProjectQuota) map[string]map[string]int64 {
+	result := make(map[string]map[string]int64)
+	for _, pq := range pqs {
+		az := pq.Spec.AvailabilityZone
+		for resourceName, quota := range pq.Spec.Quota {
+			if result[resourceName] == nil {
+				result[resourceName] = make(map[string]int64)
+			}
+			result[resourceName][az] = quota
+		}
+	}
+	return result
+}
+
 // buildUsageResponse constructs the Liquid API ServiceUsageReport.
 // All flavor groups are included in the report; commitment assignment only applies
 // to groups with fixed RAM/core ratio (those that accept commitments).
 // For each flavor group, three resources are reported: _ram, _cores, _instances.
+// quotaByResourceAZ is a combined map[resourceName][az] = quota from all per-AZ ProjectQuota CRDs.
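For intuition, two per-AZ ProjectQuota CRDs for the same project combine as follows. A runnable sketch with hand-rolled stand-in types, since the CRD types are not needed to show the shape:

```go
package main

import "fmt"

// pq stands in for the relevant ProjectQuota fields (illustrative only).
type pq struct {
	az    string
	quota map[string]int64
}

// combine mirrors buildCombinedQuotaMap's aggregation logic.
func combine(pqs []pq) map[string]map[string]int64 {
	result := make(map[string]map[string]int64)
	for _, p := range pqs {
		for resource, q := range p.quota {
			if result[resource] == nil {
				result[resource] = make(map[string]int64)
			}
			result[resource][p.az] = q
		}
	}
	return result
}

func main() {
	combined := combine([]pq{
		{az: "az-a", quota: map[string]int64{"hw_version_hana_1_ram": 64}},
		{az: "az-b", quota: map[string]int64{"hw_version_hana_1_ram": 32}},
	})
	fmt.Println(combined["hw_version_hana_1_ram"]["az-a"]) // 64
	fmt.Println(combined["hw_version_hana_1_ram"]["az-b"]) // 32
}
```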
func (c *UsageCalculator) buildUsageResponse( vms []VMUsageInfo, vmAssignments map[string]string, flavorGroups map[string]compute.FlavorGroupFeature, allAZs []liquid.AvailabilityZone, infoVersion int64, + quotaByResourceAZ map[string]map[string]int64, + config APIConfig, ) liquid.ServiceUsageReport { // Initialize resources map for all flavor groups resources := make(map[liquid.ResourceName]*liquid.ResourceUsageReport) @@ -478,36 +503,38 @@ func (c *UsageCalculator) buildUsageResponse( } // Build ResourceUsageReport for all flavor groups (not just those with fixed ratio) - for flavorGroupName, groupData := range flavorGroups { + for flavorGroupName := range flavorGroups { // All flavor groups are included in usage reporting. // === 1. RAM Resource === ramResourceName := liquid.ResourceName(ResourceNameRAM(flavorGroupName)) ramPerAZ := make(map[liquid.AvailabilityZone]*liquid.AZResourceUsageReport) - // For AZSeparatedTopology resources (fixed-ratio groups), per-AZ Quota must be non-null. - // Use -1 ("infinite quota") as default until actual quota is read from ProjectQuota CRD. - ramHasAZQuota := groupData.HasFixedRamCoreRatio() + // Include per-AZ quota for AZSeparatedTopology resources — same condition as info.go. + ramHasAZQuota := config.ResourceConfigForGroup(flavorGroupName).RAM.HandlesCommitments for _, az := range allAZs { report := &liquid.AZResourceUsageReport{ Usage: 0, Subresources: []liquid.Subresource{}, } if ramHasAZQuota { - report.Quota = Some(int64(-1)) // infinite — will be overridden by ProjectQuota CRD + quota := int64(-1) // default: infinite + if quotaByResourceAZ != nil { + if azMap, ok := quotaByResourceAZ[string(ramResourceName)]; ok { + if q, ok := azMap[string(az)]; ok { + quota = q + } + } + } + report.Quota = Some(quota) } ramPerAZ[az] = report } if azData, exists := usageByFlavorGroupAZ[flavorGroupName]; exists { for az, data := range azData { if _, known := ramPerAZ[az]; !known { - report := &liquid.AZResourceUsageReport{} - if ramHasAZQuota { - report.Quota = Some(int64(-1)) - } - ramPerAZ[az] = report + continue // skip VMs in AZs not in allAZs } ramPerAZ[az].Usage = data.ramUsage - ramPerAZ[az].PhysicalUsage = Some(data.ramUsage) // No overcommit for RAM // Subresources are only on instances resource } } @@ -527,10 +554,9 @@ func (c *UsageCalculator) buildUsageResponse( if azData, exists := usageByFlavorGroupAZ[flavorGroupName]; exists { for az, data := range azData { if _, known := coresPerAZ[az]; !known { - coresPerAZ[az] = &liquid.AZResourceUsageReport{} + continue // skip VMs in AZs not in allAZs } coresPerAZ[az].Usage = data.coresUsage - coresPerAZ[az].PhysicalUsage = Some(data.coresUsage) // No overcommit for cores // Subresources are only on instances resource } } @@ -550,10 +576,9 @@ func (c *UsageCalculator) buildUsageResponse( if azData, exists := usageByFlavorGroupAZ[flavorGroupName]; exists { for az, data := range azData { if _, known := instancesPerAZ[az]; !known { - instancesPerAZ[az] = &liquid.AZResourceUsageReport{} + continue // skip VMs in AZs not in allAZs } instancesPerAZ[az].Usage = data.instanceCount - instancesPerAZ[az].PhysicalUsage = Some(data.instanceCount) instancesPerAZ[az].Subresources = data.subresources // VM details on instances resource } } diff --git a/internal/scheduling/reservations/commitments/usage_reconciler.go b/internal/scheduling/reservations/commitments/usage_reconciler.go index 4d09439b9..6314c53c8 100644 --- a/internal/scheduling/reservations/commitments/usage_reconciler.go +++ 
b/internal/scheduling/reservations/commitments/usage_reconciler.go @@ -278,6 +278,9 @@ func (r *UsageReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.C if err := indexCommittedResourceByProjectID(context.Background(), mcl); err != nil { return fmt.Errorf("failed to set up committed resource project index: %w", err) } + if err := indexProjectQuotaByProjectID(context.Background(), mcl); err != nil { + return fmt.Errorf("failed to set up project quota project index: %w", err) + } bldr := multicluster.BuildController(mcl, mgr) diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index f00d040dc..f374080bc 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -183,7 +183,7 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Determine if this is a spec change (new CRD or quota update) vs. a CR UsedAmount change specChanged := pq.Generation > pq.Status.ObservedGeneration - var totalUsage map[string]v1alpha1.ResourceQuotaUsage + var totalUsage map[string]map[string]int64 if specChanged { // Spec changed (new CRD or quota update) — recompute TotalUsage from Postgres logger.Info("spec changed, recomputing TotalUsage from Postgres", @@ -195,9 +195,12 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ctrl.Result{}, err } } else { - // CR UsedAmount changed — read persisted TotalUsage, only recompute PaygUsage - totalUsage = pq.Status.TotalUsage - if totalUsage == nil { + // CR UsedAmount changed — read persisted TotalUsage, only recompute PaygUsage. + // Status stores flat map[string]int64 (for this AZ only), but internal functions + // operate on map[string]map[string]int64. Reconstruct the multi-AZ view. + if pq.Status.TotalUsage != nil { + totalUsage = expandAZSlice(pq.Status.TotalUsage, pq.Spec.AvailabilityZone) + } else { // Safety fallback: TotalUsage should always be set after first spec reconcile logger.Info("no TotalUsage persisted, computing as fallback") var err error @@ -247,7 +250,7 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // computeTotalUsageForProject computes TotalUsage for a single project by reading // all VMs from Postgres and filtering to the target project. Used as bootstrap when // a ProjectQuota is first created and has no persisted TotalUsage yet. 
-func (c *QuotaController) computeTotalUsageForProject(ctx context.Context, projectID string) (map[string]v1alpha1.ResourceQuotaUsage, error) { +func (c *QuotaController) computeTotalUsageForProject(ctx context.Context, projectID string) (map[string]map[string]int64, error) { // Fetch flavor groups from Knowledge CRD flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) @@ -418,17 +421,11 @@ func (c *QuotaController) accumulateAddedVM( if !ok { return // Flavor not in any group } - fg, ok := flavorGroups[groupName] - if !ok { - return - } - - unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range - if unitSizeMiB == 0 { + if _, ok := flavorGroups[groupName]; !ok { return } - ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + ramUnits, coresAmount := vmResourceUnits(vm.Resources) delta := projectDeltas[vm.ProjectID] if delta == nil { @@ -458,7 +455,7 @@ func (c *QuotaController) isVMNewSinceLastReconcile(ctx context.Context, vm *fai } // Look up the ProjectQuota for this VM's project - crdName := "quota-" + vm.ProjectID + crdName := "quota-" + vm.ProjectID + "-" + vm.AvailabilityZone var pq v1alpha1.ProjectQuota if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { // If we can't find the ProjectQuota, skip (full reconcile will handle it) @@ -523,19 +520,13 @@ func (c *QuotaController) accumulateRemovedVM( if !ok { return // Flavor not in any group } - fg, ok := flavorGroups[groupName] - if !ok { + if _, ok := flavorGroups[groupName]; !ok { return } // Compute commitment units from the resolved flavor resources - unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range - if unitSizeMiB == 0 { - return - } - - ramUnits := int64(info.RAMMiB) / unitSizeMiB //nolint:gosec // safe - coresAmount := int64(info.VCPUs) //nolint:gosec // safe + ramUnits := int64(info.RAMMiB) / 1024 //nolint:gosec // safe + coresAmount := int64(info.VCPUs) //nolint:gosec // safe delta := projectDeltas[info.ProjectID] if delta == nil { @@ -547,8 +538,8 @@ func (c *QuotaController) accumulateRemovedVM( delta.addDecrement(commitments.ResourceNameCores(groupName), info.AvailabilityZone, coresAmount) } -// applyDeltaAndUpdateStatus fetches the ProjectQuota, applies the batched delta to TotalUsage, -// recomputes PaygUsage, and persists with conflict retry. +// applyDeltaAndUpdateStatus applies batched deltas to ALL per-AZ ProjectQuota CRDs for a project. +// It lists all per-AZ CRDs, applies relevant deltas to each, recomputes PaygUsage, and persists. 
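Since ProjectQuota CRDs are now named per AZ (`quota-<project>-<az>`), a delta touching several AZs fans out into several status updates. A sketch of that fan-out (types here are simplified stand-ins, not the controller's):

```go
package main

import "fmt"

// delta is a simplified resource -> az -> amount map, mirroring the shape of
// the controller's increments/decrements (stand-in type, not the patch's).
type delta map[string]map[string]int64

// affectedAZs collects every AZ touched by increments or decrements, which
// determines which per-AZ quota-<project>-<az> CRDs need a status update.
func affectedAZs(increments, decrements delta) map[string]bool {
	azs := make(map[string]bool)
	for _, byAZ := range increments {
		for az := range byAZ {
			azs[az] = true
		}
	}
	for _, byAZ := range decrements {
		for az := range byAZ {
			azs[az] = true
		}
	}
	return azs
}

func main() {
	inc := delta{"hw_version_gp_1_ram": {"az-a": 4}}
	dec := delta{"hw_version_gp_1_cores": {"az-b": 2}}
	for az := range affectedAZs(inc, dec) {
		// One conflict-retried update per CRD (order is nondeterministic here).
		fmt.Println("quota-project-1-" + az)
	}
}
```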
func (c *QuotaController) applyDeltaAndUpdateStatus( ctx context.Context, projectID string, @@ -557,51 +548,80 @@ func (c *QuotaController) applyDeltaAndUpdateStatus( flavorGroups map[string]compute.FlavorGroupFeature, ) error { - crdName := "quota-" + projectID - - return retry.RetryOnConflict(retry.DefaultRetry, func() error { - // Re-fetch fresh copy on each retry - var pq v1alpha1.ProjectQuota - if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { - if client.IgnoreNotFound(err) == nil { - return nil // PQ deleted, nothing to do - } - return err + // Collect all AZs affected by this delta + affectedAZs := make(map[string]bool) + for _, azAmounts := range delta.increments { + for az := range azAmounts { + affectedAZs[az] = true } - - if pq.Status.TotalUsage == nil { - pq.Status.TotalUsage = make(map[string]v1alpha1.ResourceQuotaUsage) + } + for _, azAmounts := range delta.decrements { + for az := range azAmounts { + affectedAZs[az] = true } + } + + crUsage := c.computeCRUsage(projectCRs, flavorGroups) + + for az := range affectedAZs { + crdName := "quota-" + projectID + "-" + az - // Apply increments - for resourceName, azAmounts := range delta.increments { - for az, amount := range azAmounts { - incrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + return nil // PQ for this AZ doesn't exist, skip + } + return err } - } - // Apply decrements - for resourceName, azAmounts := range delta.decrements { - for az, amount := range azAmounts { - decrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + if pq.Status.TotalUsage == nil { + pq.Status.TotalUsage = make(map[string]int64) } - } - // Recompute PaygUsage - crUsage := c.computeCRUsage(projectCRs, flavorGroups) - paygUsage := derivePaygUsage(pq.Status.TotalUsage, crUsage) + // Apply increments for this AZ + for resourceName, azAmounts := range delta.increments { + if amount, ok := azAmounts[az]; ok { + pq.Status.TotalUsage[resourceName] += amount + } + } - pq.Status.PaygUsage = paygUsage - now := metav1.Now() - pq.Status.LastReconcileAt = &now + // Apply decrements for this AZ + for resourceName, azAmounts := range delta.decrements { + if amount, ok := azAmounts[az]; ok { + pq.Status.TotalUsage[resourceName] -= amount + if pq.Status.TotalUsage[resourceName] < 0 { + pq.Status.TotalUsage[resourceName] = 0 + } + } + } + + // Derive PaygUsage for this AZ: totalUsage[resource] - crUsage[resource][az] + pq.Status.PaygUsage = make(map[string]int64) + for resourceName, totalAmount := range pq.Status.TotalUsage { + crAmount := int64(0) + if cr, ok := crUsage[resourceName]; ok { + if azAmount, ok := cr[az]; ok { + crAmount = azAmount + } + } + paygAmount := totalAmount - crAmount + if paygAmount < 0 { + paygAmount = 0 + } + pq.Status.PaygUsage[resourceName] = paygAmount + } - if err := c.Status().Update(ctx, &pq); err != nil { + now := metav1.Now() + pq.Status.LastReconcileAt = &now + return c.Status().Update(ctx, &pq) + }) + if err != nil { return err } + } - c.recordUsageMetrics(projectID, pq.Status.TotalUsage, paygUsage, crUsage) - return nil - }) + return nil } // ============================================================================ @@ -691,47 +711,42 @@ func (c *QuotaController) computeTotalUsage( vms []failover.VM, flavorToGroup map[string]string, flavorGroups 
	map[string]compute.FlavorGroupFeature,
-) map[string]map[string]v1alpha1.ResourceQuotaUsage {
+) map[string]map[string]map[string]int64 {
-	// result[projectID][resourceName] = ResourceQuotaUsage{PerAZ: {az: amount}}
-	result := make(map[string]map[string]v1alpha1.ResourceQuotaUsage)
+	// result[projectID][resourceName][az] = amount
+	result := make(map[string]map[string]map[string]int64)

 	for _, vm := range vms {
 		groupName, ok := flavorToGroup[vm.FlavorName]
 		if !ok {
 			continue // Flavor not in any tracked group
 		}
-		fg, ok := flavorGroups[groupName]
-		if !ok {
+		if _, ok := flavorGroups[groupName]; !ok {
 			continue
 		}
-		if fg.SmallestFlavor.MemoryMB == 0 {
-			continue // Invalid group config
-		}

 		ramResourceName := commitments.ResourceNameRAM(groupName)
 		coresResourceName := commitments.ResourceNameCores(groupName)

-		unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // safe
-		ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB)
+		ramUnits, coresAmount := vmResourceUnits(vm.Resources)

 		if _, ok := result[vm.ProjectID]; !ok {
-			result[vm.ProjectID] = make(map[string]v1alpha1.ResourceQuotaUsage)
+			result[vm.ProjectID] = make(map[string]map[string]int64)
 		}

 		// Accumulate RAM usage for this project + AZ
 		ramUsage := result[vm.ProjectID][ramResourceName]
-		if ramUsage.PerAZ == nil {
-			ramUsage.PerAZ = make(map[string]int64)
+		if ramUsage == nil {
+			ramUsage = make(map[string]int64)
 		}
-		ramUsage.PerAZ[vm.AvailabilityZone] += ramUnits
+		ramUsage[vm.AvailabilityZone] += ramUnits
 		result[vm.ProjectID][ramResourceName] = ramUsage

 		// Accumulate cores usage for this project + AZ
 		coresUsage := result[vm.ProjectID][coresResourceName]
-		if coresUsage.PerAZ == nil {
-			coresUsage.PerAZ = make(map[string]int64)
+		if coresUsage == nil {
+			coresUsage = make(map[string]int64)
 		}
-		coresUsage.PerAZ[vm.AvailabilityZone] += coresAmount
+		coresUsage[vm.AvailabilityZone] += coresAmount
 		result[vm.ProjectID][coresResourceName] = coresUsage
 	}

@@ -750,8 +765,8 @@ func groupCRsByProject(crs []v1alpha1.CommittedResource) map[string][]v1alpha1.C
 // computeCRUsage computes the committed resource usage from a pre-filtered slice of CRs for one project.
-// It reads UsedResources from each CR's status and converts to commitment units (multiples for RAM, raw for cores).
+// It reads UsedResources from each CR's status and converts to commitment units (GiB for RAM, raw core counts for cores).
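+// For example, a confirmed CR with UsedResources["memory"] = 5Gi in az-1 adds 5 to
+// result["hw_version_hana_v2_ram"]["az-1"] for the hana_v2 flavor group.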
-func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavorGroups map[string]compute.FlavorGroupFeature) map[string]v1alpha1.ResourceQuotaUsage { - result := make(map[string]v1alpha1.ResourceQuotaUsage) +func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavorGroups map[string]compute.FlavorGroupFeature) map[string]map[string]int64 { + result := make(map[string]map[string]int64) for i := range crs { cr := &crs[i] @@ -783,14 +798,12 @@ func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavo if !ok { continue } - // Convert bytes to commitment units (multiples of smallest flavor) + // Convert bytes to GiB (1 GiB per commitment unit) usedBytes := memQty.Value() - fg, ok := flavorGroups[spec.FlavorGroupName] - if !ok || fg.SmallestFlavor.MemoryMB == 0 { + if _, ok := flavorGroups[spec.FlavorGroupName]; !ok { continue } - unitSizeBytes := int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // safe - usedAmount = usedBytes / unitSizeBytes + usedAmount = usedBytes / (1024 * 1024 * 1024) case v1alpha1.CommittedResourceTypeCores: resourceName = commitments.ResourceNameCores(spec.FlavorGroupName) cpuQty, ok := cr.Status.UsedResources["cpu"] @@ -808,10 +821,10 @@ func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavo // Accumulate per AZ usage := result[resourceName] - if usage.PerAZ == nil { - usage.PerAZ = make(map[string]int64) + if usage == nil { + usage = make(map[string]int64) } - usage.PerAZ[spec.AvailabilityZone] += usedAmount + usage[spec.AvailabilityZone] += usedAmount result[resourceName] = usage } @@ -830,20 +843,18 @@ func (c *QuotaController) isCRStateIncluded(state v1alpha1.CommitmentStatus) boo // derivePaygUsage computes PaygUsage = TotalUsage - CRUsage (clamped >= 0). func derivePaygUsage( - totalUsage map[string]v1alpha1.ResourceQuotaUsage, - crUsage map[string]v1alpha1.ResourceQuotaUsage, -) map[string]v1alpha1.ResourceQuotaUsage { + totalUsage map[string]map[string]int64, + crUsage map[string]map[string]int64, +) map[string]map[string]int64 { - result := make(map[string]v1alpha1.ResourceQuotaUsage) + result := make(map[string]map[string]int64) for resourceName, total := range totalUsage { - payg := v1alpha1.ResourceQuotaUsage{ - PerAZ: make(map[string]int64), - } - for az, totalAmount := range total.PerAZ { + payg := make(map[string]int64) + for az, totalAmount := range total { crAmount := int64(0) if cr, ok := crUsage[resourceName]; ok { - if azAmount, ok := cr.PerAZ[az]; ok { + if azAmount, ok := cr[az]; ok { crAmount = azAmount } } @@ -851,7 +862,7 @@ func derivePaygUsage( if paygAmount < 0 { paygAmount = 0 // Clamp >= 0 } - payg.PerAZ[az] = paygAmount + payg[az] = paygAmount } result[resourceName] = payg } @@ -859,14 +870,38 @@ func derivePaygUsage( return result } +// extractAZSlice extracts the data for a single AZ from a multi-AZ usage map. +// Returns map[resourceName] = value for that AZ only. +func extractAZSlice(usage map[string]map[string]int64, az string) map[string]int64 { + result := make(map[string]int64) + for resourceName, azMap := range usage { + if val, ok := azMap[az]; ok { + result[resourceName] = val + } + } + return result +} + +// expandAZSlice reconstructs a multi-AZ map from a flat per-AZ map. +// Used when reading persisted status (flat) back into the controller's internal format. 
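+// For example, expandAZSlice(map[string]int64{"hw_version_hana_v2_ram": 4}, "az-1")
+// yields map[string]map[string]int64{"hw_version_hana_v2_ram": {"az-1": 4}}.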
+func expandAZSlice(flat map[string]int64, az string) map[string]map[string]int64 { + result := make(map[string]map[string]int64) + for resourceName, val := range flat { + result[resourceName] = map[string]int64{az: val} + } + return result +} + // updateProjectQuotaStatusWithRetry writes TotalUsage + PaygUsage + LastReconcileAt // with retry-on-conflict to handle concurrent updates. +// totalUsage and paygUsage are multi-AZ maps; this function extracts the relevant AZ +// slice based on the CRD's Spec.AvailabilityZone. // If fullReconcile is true, also updates LastFullReconcileAt and ObservedGeneration. func (c *QuotaController) updateProjectQuotaStatusWithRetry( ctx context.Context, pqName string, - totalUsage map[string]v1alpha1.ResourceQuotaUsage, - paygUsage map[string]v1alpha1.ResourceQuotaUsage, + totalUsage map[string]map[string]int64, + paygUsage map[string]map[string]int64, fullReconcile bool, ) error { @@ -877,8 +912,10 @@ func (c *QuotaController) updateProjectQuotaStatusWithRetry( return err } - pq.Status.TotalUsage = totalUsage - pq.Status.PaygUsage = paygUsage + // Extract only this AZ's data from the multi-AZ maps + az := pq.Spec.AvailabilityZone + pq.Status.TotalUsage = extractAZSlice(totalUsage, az) + pq.Status.PaygUsage = extractAZSlice(paygUsage, az) pq.Status.ObservedGeneration = pq.Generation now := metav1.Now() pq.Status.LastReconcileAt = &now @@ -889,16 +926,14 @@ func (c *QuotaController) updateProjectQuotaStatusWithRetry( }) } -// vmResourceUnits computes RAM commitment units and cores from a VM's resources. -// RAM is converted from bytes (resource.Quantity) to MiB, then divided by unitSizeMiB -// (the smallest flavor's memory in MiB for the flavor group) to get commitment units. -func vmResourceUnits(resources map[string]resource.Quantity, unitSizeMiB int64) (ramUnits, cores int64) { +// vmResourceUnits computes RAM commitment units (GiB) and cores from a VM's resources. +func vmResourceUnits(resources map[string]resource.Quantity) (ramGiB, cores int64) { memQty := resources["memory"] serverRAMMiB := memQty.Value() / (1024 * 1024) // bytes to MiB - ramUnits = serverRAMMiB / unitSizeMiB // commitment units + ramGiB = serverRAMMiB / 1024 // MiB to GiB (1 GiB per unit) vcpuQty := resources["vcpus"] cores = vcpuQty.Value() - return ramUnits, cores + return ramGiB, cores } // buildFlavorToGroupMap builds a flavorName → flavorGroupName lookup from flavor groups. @@ -913,24 +948,24 @@ func buildFlavorToGroupMap(flavorGroups map[string]compute.FlavorGroupFeature) m } // incrementUsage increments a usage value in the map. -func incrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { +func incrementUsage(usage map[string]map[string]int64, resourceName, az string, amount int64) { u := usage[resourceName] - if u.PerAZ == nil { - u.PerAZ = make(map[string]int64) + if u == nil { + u = make(map[string]int64) } - u.PerAZ[az] += amount + u[az] += amount usage[resourceName] = u } // decrementUsage decrements a usage value in the map (clamp >= 0). 
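+// For example, decrementUsage(usage, "res1", "az-1", 100) on a current value of 6
+// leaves usage["res1"]["az-1"] at 0 rather than going negative.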
-func decrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) {
+func decrementUsage(usage map[string]map[string]int64, resourceName, az string, amount int64) {
 	u := usage[resourceName]
-	if u.PerAZ == nil {
+	if u == nil {
 		return
 	}
-	u.PerAZ[az] -= amount
-	if u.PerAZ[az] < 0 {
-		u.PerAZ[az] = 0
+	u[az] -= amount
+	if u[az] < 0 {
+		u[az] = 0
 	}
 	usage[resourceName] = u
 }
@@ -938,20 +973,20 @@ func decrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName,
 // recordUsageMetrics emits Prometheus metrics for all resources in a project.
 func (c *QuotaController) recordUsageMetrics(
 	projectID string,
-	totalUsage map[string]v1alpha1.ResourceQuotaUsage,
-	paygUsage map[string]v1alpha1.ResourceQuotaUsage,
-	crUsage map[string]v1alpha1.ResourceQuotaUsage,
+	totalUsage map[string]map[string]int64,
+	paygUsage map[string]map[string]int64,
+	crUsage map[string]map[string]int64,
 ) {
 	for resourceName, total := range totalUsage {
-		for az, totalAmount := range total.PerAZ {
+		for az, totalAmount := range total {
 			paygAmount := int64(0)
 			if payg, ok := paygUsage[resourceName]; ok {
-				paygAmount = payg.PerAZ[az]
+				paygAmount = payg[az]
 			}
 			crAmount := int64(0)
 			if cr, ok := crUsage[resourceName]; ok {
-				crAmount = cr.PerAZ[az]
+				crAmount = cr[az]
 			}
 			c.Metrics.RecordUsage(projectID, az, resourceName, totalAmount, paygAmount, crAmount)
 		}
@@ -968,17 +1003,17 @@ func (c *QuotaController) mapCRToProjectQuota(_ context.Context, obj client.Obje
 	if !ok {
 		return nil
 	}
-	// Map to the ProjectQuota for this project
-	crdName := "quota-" + cr.Spec.ProjectID
+	// Map to the per-AZ ProjectQuota for this project + AZ
+	crdName := "quota-" + cr.Spec.ProjectID + "-" + cr.Spec.AvailabilityZone
 	return []reconcile.Request{
 		{NamespacedName: client.ObjectKey{Name: crdName}},
 	}
 }

-// crUsedResourcesChangePredicate triggers only when Status.UsedResources changes on a CommittedResource.
+// crUsedAmountChangePredicate triggers on create, delete, and UsedResources changes of a CommittedResource.
func crUsedAmountChangePredicate() predicate.Predicate { return predicate.Funcs{ - CreateFunc: func(_ event.CreateEvent) bool { return false }, + CreateFunc: func(_ event.CreateEvent) bool { return true }, UpdateFunc: func(e event.UpdateEvent) bool { oldCR, ok1 := e.ObjectOld.(*v1alpha1.CommittedResource) newCR, ok2 := e.ObjectNew.(*v1alpha1.CommittedResource) diff --git a/internal/scheduling/reservations/quota/controller_test.go b/internal/scheduling/reservations/quota/controller_test.go index d503b363f..b5b724647 100644 --- a/internal/scheduling/reservations/quota/controller_test.go +++ b/internal/scheduling/reservations/quota/controller_test.go @@ -93,39 +93,39 @@ func TestComputeTotalUsage(t *testing.T) { result := ctrl.computeTotalUsage(vms, flavorToGroup, flavorGroups) - // project-a: hana_v2 in az-1: 32768+65536 = 98304 MiB / 32768 = 3 units RAM, 8+16=24 cores - // project-a: hana_v2 in az-2: 32768 MiB / 32768 = 1 unit RAM, 8 cores + // project-a: hana_v2 in az-1: (32768+65536)/1024 = 96 GiB RAM, 8+16=24 cores + // project-a: hana_v2 in az-2: 32768/1024 = 32 GiB RAM, 8 cores projectA := result["project-a"] if projectA == nil { t.Fatal("expected project-a in results") } ramUsage := projectA["hw_version_hana_v2_ram"] - if ramUsage.PerAZ["az-1"] != 3 { - t.Errorf("expected project-a az-1 hana_v2_ram = 3, got %d", ramUsage.PerAZ["az-1"]) + if ramUsage["az-1"] != 96 { + t.Errorf("expected project-a az-1 hana_v2_ram = 96, got %d", ramUsage["az-1"]) } - if ramUsage.PerAZ["az-2"] != 1 { - t.Errorf("expected project-a az-2 hana_v2_ram = 1, got %d", ramUsage.PerAZ["az-2"]) + if ramUsage["az-2"] != 32 { + t.Errorf("expected project-a az-2 hana_v2_ram = 32, got %d", ramUsage["az-2"]) } coresUsage := projectA["hw_version_hana_v2_cores"] - if coresUsage.PerAZ["az-1"] != 24 { - t.Errorf("expected project-a az-1 hana_v2_cores = 24, got %d", coresUsage.PerAZ["az-1"]) + if coresUsage["az-1"] != 24 { + t.Errorf("expected project-a az-1 hana_v2_cores = 24, got %d", coresUsage["az-1"]) } - if coresUsage.PerAZ["az-2"] != 8 { - t.Errorf("expected project-a az-2 hana_v2_cores = 8, got %d", coresUsage.PerAZ["az-2"]) + if coresUsage["az-2"] != 8 { + t.Errorf("expected project-a az-2 hana_v2_cores = 8, got %d", coresUsage["az-2"]) } - // project-b: general in az-1: 4096/4096=1 unit RAM, 2 cores + // project-b: general in az-1: 4096/1024 = 4 GiB RAM, 2 cores projectB := result["project-b"] if projectB == nil { t.Fatal("expected project-b in results") } - if projectB["hw_version_general_ram"].PerAZ["az-1"] != 1 { - t.Errorf("expected project-b az-1 general_ram = 1, got %d", projectB["hw_version_general_ram"].PerAZ["az-1"]) + if projectB["hw_version_general_ram"]["az-1"] != 4 { + t.Errorf("expected project-b az-1 general_ram = 4, got %d", projectB["hw_version_general_ram"]["az-1"]) } - if projectB["hw_version_general_cores"].PerAZ["az-1"] != 2 { - t.Errorf("expected project-b az-1 general_cores = 2, got %d", projectB["hw_version_general_cores"].PerAZ["az-1"]) + if projectB["hw_version_general_cores"]["az-1"] != 2 { + t.Errorf("expected project-b az-1 general_cores = 2, got %d", projectB["hw_version_general_cores"]["az-1"]) } // project-c: unknown flavor → not in results @@ -137,12 +137,11 @@ func TestComputeTotalUsage(t *testing.T) { func TestComputeCRUsage(t *testing.T) { ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} - // Flavor groups with SmallestFlavor.MemoryMB = 1 for simple unit conversion in tests - // (1 multiple = 1 MiB = 1048576 bytes) + // Flavor groups — MemoryMB value is no longer 
used for unit conversion (now fixed at 1 GiB). testFlavorGroups := map[string]compute.FlavorGroupFeature{ "hana_v2": { - SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 1}, - Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 1}}, + SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 1024}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 1024}}, }, } @@ -156,7 +155,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusConfirmed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Mi")}, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Gi")}, }, }, { @@ -168,7 +167,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusGuaranteed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("3Mi")}, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("3Gi")}, }, }, { @@ -193,7 +192,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusConfirmed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Mi")}, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Gi")}, }, }, // Pending state — should be excluded by state filter @@ -206,7 +205,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusPending, }, Status: v1alpha1.CommittedResourceStatus{ - UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("2Mi")}, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("2Gi")}, }, }, } @@ -217,35 +216,35 @@ func TestComputeCRUsage(t *testing.T) { // Should include confirmed + guaranteed for project-a only ramUsage := result["hw_version_hana_v2_ram"] - if ramUsage.PerAZ["az-1"] != 8 { // 5 + 3 - t.Errorf("expected cr ram usage az-1 = 8, got %d", ramUsage.PerAZ["az-1"]) + if ramUsage["az-1"] != 8 { // 5 + 3 + t.Errorf("expected cr ram usage az-1 = 8, got %d", ramUsage["az-1"]) } coresUsage := result["hw_version_hana_v2_cores"] - if coresUsage.PerAZ["az-1"] != 2 { - t.Errorf("expected cr cores usage az-1 = 2, got %d", coresUsage.PerAZ["az-1"]) + if coresUsage["az-1"] != 2 { + t.Errorf("expected cr cores usage az-1 = 2, got %d", coresUsage["az-1"]) } - // az-2 should NOT be included (pending state) - if ramUsage.PerAZ["az-2"] != 0 { - t.Errorf("expected cr ram usage az-2 = 0 (pending excluded), got %d", ramUsage.PerAZ["az-2"]) + // az-2 should NOT be included (pending state) — assert key absence, not zero value + if got, exists := ramUsage["az-2"]; exists { + t.Errorf("expected cr ram usage az-2 to be absent (pending excluded), got %d", got) } } func TestDerivePaygUsage(t *testing.T) { tests := []struct { name string - totalUsage map[string]v1alpha1.ResourceQuotaUsage - crUsage map[string]v1alpha1.ResourceQuotaUsage + totalUsage map[string]map[string]int64 + crUsage map[string]map[string]int64 expected map[string]map[string]int64 // resourceName -> az -> amount }{ { name: "basic subtraction", - totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10, "az-2": 5}}, + totalUsage: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 10, "az-2": 5}, }, - crUsage: map[string]v1alpha1.ResourceQuotaUsage{ - "hw_version_hana_v2_ram": {PerAZ: 
map[string]int64{"az-1": 3}}, + crUsage: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 3}, }, expected: map[string]map[string]int64{ "hw_version_hana_v2_ram": {"az-1": 7, "az-2": 5}, @@ -253,11 +252,11 @@ func TestDerivePaygUsage(t *testing.T) { }, { name: "clamp to zero", - totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2}}, + totalUsage: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 2}, }, - crUsage: map[string]v1alpha1.ResourceQuotaUsage{ - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10}}, + crUsage: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 10}, }, expected: map[string]map[string]int64{ "hw_version_hana_v2_ram": {"az-1": 0}, @@ -265,19 +264,19 @@ func TestDerivePaygUsage(t *testing.T) { }, { name: "no CR usage", - totalUsage: map[string]v1alpha1.ResourceQuotaUsage{ - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}}, + totalUsage: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 5}, }, - crUsage: map[string]v1alpha1.ResourceQuotaUsage{}, + crUsage: map[string]map[string]int64{}, expected: map[string]map[string]int64{ "hw_version_hana_v2_ram": {"az-1": 5}, }, }, { name: "empty total usage", - totalUsage: map[string]v1alpha1.ResourceQuotaUsage{}, - crUsage: map[string]v1alpha1.ResourceQuotaUsage{ - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}}, + totalUsage: map[string]map[string]int64{}, + crUsage: map[string]map[string]int64{ + "hw_version_hana_v2_ram": {"az-1": 5}, }, expected: map[string]map[string]int64{}, }, @@ -294,9 +293,9 @@ func TestDerivePaygUsage(t *testing.T) { continue } for az, expectedAmount := range expectedAZ { - if resUsage.PerAZ[az] != expectedAmount { + if resUsage[az] != expectedAmount { t.Errorf("resource=%s az=%s: expected %d, got %d", - resourceName, az, expectedAmount, resUsage.PerAZ[az]) + resourceName, az, expectedAmount, resUsage[az]) } } } @@ -343,37 +342,37 @@ func TestBuildFlavorToGroupMap(t *testing.T) { } func TestIncrementDecrementUsage(t *testing.T) { - usage := make(map[string]v1alpha1.ResourceQuotaUsage) + usage := make(map[string]map[string]int64) // Increment from empty incrementUsage(usage, "res1", "az-1", 5) - if usage["res1"].PerAZ["az-1"] != 5 { - t.Errorf("expected 5 after increment, got %d", usage["res1"].PerAZ["az-1"]) + if usage["res1"]["az-1"] != 5 { + t.Errorf("expected 5 after increment, got %d", usage["res1"]["az-1"]) } // Increment again incrementUsage(usage, "res1", "az-1", 3) - if usage["res1"].PerAZ["az-1"] != 8 { - t.Errorf("expected 8 after second increment, got %d", usage["res1"].PerAZ["az-1"]) + if usage["res1"]["az-1"] != 8 { + t.Errorf("expected 8 after second increment, got %d", usage["res1"]["az-1"]) } // Decrement decrementUsage(usage, "res1", "az-1", 2) - if usage["res1"].PerAZ["az-1"] != 6 { - t.Errorf("expected 6 after decrement, got %d", usage["res1"].PerAZ["az-1"]) + if usage["res1"]["az-1"] != 6 { + t.Errorf("expected 6 after decrement, got %d", usage["res1"]["az-1"]) } // Decrement below zero → clamp to 0 decrementUsage(usage, "res1", "az-1", 100) - if usage["res1"].PerAZ["az-1"] != 0 { - t.Errorf("expected 0 after over-decrement, got %d", usage["res1"].PerAZ["az-1"]) + if usage["res1"]["az-1"] != 0 { + t.Errorf("expected 0 after over-decrement, got %d", usage["res1"]["az-1"]) } // Decrement non-existent resource (no-op) decrementUsage(usage, "res2", "az-1", 5) // Should not panic, and res2 should not exist if _, exists := 
usage["res2"]; exists { - if usage["res2"].PerAZ != nil { + if usage["res2"] != nil { t.Error("expected res2 to not have PerAZ after decrement on non-existent") } } @@ -503,8 +502,8 @@ func TestAccumulateAddedVM_KnownFlavor(t *testing.T) { } pq := &v1alpha1.ProjectQuota{ - ObjectMeta: metav1.ObjectMeta{Name: "quota-project-a"}, - Spec: v1alpha1.ProjectQuotaSpec{ProjectID: "project-a"}, + ObjectMeta: metav1.ObjectMeta{Name: "quota-project-a-az-1"}, + Spec: v1alpha1.ProjectQuotaSpec{ProjectID: "project-a", AvailabilityZone: "az-1"}, Status: v1alpha1.ProjectQuotaStatus{ LastReconcileAt: &lastReconcile, LastFullReconcileAt: &lastReconcile, @@ -554,9 +553,9 @@ func TestAccumulateAddedVM_KnownFlavor(t *testing.T) { t.Fatal("expected delta for project-a") } - // 32768 MiB / 32768 = 1 unit RAM - if delta.increments["hw_version_hana_v2_ram"]["az-1"] != 1 { - t.Errorf("expected ram increment = 1, got %d", delta.increments["hw_version_hana_v2_ram"]["az-1"]) + // 32768 MiB / 1024 = 32 GiB + if delta.increments["hw_version_hana_v2_ram"]["az-1"] != 32 { + t.Errorf("expected ram increment = 32, got %d", delta.increments["hw_version_hana_v2_ram"]["az-1"]) } if delta.increments["hw_version_hana_v2_cores"]["az-1"] != 8 { t.Errorf("expected cores increment = 8, got %d", delta.increments["hw_version_hana_v2_cores"]["az-1"]) diff --git a/internal/scheduling/reservations/quota/integration_test.go b/internal/scheduling/reservations/quota/integration_test.go index 740341d9a..dbe174f69 100644 --- a/internal/scheduling/reservations/quota/integration_test.go +++ b/internal/scheduling/reservations/quota/integration_test.go @@ -34,39 +34,40 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), - makePQ("project-b", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), + makePQPerAZ("project-b", "az-1", nil), }, Actions: []TestAction{ { Type: "full_reconcile", - // project-a: hana_v2 az-1: (32768+65536)/32768 = 3 RAM units, 8+16=24 cores - // project-a: hana_v2 az-2: 32768/32768 = 1 RAM unit, 8 cores - // project-a: general az-1: 4096/4096 = 1 RAM unit, 2 cores - // project-b: general az-1: 4096/4096 = 1 RAM unit, 2 cores - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + // project-a: hana_v2 az-1: (32768+65536)/1024 = 96 GiB, 8+16=24 cores + // project-a: hana_v2 az-2: 32768/1024 = 32 GiB, 8 cores + // project-a: general az-1: 4096/1024 = 4 GiB, 2 cores + // project-b: general az-1: 4096/1024 = 4 GiB, 2 cores + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, // No CRs -> PaygUsage == TotalUsage - ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedPaygUsage: 
map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -77,7 +78,8 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, CommittedResources: []*v1alpha1.CommittedResource{ // 2 units of hana_v2 RAM committed in az-1 for project-a @@ -90,24 +92,24 @@ func TestIntegration(t *testing.T) { Actions: []TestAction{ { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, // PaygUsage = TotalUsage - CRUsage - // hana_v2 RAM: 3-2=1 in az-1, 1-0=1 in az-2 + // hana_v2 RAM: 96-2=94 in az-1, 32-0=32 in az-2 // hana_v2 Cores: 24-10=14 in az-1, 8-0=8 in az-2 // general: no CRs so PaygUsage == TotalUsage - ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedPaygUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 1, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 14, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 94, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 14, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -118,18 +120,19 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, Actions: []TestAction{ // Step 1: full reconcile to establish baseline { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: 
map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -151,19 +154,19 @@ func TestIntegration(t *testing.T) { ProjectID: "project-a", AvailabilityZone: "az-1", CreatedAt: "2099-01-01T00:00:00Z", // far future, always AFTER last reconcile Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "memory": resource.MustParse("34359738368"), // 32768 MiB = 32 GiB "vcpus": resource.MustParse("8"), }, }, ), // vm-new is created AFTER last reconcile, so it gets incremented - // +1 RAM unit (32768/32768), +8 cores - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + // +32 GiB RAM (32768/1024), +8 cores + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 128, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 32, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -174,18 +177,19 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, Actions: []TestAction{ // Step 1: full reconcile { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -197,12 +201,12 @@ func TestIntegration(t *testing.T) { activeInstance("vm-1"), // migrated here, created before reconcile }), // Should NOT increment -- vm-1 CreatedAt is 2025-12-01 which is before reconcile time - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -226,18 +230,19 @@ func TestIntegration(t *testing.T) { "vm-del": false, // not active (truly deleted) }, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, Actions: 
[]TestAction{ // Step 1: full reconcile (vm-del not in VMs so not counted) { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -255,13 +260,13 @@ func TestIntegration(t *testing.T) { // vm-del gone }), // vm-del: IsServerActive=false, deleted info found - // Decrement: -1 RAM unit, -8 cores in az-1 - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + // Decrement: -32 GiB RAM, -8 cores in az-1 + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 16, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 64, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 16, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -275,17 +280,18 @@ func TestIntegration(t *testing.T) { "vm-1": true, // still active (migrated to another HV) }, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, Actions: []TestAction{ { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -301,12 +307,12 @@ func TestIntegration(t *testing.T) { // vm-1 gone from this HV }), // vm-1: IsServerActive=true, so NOT decremented - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -317,7 +323,8 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", 
"az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, CommittedResources: []*v1alpha1.CommittedResource{ makeCR("cr-ram-1", "project-a", "hana_v2", "az-1", @@ -327,12 +334,12 @@ func TestIntegration(t *testing.T) { // Step 1: full reconcile with initial CR (UsedAmount=1) { Type: "full_reconcile", - ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedPaygUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, // 3-1=2 - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 95, "az-2": 32}, // 96-1=95 + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -341,12 +348,12 @@ func TestIntegration(t *testing.T) { Type: "cr_update", CRName: "cr-ram-1", UsedAmount: 3, - ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedPaygUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 0, "az-2": 1}}, // 3-3=0 - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 93, "az-2": 32}, // 96-3=93 + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -366,16 +373,16 @@ func TestIntegration(t *testing.T) { }, }, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-x", nil), + makePQPerAZ("project-x", "az-1", nil), }, Actions: []TestAction{ { Type: "full_reconcile", // No usage for project-x (unknown flavor skipped) - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-x": {}, }, - ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedPaygUsage: map[string]map[string]map[string]int64{ "project-x": {}, }, }, @@ -386,38 +393,39 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), - makePQ("project-b", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), + makePQPerAZ("project-b", "az-1", nil), }, Actions: []TestAction{ { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_general_ram": {"az-1": 4}, + 
"hw_version_general_cores": {"az-1": 2}, }, }, }, // Second full reconcile - same result { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -428,7 +436,8 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, CommittedResources: []*v1alpha1.CommittedResource{ // Pending CR should NOT reduce PaygUsage @@ -439,12 +448,12 @@ func TestIntegration(t *testing.T) { { Type: "full_reconcile", // PaygUsage == TotalUsage because pending CRs are excluded - ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedPaygUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -455,18 +464,19 @@ func TestIntegration(t *testing.T) { FlavorGroups: testFlavorGroups, VMs: testVMs, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), }, Actions: []TestAction{ // Step 1: full reconcile establishes correct baseline { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -490,18 +500,18 @@ func TestIntegration(t *testing.T) { ProjectID: "project-a", AvailabilityZone: "az-1", CreatedAt: "2099-01-01T00:00:00Z", // after last reconcile Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "memory": resource.MustParse("34359738368"), // 32768 MiB = 32 GiB "vcpus": resource.MustParse("8"), }, }, ), // TotalUsage now has 
phantom's contribution (drift) - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // 3+1 drift - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // 24+8 drift - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 128, "az-2": 32}, // 96+32 drift + "hw_version_hana_v2_cores": {"az-1": 32, "az-2": 8}, // 24+8 drift + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -510,12 +520,12 @@ func TestIntegration(t *testing.T) { { Type: "full_reconcile", OverrideVMs: baseVMsPtr(), - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // corrected - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // corrected - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, // corrected + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, // corrected + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -539,30 +549,31 @@ func TestIntegration(t *testing.T) { "vm-1": true, // still active (for migration scenario) }, ProjectQuotas: []*v1alpha1.ProjectQuota{ - makePQ("project-a", nil), - makePQ("project-b", nil), + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), + makePQPerAZ("project-b", "az-1", nil), }, Actions: []TestAction{ // Step 1: full reconcile establishes baseline for both projects - // project-a hana_v2: az-1=3 RAM / 24 cores, az-2=1 RAM / 8 cores; general: az-1=1 RAM / 2 cores - // project-b general: az-1=1 RAM / 2 cores + // project-a hana_v2: az-1=96 GiB / 24 cores, az-2=32 GiB / 8 cores; general: az-1=4 GiB / 2 cores + // project-b general: az-1=4 GiB / 2 cores { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, // Step 2: HV diff adds a genuine new VM to project-a (hana_v2 small, az-1) - // +1 RAM unit, +8 cores + // +32 GiB RAM, +8 cores { Type: "hv_diff", OldHV: makeHV("hv-1", []hv1.Instance{ @@ -580,17 +591,17 @@ func TestIntegration(t *testing.T) { ProjectID: "project-a", AvailabilityZone: "az-1", CreatedAt: "2099-01-01T00:00:00Z", Resources: map[string]resource.Quantity{ - "memory": 
resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "memory": resource.MustParse("34359738368"), // 32768 MiB = 32 GiB "vcpus": resource.MustParse("8"), }, }, ), - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 128, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 32, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -611,20 +622,20 @@ func TestIntegration(t *testing.T) { ProjectID: "project-b", AvailabilityZone: "az-1", CreatedAt: "2099-01-01T00:00:00Z", Resources: map[string]resource.Quantity{ - "memory": resource.MustParse("4294967296"), // 4096 MiB = 1 RAM unit + "memory": resource.MustParse("4294967296"), // 4096 MiB = 4 GiB "vcpus": resource.MustParse("2"), }, }, ), - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 2}}, // 1+1 drift - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 4}}, // 2+2 drift + "hw_version_general_ram": {"az-1": 8}, // 4+4 drift + "hw_version_general_cores": {"az-1": 4}, // 2+2 drift }, }, }, // Step 4: HV diff removes vm-del from project-a (truly deleted) - // -1 RAM unit, -8 cores in az-1 + // -32 GiB RAM, -8 cores in az-1 { Type: "hv_diff", OldHV: makeHV("hv-1", []hv1.Instance{ @@ -650,12 +661,12 @@ func TestIntegration(t *testing.T) { }, }, ), - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // 4-1=3 - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // 32-8=24 - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, // 128-32=96 + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, // 32-8=24 + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -679,16 +690,16 @@ func TestIntegration(t *testing.T) { }, }, }, - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // corrected up - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // corrected up - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 128, "az-2": 32}, // corrected up + "hw_version_hana_v2_cores": {"az-1": 32, "az-2": 8}, // corrected up + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, // corrected down - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, // corrected down + "hw_version_general_ram": {"az-1": 4}, 
// corrected down + "hw_version_general_cores": {"az-1": 2}, // corrected down }, }, }, @@ -717,12 +728,12 @@ func TestIntegration(t *testing.T) { }, ), // vm-1 migrated, NOT decremented -- totals unchanged - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 128, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 32, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -730,16 +741,100 @@ func TestIntegration(t *testing.T) { // This is the "reconcile that matches the deltas" -- nothing to fix. { Type: "full_reconcile", - ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + ExpectedTotalUsage: map[string]map[string]map[string]int64{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, - "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_hana_v2_ram": {"az-1": 128, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 32, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, "project-b": { - "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, - "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, + }, + }, + }, + }, + }, + { + Name: "partial AZ coverage - only az-1 has CRD, az-2 VMs are ignored", + FlavorGroups: testFlavorGroups, + VMs: testVMs, // project-a has VMs in az-1 AND az-2 + ProjectQuotas: []*v1alpha1.ProjectQuota{ + // Only az-1 CRD exists — az-2 has VMs but no CRD + makePQPerAZ("project-a", "az-1", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // Only az-1 data should be written (az-2 CRD doesn't exist) + ExpectedTotalUsage: map[string]map[string]map[string]int64{ + "project-a": { + "hw_version_hana_v2_ram": {"az-1": 96}, + "hw_version_hana_v2_cores": {"az-1": 24}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, + }, + }, + ExpectedPaygUsage: map[string]map[string]map[string]int64{ + "project-a": { + "hw_version_hana_v2_ram": {"az-1": 96}, + "hw_version_hana_v2_cores": {"az-1": 24}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, + }, + }, + }, + }, + }, + { + Name: "total calculation - multi-resource multi-AZ verified", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQPerAZ("project-a", "az-1", nil), + makePQPerAZ("project-a", "az-2", nil), + makePQPerAZ("project-b", "az-1", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // 5 GiB hana_v2 RAM committed in az-1, 3 GiB in az-2 + makeCR("cr-ram-az1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(5)), + makeCR("cr-ram-az2", "project-a", "hana_v2", "az-2", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(3)), + // 4 
cores committed in az-1 + makeCR("cr-cores-az1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeCores, v1alpha1.CommitmentStatusConfirmed, int64Ptr(4)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // Verify TotalUsage is correctly computed from VMs + ExpectedTotalUsage: map[string]map[string]map[string]int64{ + "project-a": { + "hw_version_hana_v2_ram": {"az-1": 96, "az-2": 32}, + "hw_version_hana_v2_cores": {"az-1": 24, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, + }, + "project-b": { + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, + }, + }, + // Verify PaygUsage = TotalUsage - CRUsage per AZ + // az-1: hana_v2_ram: 96-5=91, hana_v2_cores: 24-4=20 + // az-2: hana_v2_ram: 32-3=29, hana_v2_cores: 8-0=8 + ExpectedPaygUsage: map[string]map[string]map[string]int64{ + "project-a": { + "hw_version_hana_v2_ram": {"az-1": 91, "az-2": 29}, + "hw_version_hana_v2_cores": {"az-1": 20, "az-2": 8}, + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, + }, + "project-b": { + "hw_version_general_ram": {"az-1": 4}, + "hw_version_general_cores": {"az-1": 2}, }, }, }, @@ -789,7 +884,7 @@ var testFlavorGroups = map[string]compute.FlavorGroupFeature{ // project-a has VMs in BOTH flavor groups (hana_v2 and general). // project-b has only general VMs. var testVMs = []failover.VM{ - // vm-1: hana_v2, 1 RAM unit (32768/32768), 8 cores + // vm-1: hana_v2, 32 GiB RAM (32768/1024), 8 cores { UUID: "vm-1", FlavorName: "m1.hana_v2.small", ProjectID: "project-a", AvailabilityZone: "az-1", @@ -799,7 +894,7 @@ var testVMs = []failover.VM{ "vcpus": resource.MustParse("8"), }, }, - // vm-2: hana_v2, 2 RAM units (65536/32768), 16 cores + // vm-2: hana_v2, 64 GiB RAM (65536/1024), 16 cores { UUID: "vm-2", FlavorName: "m1.hana_v2.large", ProjectID: "project-a", AvailabilityZone: "az-1", @@ -809,7 +904,7 @@ var testVMs = []failover.VM{ "vcpus": resource.MustParse("16"), }, }, - // vm-3: hana_v2, 1 RAM unit (32768/32768), 8 cores + // vm-3: hana_v2, 32 GiB RAM (32768/1024), 8 cores { UUID: "vm-3", FlavorName: "m1.hana_v2.small", ProjectID: "project-a", AvailabilityZone: "az-2", @@ -819,7 +914,7 @@ var testVMs = []failover.VM{ "vcpus": resource.MustParse("8"), }, }, - // vm-4: general, 1 RAM unit (4096/4096), 2 cores + // vm-4: general, 4 GiB RAM (4096/1024), 2 cores { UUID: "vm-4", FlavorName: "m1.general.small", ProjectID: "project-a", AvailabilityZone: "az-1", @@ -829,7 +924,7 @@ var testVMs = []failover.VM{ "vcpus": resource.MustParse("2"), }, }, - // vm-5: general, 1 RAM unit (4096/4096), 2 cores + // vm-5: general, 4 GiB RAM (4096/1024), 2 cores { UUID: "vm-5", FlavorName: "m1.general.small", ProjectID: "project-b", AvailabilityZone: "az-1", @@ -869,8 +964,8 @@ type TestAction struct { // Optional: verify state AFTER this action completes. // Keys are project IDs. If nil, no verification for this step. - ExpectedTotalUsage map[string]map[string]v1alpha1.ResourceQuotaUsage - ExpectedPaygUsage map[string]map[string]v1alpha1.ResourceQuotaUsage + ExpectedTotalUsage map[string]map[string]map[string]int64 + ExpectedPaygUsage map[string]map[string]map[string]int64 } // IntegrationTestCase defines a complete integration test scenario. 
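All of the expected values in these fixtures reduce to the same fixed-unit arithmetic: quantity bytes to MiB, then MiB divided by 1024 to GiB, with one GiB per RAM commitment unit and raw vCPU counts for cores. A standalone sketch of that conversion, mirroring vmResourceUnits above (the `package main` wrapper is illustrative only, not part of the change):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// vm-1's memory from the test fixtures: 34359738368 bytes.
	memQty := resource.MustParse("34359738368")
	serverRAMMiB := memQty.Value() / (1024 * 1024) // bytes to MiB: 32768
	ramGiB := serverRAMMiB / 1024                  // MiB to GiB: 32 commitment units
	fmt.Println(serverRAMMiB, ramGiB)              // prints: 32768 32
}
```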
@@ -982,69 +1077,123 @@ func newIntegrationTestEnv(t *testing.T, tc IntegrationTestCase) *integrationTes } } -func (env *integrationTestEnv) verifyTotalUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { +func (env *integrationTestEnv) verifyTotalUsage(projectID string, expected map[string]map[string]int64) { env.t.Helper() - crdName := "quota-" + projectID - var pq v1alpha1.ProjectQuota - if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { - env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + + if expected == nil { + return } - if expected == nil && pq.Status.TotalUsage == nil { - return // both nil, ok + // Collect expected data per AZ: az → resourceName → value + perAZ := make(map[string]map[string]int64) + for resourceName, azMap := range expected { + for az, val := range azMap { + if perAZ[az] == nil { + perAZ[az] = make(map[string]int64) + } + perAZ[az][resourceName] = val + } } - for resourceName, expectedUsage := range expected { - actual, ok := pq.Status.TotalUsage[resourceName] - if !ok { - env.t.Errorf("project %s: expected TotalUsage resource %q not found", projectID, resourceName) - continue + for az, expectedResources := range perAZ { + crdName := "quota-" + projectID + "-" + az + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) } - for az, expectedAmount := range expectedUsage.PerAZ { - if actual.PerAZ[az] != expectedAmount { + + for resourceName, expectedAmount := range expectedResources { + actual, ok := pq.Status.TotalUsage[resourceName] + if !ok { + env.t.Errorf("project %s AZ %s: expected TotalUsage resource %q not found", projectID, az, resourceName) + continue + } + if actual != expectedAmount { env.t.Errorf("project %s: TotalUsage[%s][%s] = %d, want %d", - projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + projectID, resourceName, az, actual, expectedAmount) + } + } + + // Check no unexpected resources + for resourceName := range pq.Status.TotalUsage { + if _, ok := expectedResources[resourceName]; !ok { + env.t.Errorf("project %s AZ %s: unexpected TotalUsage resource %q", projectID, az, resourceName) } } } - // Check no unexpected resources - for resourceName := range pq.Status.TotalUsage { - if _, ok := expected[resourceName]; !ok { - env.t.Errorf("project %s: unexpected TotalUsage resource %q", projectID, resourceName) + // Ensure no unexpected AZ CRDs carry TotalUsage for this project. 
+ var allPQ v1alpha1.ProjectQuotaList + if err := env.client.List(context.Background(), &allPQ); err != nil { + env.t.Fatalf("failed to list ProjectQuota objects: %v", err) + } + for _, pq := range allPQ.Items { + if pq.Spec.ProjectID != projectID { + continue + } + az := pq.Spec.AvailabilityZone + if _, ok := perAZ[az]; !ok && len(pq.Status.TotalUsage) > 0 { + env.t.Errorf("project %s AZ %s: unexpected TotalUsage in non-expected AZ CRD", projectID, az) } } } -func (env *integrationTestEnv) verifyPaygUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { +func (env *integrationTestEnv) verifyPaygUsage(projectID string, expected map[string]map[string]int64) { env.t.Helper() - crdName := "quota-" + projectID - var pq v1alpha1.ProjectQuota - if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { - env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) - } - if expected == nil && pq.Status.PaygUsage == nil { + if expected == nil { return } - for resourceName, expectedUsage := range expected { - actual, ok := pq.Status.PaygUsage[resourceName] - if !ok { - env.t.Errorf("project %s: expected PaygUsage resource %q not found", projectID, resourceName) - continue + // Collect expected data per AZ: az → resourceName → value + perAZ := make(map[string]map[string]int64) + for resourceName, azMap := range expected { + for az, val := range azMap { + if perAZ[az] == nil { + perAZ[az] = make(map[string]int64) + } + perAZ[az][resourceName] = val + } + } + + for az, expectedResources := range perAZ { + crdName := "quota-" + projectID + "-" + az + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) } - for az, expectedAmount := range expectedUsage.PerAZ { - if actual.PerAZ[az] != expectedAmount { + + for resourceName, expectedAmount := range expectedResources { + actual, ok := pq.Status.PaygUsage[resourceName] + if !ok { + env.t.Errorf("project %s AZ %s: expected PaygUsage resource %q not found", projectID, az, resourceName) + continue + } + if actual != expectedAmount { env.t.Errorf("project %s: PaygUsage[%s][%s] = %d, want %d", - projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + projectID, resourceName, az, actual, expectedAmount) + } + } + + for resourceName := range pq.Status.PaygUsage { + if _, ok := expectedResources[resourceName]; !ok { + env.t.Errorf("project %s AZ %s: unexpected PaygUsage resource %q", projectID, az, resourceName) } } } - for resourceName := range pq.Status.PaygUsage { - if _, ok := expected[resourceName]; !ok { - env.t.Errorf("project %s: unexpected PaygUsage resource %q", projectID, resourceName) + // Ensure no unexpected AZ CRDs carry PaygUsage for this project. 
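+	// Same guard as for TotalUsage: AZs we did not expect must carry no PaygUsage.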
+ var allPQ v1alpha1.ProjectQuotaList + if err := env.client.List(context.Background(), &allPQ); err != nil { + env.t.Fatalf("failed to list ProjectQuota objects: %v", err) + } + for _, pq := range allPQ.Items { + if pq.Spec.ProjectID != projectID { + continue + } + az := pq.Spec.AvailabilityZone + if _, ok := perAZ[az]; !ok && len(pq.Status.PaygUsage) > 0 { + env.t.Errorf("project %s AZ %s: unexpected PaygUsage in non-expected AZ CRD", projectID, az) } } } @@ -1091,8 +1240,8 @@ func (env *integrationTestEnv) executeAction(action TestAction) { env.t.Fatalf("failed to update CR %s status: %v", action.CRName, err) } - // Simulate watch trigger: call Reconcile for the affected project - pqName := "quota-" + cr.Spec.ProjectID + // Simulate watch trigger: call Reconcile for the affected per-AZ CRD + pqName := "quota-" + cr.Spec.ProjectID + "-" + cr.Spec.AvailabilityZone _, err := env.controller.Reconcile(ctx, reconcileRequest(pqName)) if err != nil { env.t.Fatalf("Reconcile failed after CR update: %v", err) @@ -1171,7 +1320,7 @@ func reconcileRequest(name string) ctrl.Request { return ctrl.Request{NamespacedName: client.ObjectKey{Name: name}} } -func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { //nolint:unparam +func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { //nolint:unused return &v1alpha1.ProjectQuota{ ObjectMeta: metav1.ObjectMeta{Name: "quota-" + projectID}, Spec: v1alpha1.ProjectQuotaSpec{ProjectID: projectID, DomainID: "domain-1"}, @@ -1181,6 +1330,21 @@ func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuo } } +// makePQPerAZ creates a per-AZ ProjectQuota CRD for integration tests. +func makePQPerAZ(projectID, az string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { //nolint:unparam + return &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-" + projectID + "-" + az}, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: projectID, + DomainID: "domain-1", + AvailabilityZone: az, + }, + Status: v1alpha1.ProjectQuotaStatus{ + LastReconcileAt: lastReconcileAt, + }, + } +} + func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.CommittedResourceType, state v1alpha1.CommitmentStatus, usedAmount *int64) *v1alpha1.CommittedResource { //nolint:unparam cr := &v1alpha1.CommittedResource{ ObjectMeta: metav1.ObjectMeta{Name: name}, @@ -1202,16 +1366,15 @@ func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.Commi } // usedResourcesFromMultiples converts a "multiples" value (the old UsedAmount unit) to UsedResources. -// For memory: multiples * smallestFlavorMB * 1024 * 1024 = bytes. +// For memory: multiples * 1 GiB = bytes. // For cores: the value is used directly. 
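+// Example: multiples=5 → 5 * 1024^3 = 5368709120 bytes, i.e. "5Gi".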
func usedResourcesFromMultiples(resourceType v1alpha1.CommittedResourceType, flavorGroup string, multiples int64) map[string]resource.Quantity { switch resourceType { case v1alpha1.CommittedResourceTypeMemory: - fg, ok := testFlavorGroups[flavorGroup] - if !ok || fg.SmallestFlavor.MemoryMB == 0 { + if _, ok := testFlavorGroups[flavorGroup]; !ok { return nil } - bytesVal := multiples * int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // test only + bytesVal := multiples * 1024 * 1024 * 1024 return map[string]resource.Quantity{ "memory": *resource.NewQuantity(bytesVal, resource.BinarySI), } diff --git a/pkg/multicluster/routers.go b/pkg/multicluster/routers.go index 16d74dc6f..fdbd251dc 100644 --- a/pkg/multicluster/routers.go +++ b/pkg/multicluster/routers.go @@ -16,10 +16,12 @@ import ( // for the multicluster client that cortex supports by default. This is used to // route resources to the correct cluster in a multicluster setup. var DefaultResourceRouters = map[schema.GroupVersionKind]ResourceRouter{ - {Group: "kvm.cloud.sap", Version: "v1", Kind: "Hypervisor"}: HypervisorResourceRouter{}, - {Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"}: ReservationsResourceRouter{}, - {Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"}: HistoryResourceRouter{}, - {Group: "cortex.cloud", Version: "v1alpha1", Kind: "CommittedResource"}: CommittedResourceRouter{}, + {Group: "kvm.cloud.sap", Version: "v1", Kind: "Hypervisor"}: HypervisorResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"}: ReservationsResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"}: HistoryResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "CommittedResource"}: CommittedResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "ProjectQuota"}: ProjectQuotaResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "FlavorGroupCapacity"}: FlavorGroupCapacityResourceRouter{}, } // ResourceRouter determines which remote cluster a resource should be written to @@ -111,6 +113,33 @@ func (c CommittedResourceRouter) Match(obj any, labels map[string]string) (bool, return cr.Spec.AvailabilityZone == availabilityZone, nil } +// FlavorGroupCapacityResourceRouter routes flavor group capacity CRDs to clusters based on availability zone. +type FlavorGroupCapacityResourceRouter struct{} + +func (f FlavorGroupCapacityResourceRouter) Match(obj any, labels map[string]string) (bool, error) { + var fgc v1alpha1.FlavorGroupCapacity + + switch v := obj.(type) { + case *v1alpha1.FlavorGroupCapacity: + if v == nil { + return false, errors.New("object is nil") + } + fgc = *v + case v1alpha1.FlavorGroupCapacity: + fgc = v + default: + return false, errors.New("object is not a FlavorGroupCapacity") + } + availabilityZone, ok := labels["availabilityZone"] + if !ok { + return false, errors.New("cluster does not have availabilityZone label") + } + if fgc.Spec.AvailabilityZone == "" { + return false, errors.New("flavor group capacity does not have availability zone in spec") + } + return fgc.Spec.AvailabilityZone == availabilityZone, nil +} + // HistoryResourceRouter routes histories to clusters based on availability zone. type HistoryResourceRouter struct{} @@ -137,3 +166,30 @@ func (h HistoryResourceRouter) Match(obj any, labels map[string]string) (bool, e } return *hist.Spec.AvailabilityZone == availabilityZone, nil } + +// ProjectQuotaResourceRouter routes project quotas to clusters based on availability zone. 
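+// The contract matches the other AZ routers above: match only when
+// spec.AvailabilityZone equals the cluster's "availabilityZone" label; a
+// missing label or an empty spec field is an error, never a silent skip.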
+type ProjectQuotaResourceRouter struct{} + +func (p ProjectQuotaResourceRouter) Match(obj any, labels map[string]string) (bool, error) { + var pq v1alpha1.ProjectQuota + + switch v := obj.(type) { + case *v1alpha1.ProjectQuota: + if v == nil { + return false, errors.New("object is nil") + } + pq = *v + case v1alpha1.ProjectQuota: + pq = v + default: + return false, errors.New("object is not a ProjectQuota") + } + availabilityZone, ok := labels["availabilityZone"] + if !ok { + return false, errors.New("cluster does not have availabilityZone label") + } + if pq.Spec.AvailabilityZone == "" { + return false, errors.New("project quota does not have availability zone in spec") + } + return pq.Spec.AvailabilityZone == availabilityZone, nil +} diff --git a/pkg/multicluster/routers_test.go b/pkg/multicluster/routers_test.go index c3e1a0c16..882bf1ba8 100644 --- a/pkg/multicluster/routers_test.go +++ b/pkg/multicluster/routers_test.go @@ -217,6 +217,101 @@ func TestHistoryResourceRouter_Match(t *testing.T) { } } +func TestFlavorGroupCapacityResourceRouter_Match(t *testing.T) { + router := FlavorGroupCapacityResourceRouter{} + + tests := []struct { + name string + obj any + labels map[string]string + wantMatch bool + wantErr bool + }{ + { + name: "matching AZ", + obj: v1alpha1.FlavorGroupCapacity{ + Spec: v1alpha1.FlavorGroupCapacitySpec{ + AvailabilityZone: "qa-de-1b", + }, + }, + labels: map[string]string{"availabilityZone": "qa-de-1b"}, + wantMatch: true, + }, + { + name: "matching AZ pointer", + obj: &v1alpha1.FlavorGroupCapacity{ + Spec: v1alpha1.FlavorGroupCapacitySpec{ + AvailabilityZone: "qa-de-1b", + }, + }, + labels: map[string]string{"availabilityZone": "qa-de-1b"}, + wantMatch: true, + }, + { + name: "non-matching AZ", + obj: v1alpha1.FlavorGroupCapacity{ + Spec: v1alpha1.FlavorGroupCapacitySpec{ + AvailabilityZone: "qa-de-1b", + }, + }, + labels: map[string]string{"availabilityZone": "qa-de-1a"}, + wantMatch: false, + }, + { + name: "not a FlavorGroupCapacity", + obj: "not-a-flavor-group-capacity", + labels: map[string]string{"availabilityZone": "qa-de-1b"}, + wantErr: true, + }, + { + name: "cluster missing availabilityZone label", + obj: v1alpha1.FlavorGroupCapacity{ + Spec: v1alpha1.FlavorGroupCapacitySpec{ + AvailabilityZone: "qa-de-1b", + }, + }, + labels: map[string]string{}, + wantErr: true, + }, + { + name: "FlavorGroupCapacity missing availability zone", + obj: v1alpha1.FlavorGroupCapacity{ + Spec: v1alpha1.FlavorGroupCapacitySpec{}, + }, + labels: map[string]string{"availabilityZone": "qa-de-1b"}, + wantErr: true, + }, + { + name: "typed nil pointer doesn't panic", + obj: (*v1alpha1.FlavorGroupCapacity)(nil), + labels: map[string]string{"availabilityZone": "qa-de-1b"}, + wantErr: true, + }, + { + name: "nil object doesn't panic", + obj: nil, + labels: map[string]string{"availabilityZone": "qa-de-1b"}, + wantErr: true, + wantMatch: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + match, err := router.Match(tt.obj, tt.labels) + if tt.wantErr && err == nil { + t.Fatal("expected error, got nil") + } + if !tt.wantErr && err != nil { + t.Fatalf("unexpected error: %v", err) + } + if match != tt.wantMatch { + t.Errorf("expected match=%v, got %v", tt.wantMatch, match) + } + }) + } +} + func TestReservationsResourceRouter_Match(t *testing.T) { router := ReservationsResourceRouter{} diff --git a/pkg/sso/sso.go b/pkg/sso/sso.go index c5535d915..5f8241bea 100644 --- a/pkg/sso/sso.go +++ b/pkg/sso/sso.go @@ -32,7 +32,7 @@ type requestLogger struct { // 
RoundTrip logs the request URL before making the request. func (lrt *requestLogger) RoundTrip(req *http.Request) (*http.Response, error) { - slog.Info("making http request", "url", req.URL.String()) //nolint:gosec // structured logging key-value, not string interpolation + slog.Info("making http request", "url", req.URL.String()) return lrt.T.RoundTrip(req) } diff --git a/postgres/Dockerfile b/postgres/Dockerfile index 552a067da..6a352e441 100644 --- a/postgres/Dockerfile +++ b/postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM debian:trixie-slim@sha256:cedb1ef40439206b673ee8b33a46a03a0c9fa90bf3732f54704f99cb061d2c5a +FROM debian:trixie-slim@sha256:109e2c65005bf160609e4ba6acf7783752f8502ad218e298253428690b9eaa4b # explicitly set user/group IDs RUN set -eux; \ diff --git a/tools/resdiff/main.go b/tools/resdiff/main.go index 1e992ba11..76d4d79a2 100644 --- a/tools/resdiff/main.go +++ b/tools/resdiff/main.go @@ -35,7 +35,7 @@ func main() { diffFlag := flag.String("diff", "", "Comma-separated list of resource names to compare (empty or omit = all)") noColorFlag := flag.Bool("no-color", false, "Disable colorized output") flag.Parse() - useColor = !*noColorFlag && term.IsTerminal(int(os.Stdout.Fd())) //nolint:gosec // file descriptors are small numbers, uintptr->int safe in practice + useColor = !*noColorFlag && term.IsTerminal(int(os.Stdout.Fd())) resources := readAndParseInput(os.Stdin) var names []string var selected []map[string]any diff --git a/tools/visualize-committed-resources/main.go b/tools/visualize-committed-resources/main.go index f722f8b8d..4cb99ce7c 100644 --- a/tools/visualize-committed-resources/main.go +++ b/tools/visualize-committed-resources/main.go @@ -9,7 +9,7 @@ // // Flags: // -// --context=ctx Kubernetes context (default: current context) +// --contexts=ctx1,ctx2 Kubernetes contexts to query (default: current context) // --filter-project=id Show only CRs for this project ID (substring match) // --filter-az=az Show only CRs in this availability zone (substring match) // --filter-group=name Show only CRs for this flavor group (substring match) @@ -37,7 +37,6 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/tools/clientcmd" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/config" ) var scheme = runtime.NewScheme() @@ -55,6 +54,7 @@ const ( colYellow = "\033[33m" colRed = "\033[31m" colCyan = "\033[36m" + colBlue = "\033[34m" colGray = "\033[90m" ) @@ -62,9 +62,21 @@ func green(s string) string { return colGreen + s + colReset } func yellow(s string) string { return colYellow + s + colReset } func red(s string) string { return colRed + s + colReset } func cyan(s string) string { return colCyan + s + colReset } +func blue(s string) string { return colBlue + s + colReset } func gray(s string) string { return colGray + s + colReset } func bold(s string) string { return colBold + s + colReset } +func resourceTypeBadge(rt v1alpha1.CommittedResourceType) string { + switch rt { + case v1alpha1.CommittedResourceTypeCores: + return yellow("[CPU]") + case v1alpha1.CommittedResourceTypeMemory: + return blue("[RAM]") + default: + return gray("[?]") + } +} + // ── Views ───────────────────────────────────────────────────────────────────── const ( @@ -106,17 +118,9 @@ func (vs viewSet) has(v string) bool { return vs[v] } // ── k8s client ──────────────────────────────────────────────────────────────── -func newClient(contextName string) (client.Client, error) { - if contextName == "" { - c, err := config.GetConfig() - if err 
!= nil { - return nil, fmt.Errorf("getting kubeconfig: %w", err) - } - return client.New(c, client.Options{Scheme: scheme}) - } - loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() +func getClientForContext(contextName string) (client.Client, error) { kubeConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( - loadingRules, + clientcmd.NewDefaultClientConfigLoadingRules(), &clientcmd.ConfigOverrides{CurrentContext: contextName}, ) c, err := kubeConfig.ClientConfig() @@ -126,6 +130,18 @@ func newClient(contextName string) (client.Client, error) { return client.New(c, client.Options{Scheme: scheme}) } +type contextClient struct { + name string + client client.Client +} + +func contextDisplayName(ctx string) string { + if ctx == "" { + return "(current)" + } + return ctx +} + // ── helpers ─────────────────────────────────────────────────────────────────── func printHeader(title string) { @@ -302,44 +318,50 @@ func printCommitments(crs []v1alpha1.CommittedResource, showUsage bool) { } for _, cr := range crs { - fmt.Printf("\n %s %s\n", + accepted := gray("—") + if cr.Status.AcceptedSpec != nil { + accepted = cr.Status.AcceptedSpec.Amount.String() + } + + endStr := gray("no expiry") + if cr.Spec.EndTime != nil { + remaining := time.Until(cr.Spec.EndTime.Time).Round(time.Minute) + if remaining < 0 { + endStr = red(fmt.Sprintf("expired %s ago", age(cr.Spec.EndTime))) + } else { + endStr = gray(fmt.Sprintf("exp in %s", remaining)) + } + } + + // Line 1: identity + type + ready + state + group + az + fmt.Printf("\n %s %s %s %s group=%s az=%s age=%s\n", bold(cyan(cr.Spec.CommitmentUUID)), + resourceTypeBadge(cr.Spec.ResourceType), crReadyStatus(cr), - ) - fmt.Printf(" project=%-36s group=%-20s az=%s\n", - cr.Spec.ProjectID, cr.Spec.FlavorGroupName, cr.Spec.AvailabilityZone) - fmt.Printf(" state=%-14s amount=%-10s accepted=%s\n", stateColour(cr.Spec.State), + cr.Spec.FlavorGroupName, + cr.Spec.AvailabilityZone, + age(&cr.CreationTimestamp), + ) + // Line 2: project + amount + expiry + fmt.Printf(" project=%s amount=%s accepted=%s %s\n", + cr.Spec.ProjectID, cr.Spec.Amount.String(), - func() string { - if cr.Status.AcceptedSpec == nil { - return gray("—") - } - return cr.Status.AcceptedSpec.Amount.String() - }(), + accepted, + endStr, ) + // Line 3 (optional): usage if mem, ok := cr.Status.UsedResources["memory"]; ok { cpu := cr.Status.UsedResources["cpu"] usageAgeStr := gray("—") if cr.Status.LastUsageReconcileAt != nil { usageAgeStr = age(cr.Status.LastUsageReconcileAt) } - fmt.Printf(" used=%-12s usedCPU=%-10s instances=%-4d usage-age=%s\n", + fmt.Printf(" used=%s cpu=%s instances=%d usage-age=%s\n", mem.String(), cpu.String(), len(cr.Status.AssignedInstances), usageAgeStr) } - endStr := gray("no expiry") - if cr.Spec.EndTime != nil { - remaining := time.Until(cr.Spec.EndTime.Time).Round(time.Minute) - if remaining < 0 { - endStr = red(fmt.Sprintf("expired %s ago", age(cr.Spec.EndTime))) - } else { - endStr = fmt.Sprintf("expires in %s (at %s)", remaining, cr.Spec.EndTime.Format(time.RFC3339)) - } - } - fmt.Printf(" age=%-8s %s\n", age(&cr.CreationTimestamp), endStr) - if showUsage && len(cr.Status.AssignedInstances) > 0 { fmt.Printf(" assigned instances (%d):\n", len(cr.Status.AssignedInstances)) for _, inst := range cr.Status.AssignedInstances { @@ -466,7 +488,7 @@ func printReservations(crs []v1alpha1.CommittedResource, reservations []v1alpha1 // ── main ────────────────────────────────────────────────────────────────────── func main() { - k8sContext := 
flag.String("context", "", "Kubernetes context (default: current context)") + contextsFlag := flag.String("contexts", "", "Comma-separated Kubernetes contexts to query (default: current context)") filterProject := flag.String("filter-project", "", "Show only CRs for this project ID (substring match)") filterAZ := flag.String("filter-az", "", "Show only CRs in this availability zone (substring match)") filterGroup := flag.String("filter-group", "", "Show only CRs for this flavor group (substring match)") @@ -489,17 +511,28 @@ func main() { active: *activeOnly, } - cl, err := newClient(*k8sContext) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %v\n", err) - os.Exit(1) + contextNames := []string{""} + if *contextsFlag != "" { + contextNames = strings.FieldsFunc(*contextsFlag, func(r rune) bool { return r == ',' }) + for i := range contextNames { + contextNames[i] = strings.TrimSpace(contextNames[i]) + } + } + var clients []contextClient + for _, name := range contextNames { + cl, err := getClientForContext(name) + if err != nil { + fmt.Fprintf(os.Stderr, "error creating client for context %q: %v\n", contextDisplayName(name), err) + os.Exit(1) + } + clients = append(clients, contextClient{name: name, client: cl}) } ctx := context.Background() var prevDigest string first := true for { - crs, reservations := fetchSnapshot(ctx, cl, f, *limitFlag) + crs, reservations := fetchSnapshot(ctx, clients, f, *limitFlag) if d := snapshotDigest(crs, reservations); first || d != prevDigest { if !first { fmt.Printf("\n%s %s %s\n", @@ -531,59 +564,75 @@ func snapshotDigest(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Re return b.String() } -func fetchSnapshot(ctx context.Context, cl client.Client, f filters, limit int) ([]v1alpha1.CommittedResource, []v1alpha1.Reservation) { - var listOpts []client.ListOption - if limit > 0 { - listOpts = append(listOpts, client.Limit(int64(limit))) - } +func fetchSnapshot(ctx context.Context, clients []contextClient, f filters, limit int) ([]v1alpha1.CommittedResource, []v1alpha1.Reservation) { + multiContext := len(clients) > 1 - var crList v1alpha1.CommittedResourceList - if err := cl.List(ctx, &crList, listOpts...); err != nil { - fmt.Fprintf(os.Stderr, "error listing CommittedResources: %v\n", err) - os.Exit(1) - } + var allCRs []v1alpha1.CommittedResource + var allReservations []v1alpha1.Reservation - var resList v1alpha1.ReservationList - if err := cl.List(ctx, &resList, append(listOpts, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - })...); err != nil { - fmt.Fprintf(os.Stderr, "error listing Reservations: %v\n", err) - os.Exit(1) - } + for _, cc := range clients { + var listOpts []client.ListOption + if limit > 0 { + listOpts = append(listOpts, client.Limit(int64(limit))) + } - if crList.Continue != "" { - fmt.Fprintf(os.Stderr, yellow("warning: CR list truncated at %d — use --limit=0 or a higher value to see all\n"), limit) - } - if resList.Continue != "" { - fmt.Fprintf(os.Stderr, yellow("warning: Reservation list truncated at %d — use --limit=0 or a higher value to see all\n"), limit) - } - var crs []v1alpha1.CommittedResource - for _, cr := range crList.Items { - if f.match(cr) { - crs = append(crs, cr) + var crList v1alpha1.CommittedResourceList + if err := cc.client.List(ctx, &crList, listOpts...); err != nil { + fmt.Fprintf(os.Stderr, "warning: error listing CommittedResources in context %q: %v\n", contextDisplayName(cc.name), err) + continue + } + var resList v1alpha1.ReservationList 
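+		// Reservations are listed per context as well, restricted to
+		// committed-resource reservations via the reservation-type label.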
+ if err := cc.client.List(ctx, &resList, append(listOpts, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + })...); err != nil { + fmt.Fprintf(os.Stderr, "warning: error listing Reservations in context %q: %v\n", contextDisplayName(cc.name), err) + continue + } + + if crList.Continue != "" { + fmt.Fprintf(os.Stderr, yellow("warning: CR list truncated at %d in context %q — use --limit=0 or a higher value\n"), limit, contextDisplayName(cc.name)) + } + if resList.Continue != "" { + fmt.Fprintf(os.Stderr, yellow("warning: Reservation list truncated at %d in context %q — use --limit=0 or a higher value\n"), limit, contextDisplayName(cc.name)) + } + + for _, cr := range crList.Items { + if f.match(cr) { + if multiContext { + cr.Name = cr.Name + "@" + contextDisplayName(cc.name) + } + allCRs = append(allCRs, cr) + } + } + for _, res := range resList.Items { + if res.Spec.CommittedResourceReservation == nil { + continue + } + if multiContext { + res.Name = res.Name + "@" + contextDisplayName(cc.name) + } + allReservations = append(allReservations, res) } } - sort.Slice(crs, func(i, j int) bool { - if crs[i].Spec.FlavorGroupName != crs[j].Spec.FlavorGroupName { - return crs[i].Spec.FlavorGroupName < crs[j].Spec.FlavorGroupName + + sort.Slice(allCRs, func(i, j int) bool { + if allCRs[i].Spec.FlavorGroupName != allCRs[j].Spec.FlavorGroupName { + return allCRs[i].Spec.FlavorGroupName < allCRs[j].Spec.FlavorGroupName } - return crs[i].Spec.CommitmentUUID < crs[j].Spec.CommitmentUUID + return allCRs[i].Spec.CommitmentUUID < allCRs[j].Spec.CommitmentUUID }) - matchedUUIDs := make(map[string]bool, len(crs)) - for _, cr := range crs { + matchedUUIDs := make(map[string]bool, len(allCRs)) + for _, cr := range allCRs { matchedUUIDs[cr.Spec.CommitmentUUID] = true } var reservations []v1alpha1.Reservation - for _, res := range resList.Items { - if res.Spec.CommittedResourceReservation == nil { - continue - } + for _, res := range allReservations { if matchedUUIDs[res.Spec.CommittedResourceReservation.CommitmentUUID] { reservations = append(reservations, res) } } - return crs, reservations + return allCRs, reservations } func printSnapshot(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation, f filters, views viewSet) {