From 3c3a3f6d0506f659a99a551b9902aaccdb1c89a9 Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Mon, 13 Apr 2026 13:20:18 +0200 Subject: [PATCH 1/3] Update links in cortex alerts --- .../cortex-cinder/alerts/cinder.alerts.yaml | 4 ++-- .../cortex-manila/alerts/manila.alerts.yaml | 4 ++-- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml index 5bb9c965c..90b096e52 100644 --- a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml +++ b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml @@ -12,7 +12,7 @@ groups: service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Scheduling for Cinder is down" description: > @@ -31,7 +31,7 @@ groups: service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Knowledge for Cinder is down" description: > diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml index eb92d2a95..ab21af9a9 100644 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml @@ -12,7 +12,7 @@ groups: service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Scheduling for Manila is down" description: > @@ -31,7 +31,7 @@ groups: service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Knowledge for Manila is down" description: > diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 41bf29794..7490ce2be 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -12,7 +12,7 @@ groups: service: cortex severity: critical support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Scheduling for Nova is down" description: > @@ -32,7 +32,7 @@ groups: service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Knowledge for Nova is down" description: > @@ -65,6 +65,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors annotations: summary: "Nova Scheduler HTTP request 400 errors too high" description: > @@ -82,6 +83,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors annotations: summary: "Nova Scheduler HTTP request 500 errors too high" description: > @@ -184,6 +186,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" description: > @@ -200,6 +203,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" description: > @@ -253,6 +257,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" description: > @@ -269,6 +274,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" description: > @@ -602,6 +608,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/scheduling annotations: summary: "Nova scheduling cannot find valid KVM hosts" description: > @@ -656,6 +663,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles annotations: summary: "Controller reconcile error rate >10%" description: > @@ -675,6 +683,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles annotations: summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" @@ -689,6 +698,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles annotations: summary: "Controller {{ $labels.name }}'s backlog is not being drained." description: > From 0d5eb73ee80b741142b3e112b5695f522960ce8f Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Mon, 13 Apr 2026 13:23:08 +0200 Subject: [PATCH 2/3] Update dashboard links to cortex status dashboard --- .../cortex-cinder/alerts/cinder.alerts.yaml | 30 +++---- .../cortex-manila/alerts/manila.alerts.yaml | 30 +++---- .../cortex-nova/alerts/nova.alerts.yaml | 82 +++++++++---------- 3 files changed, 71 insertions(+), 71 deletions(-) diff --git a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml index 90b096e52..6684e3392 100644 --- a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml +++ b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml @@ -8,7 +8,7 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -27,7 +27,7 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -44,7 +44,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -61,7 +61,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -78,7 +78,7 @@ groups: for: 5m labels: context: memory - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -94,7 +94,7 @@ groups: for: 5m labels: context: cpu - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -110,7 +110,7 @@ groups: for: 5m labels: context: db - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -125,7 +125,7 @@ groups: for: 5m labels: context: syncstatus - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -144,7 +144,7 @@ groups: for: 60m labels: context: syncobjects - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -163,7 +163,7 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -179,7 +179,7 @@ groups: for: 60m labels: context: knowledge - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -195,7 +195,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -212,7 +212,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -232,7 +232,7 @@ groups: for: 60m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -248,7 +248,7 @@ groups: for: 5m labels: context: pipelines - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml index ab21af9a9..3cd92cf3b 100644 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml @@ -8,7 +8,7 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -27,7 +27,7 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -44,7 +44,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -61,7 +61,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -78,7 +78,7 @@ groups: for: 5m labels: context: memory - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -94,7 +94,7 @@ groups: for: 5m labels: context: cpu - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -110,7 +110,7 @@ groups: for: 5m labels: context: db - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -125,7 +125,7 @@ groups: for: 5m labels: context: syncstatus - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -144,7 +144,7 @@ groups: for: 60m labels: context: syncobjects - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -163,7 +163,7 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -179,7 +179,7 @@ groups: for: 60m labels: context: knowledge - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -195,7 +195,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -212,7 +212,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -232,7 +232,7 @@ groups: for: 60m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -248,7 +248,7 @@ groups: for: 5m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 7490ce2be..4bd5e1e31 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -8,7 +8,7 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: critical support_group: workload-management @@ -28,7 +28,7 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -45,7 +45,7 @@ groups: for: 5m labels: context: descheduler - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -61,7 +61,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -79,7 +79,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -97,7 +97,7 @@ groups: for: 5m labels: context: memory - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -113,7 +113,7 @@ groups: for: 5m labels: context: cpu - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -129,7 +129,7 @@ groups: for: 5m labels: context: db - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -144,7 +144,7 @@ groups: for: 5m labels: context: syncstatus - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -163,7 +163,7 @@ groups: for: 60m labels: context: syncobjects - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -182,7 +182,7 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -199,7 +199,7 @@ groups: for: 60m labels: context: knowledge - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -216,7 +216,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -233,7 +233,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -253,7 +253,7 @@ groups: for: 60m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -270,7 +270,7 @@ groups: for: 5m labels: context: pipelines - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -288,7 +288,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -306,7 +306,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -324,7 +324,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -342,7 +342,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -361,7 +361,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -379,7 +379,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -399,7 +399,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -415,7 +415,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -431,7 +431,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -448,7 +448,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -463,7 +463,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -479,7 +479,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -496,7 +496,7 @@ groups: for: 5m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -516,7 +516,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -539,7 +539,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -564,7 +564,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -586,7 +586,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -604,7 +604,7 @@ groups: for: 5m labels: context: scheduling - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -622,7 +622,7 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -640,7 +640,7 @@ groups: for: 10m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -659,7 +659,7 @@ groups: for: 15m labels: context: controller-errors - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -679,7 +679,7 @@ groups: for: 15m labels: context: controller-duration - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -694,7 +694,7 @@ groups: for: 60m labels: context: controller-workqueue - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -713,7 +713,7 @@ groups: for: 15m labels: context: controller-webhook - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -731,7 +731,7 @@ groups: for: 15m labels: context: controller-webhook - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management From 01ac66ce34563fb5be7e680126ce34f222211c31 Mon Sep 17 00:00:00 2001 From: Markus Wieland Date: Wed, 15 Apr 2026 14:24:42 +0200 Subject: [PATCH 3/3] Add playbook links to Cortex Manila and Nova alert rules --- .../cortex-manila/alerts/manila.alerts.yaml | 47 +++++-------------- .../cortex-nova/alerts/nova.alerts.yaml | 9 +++- 2 files changed, 19 insertions(+), 37 deletions(-) diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml index 3cd92cf3b..2211d44fe 100644 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml @@ -48,6 +48,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors annotations: summary: "Manila Scheduler HTTP request 400 errors too high" description: > @@ -65,6 +66,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors annotations: summary: "Manila Scheduler HTTP request 500 errors too high" description: > @@ -82,6 +84,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much memory" description: > @@ -98,6 +101,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much CPU" description: > @@ -114,6 +118,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database annotations: summary: "`{{$labels.component}}` is trying to connect to the database too often" description: > @@ -129,6 +134,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` Sync not successful" description: > @@ -148,6 +154,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" description: > @@ -167,6 +174,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" description: > @@ -183,6 +191,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" description: > @@ -190,42 +199,6 @@ groups: configuration. It is recommended to investigate the knowledge status and logs for more details. - - alert: CortexManilaDecisionsWithErrors - expr: cortex_decision_state{domain="manila",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexManilaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="manila",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - alert: CortexManilaKPIUnready expr: | cortex_kpi_state{domain="manila",state!="ready"} != 0 @@ -236,6 +209,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" description: > @@ -252,6 +226,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" description: > diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 4bd5e1e31..fd7f9df99 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -101,6 +101,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much memory" description: > @@ -117,6 +118,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much CPU" description: > @@ -133,6 +135,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database annotations: summary: "`{{$labels.component}}` is trying to connect to the database too often" description: > @@ -148,6 +151,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` Sync not successful" description: > @@ -167,6 +171,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" description: > @@ -626,6 +631,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "New datasource `{{$labels.datasource}}` has not reconciled" description: > @@ -644,6 +650,7 @@ groups: service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" description: > @@ -698,7 +705,7 @@ groups: service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "Controller {{ $labels.name }}'s backlog is not being drained." description: >