diff --git a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml index 5bb9c965c..6684e3392 100644 --- a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml +++ b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml @@ -8,11 +8,11 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Scheduling for Cinder is down" description: > @@ -27,11 +27,11 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Knowledge for Cinder is down" description: > @@ -44,7 +44,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -61,7 +61,7 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -78,7 +78,7 @@ groups: for: 5m labels: context: memory - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -94,7 +94,7 @@ groups: for: 5m labels: context: cpu - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -110,7 +110,7 @@ groups: for: 5m labels: context: db - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -125,7 +125,7 @@ groups: for: 5m labels: context: syncstatus - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -144,7 +144,7 @@ groups: for: 60m labels: context: syncobjects - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -163,7 +163,7 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -179,7 +179,7 @@ groups: for: 60m labels: context: knowledge - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -195,7 +195,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -212,7 +212,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -232,7 +232,7 @@ groups: for: 60m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -248,7 +248,7 @@ groups: for: 5m labels: context: pipelines - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml index eb92d2a95..2211d44fe 100644 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml @@ -8,11 +8,11 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Scheduling for Manila is down" description: > @@ -27,11 +27,11 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Knowledge for Manila is down" description: > @@ -44,10 +44,11 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors annotations: summary: "Manila Scheduler HTTP request 400 errors too high" description: > @@ -61,10 +62,11 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors annotations: summary: "Manila Scheduler HTTP request 500 errors too high" description: > @@ -78,10 +80,11 @@ groups: for: 5m labels: context: memory - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much memory" description: > @@ -94,10 +97,11 @@ groups: for: 5m labels: context: cpu - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much CPU" description: > @@ -110,10 +114,11 @@ groups: for: 5m labels: context: db - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database annotations: summary: "`{{$labels.component}}` is trying to connect to the database too often" description: > @@ -125,10 +130,11 @@ groups: for: 5m labels: context: syncstatus - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` Sync not successful" description: > @@ -144,10 +150,11 @@ groups: for: 60m labels: context: syncobjects - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" description: > @@ -163,10 +170,11 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" description: > @@ -179,10 +187,11 @@ groups: for: 60m labels: context: knowledge - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" description: > @@ -190,52 +199,17 @@ groups: configuration. It is recommended to investigate the knowledge status and logs for more details. - - alert: CortexManilaDecisionsWithErrors - expr: cortex_decision_state{domain="manila",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexManilaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="manila",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - alert: CortexManilaKPIUnready expr: | cortex_kpi_state{domain="manila",state!="ready"} != 0 for: 60m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" description: > @@ -248,10 +222,11 @@ groups: for: 5m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" description: > diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 41bf29794..fd7f9df99 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -8,11 +8,11 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: critical support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Scheduling for Nova is down" description: > @@ -28,11 +28,11 @@ groups: for: 5m labels: context: liveness - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management - playbook: docs/support/playbook/cortex/down + playbook: docs/support/playbook/cortex/alerts/down annotations: summary: "Cortex Knowledge for Nova is down" description: > @@ -45,7 +45,7 @@ groups: for: 5m labels: context: descheduler - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -61,10 +61,11 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors annotations: summary: "Nova Scheduler HTTP request 400 errors too high" description: > @@ -78,10 +79,11 @@ groups: for: 5m labels: context: api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors annotations: summary: "Nova Scheduler HTTP request 500 errors too high" description: > @@ -95,10 +97,11 @@ groups: for: 5m labels: context: memory - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much memory" description: > @@ -111,10 +114,11 @@ groups: for: 5m labels: context: cpu - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment annotations: summary: "`{{$labels.component}}` uses too much CPU" description: > @@ -127,10 +131,11 @@ groups: for: 5m labels: context: db - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database annotations: summary: "`{{$labels.component}}` is trying to connect to the database too often" description: > @@ -142,10 +147,11 @@ groups: for: 5m labels: context: syncstatus - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` Sync not successful" description: > @@ -161,10 +167,11 @@ groups: for: 60m labels: context: syncobjects - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" description: > @@ -180,10 +187,11 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" description: > @@ -196,10 +204,11 @@ groups: for: 60m labels: context: knowledge - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" description: > @@ -212,7 +221,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -229,7 +238,7 @@ groups: for: 5m labels: context: decisions - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -249,10 +258,11 @@ groups: for: 60m labels: context: kpis - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" description: > @@ -265,10 +275,11 @@ groups: for: 5m labels: context: pipelines - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready annotations: summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" description: > @@ -282,7 +293,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -300,7 +311,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -318,7 +329,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -336,7 +347,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -355,7 +366,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -373,7 +384,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -393,7 +404,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -409,7 +420,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -425,7 +436,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -442,7 +453,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -457,7 +468,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -473,7 +484,7 @@ groups: for: 5m labels: context: committed-resource-api - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -490,7 +501,7 @@ groups: for: 5m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -510,7 +521,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -533,7 +544,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -558,7 +569,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -580,7 +591,7 @@ groups: for: 15m labels: context: committed-resource-syncer - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -598,10 +609,11 @@ groups: for: 5m labels: context: scheduling - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/scheduling annotations: summary: "Nova scheduling cannot find valid KVM hosts" description: > @@ -615,10 +627,11 @@ groups: for: 60m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "New datasource `{{$labels.datasource}}` has not reconciled" description: > @@ -633,10 +646,11 @@ groups: for: 10m labels: context: datasources - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" description: > @@ -652,10 +666,11 @@ groups: for: 15m labels: context: controller-errors - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles annotations: summary: "Controller reconcile error rate >10%" description: > @@ -671,10 +686,11 @@ groups: for: 15m labels: context: controller-duration - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles annotations: summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" @@ -685,10 +701,11 @@ groups: for: 60m labels: context: controller-workqueue - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources annotations: summary: "Controller {{ $labels.name }}'s backlog is not being drained." description: > @@ -703,7 +720,7 @@ groups: for: 15m labels: context: controller-webhook - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management @@ -721,7 +738,7 @@ groups: for: 15m labels: context: controller-webhook - dashboard: cortex/cortex + dashboard: cortex-status-dashboard/cortex-status-dashboard service: cortex severity: warning support_group: workload-management