Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ groups:
for: 5m
labels:
context: liveness
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/down
playbook: docs/support/playbook/cortex/alerts/down
annotations:
summary: "Cortex Scheduling for Cinder is down"
description: >
Expand All @@ -27,11 +27,11 @@ groups:
for: 5m
labels:
context: liveness
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/down
playbook: docs/support/playbook/cortex/alerts/down
annotations:
summary: "Cortex Knowledge for Cinder is down"
description: >
Expand All @@ -44,7 +44,7 @@ groups:
for: 5m
labels:
context: api
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -61,7 +61,7 @@ groups:
for: 5m
labels:
context: api
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -78,7 +78,7 @@ groups:
for: 5m
labels:
context: memory
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -94,7 +94,7 @@ groups:
for: 5m
labels:
context: cpu
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -110,7 +110,7 @@ groups:
for: 5m
labels:
context: db
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -125,7 +125,7 @@ groups:
for: 5m
labels:
context: syncstatus
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -144,7 +144,7 @@ groups:
for: 60m
labels:
context: syncobjects
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -163,7 +163,7 @@ groups:
for: 60m
labels:
context: datasources
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -179,7 +179,7 @@ groups:
for: 60m
labels:
context: knowledge
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -195,7 +195,7 @@ groups:
for: 5m
labels:
context: decisions
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -212,7 +212,7 @@ groups:
for: 5m
labels:
context: decisions
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -232,7 +232,7 @@ groups:
for: 60m
labels:
context: kpis
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand All @@ -248,7 +248,7 @@ groups:
for: 5m
labels:
context: pipelines
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
Expand Down
77 changes: 26 additions & 51 deletions helm/bundles/cortex-manila/alerts/manila.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ groups:
for: 5m
labels:
context: liveness
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/down
playbook: docs/support/playbook/cortex/alerts/down
annotations:
summary: "Cortex Scheduling for Manila is down"
description: >
Expand All @@ -27,11 +27,11 @@ groups:
for: 5m
labels:
context: liveness
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/down
playbook: docs/support/playbook/cortex/alerts/down
annotations:
summary: "Cortex Knowledge for Manila is down"
description: >
Expand All @@ -44,10 +44,11 @@ groups:
for: 5m
labels:
context: api
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/apierrors
annotations:
summary: "Manila Scheduler HTTP request 400 errors too high"
description: >
Expand All @@ -61,10 +62,11 @@ groups:
for: 5m
labels:
context: api
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/apierrors
annotations:
summary: "Manila Scheduler HTTP request 500 errors too high"
description: >
Expand All @@ -78,10 +80,11 @@ groups:
for: 5m
labels:
context: memory
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/deployment
annotations:
summary: "`{{$labels.component}}` uses too much memory"
description: >
Expand All @@ -94,10 +97,11 @@ groups:
for: 5m
labels:
context: cpu
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/deployment
annotations:
summary: "`{{$labels.component}}` uses too much CPU"
description: >
Expand All @@ -110,10 +114,11 @@ groups:
for: 5m
labels:
context: db
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/database
annotations:
summary: "`{{$labels.component}}` is trying to connect to the database too often"
description: >
Expand All @@ -125,10 +130,11 @@ groups:
for: 5m
labels:
context: syncstatus
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/datasources
annotations:
summary: "`{{$labels.component}}` Sync not successful"
description: >
Expand All @@ -144,10 +150,11 @@ groups:
for: 60m
labels:
context: syncobjects
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/datasources
annotations:
summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`"
description: >
Expand All @@ -163,10 +170,11 @@ groups:
for: 60m
labels:
context: datasources
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/unready
annotations:
summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state"
description: >
Expand All @@ -179,63 +187,29 @@ groups:
for: 60m
labels:
context: knowledge
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/unready
annotations:
summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
description: >
This may indicate issues with the knowledge
configuration. It is recommended to investigate the
knowledge status and logs for more details.

- alert: CortexManilaDecisionsWithErrors
expr: cortex_decision_state{domain="manila",state="error"} > 0
for: 5m
labels:
context: decisions
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Some decisions are in error state for operator `{{$labels.operator}}`"
description: >
The cortex scheduling pipeline generated decisions that are in error state.
This may indicate issues with the decision logic or the underlying infrastructure.
It is recommended to investigate the decision logs and the state of the
VMs being processed.

- alert: CortexManilaTooManyDecisionsWaiting
expr: cortex_decision_state{domain="manila",state="waiting"} > 10
for: 5m
labels:
context: decisions
dashboard: cortex/cortex
service: cortex
severity: warning
support_group: workload-management
annotations:
summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`"
description: >
The cortex scheduling pipeline has a high number of decisions for which
no target host has been assigned yet.

This may indicate a backlog in processing or issues with the decision logic.
It is recommended to investigate the decision logs and the state of the
VMs being processed.

- alert: CortexManilaKPIUnready
expr: |
cortex_kpi_state{domain="manila",state!="ready"} != 0
for: 60m
labels:
context: kpis
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/unready
annotations:
summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state"
description: >
Expand All @@ -248,10 +222,11 @@ groups:
for: 5m
labels:
context: kpis
dashboard: cortex/cortex
dashboard: cortex-status-dashboard/cortex-status-dashboard
service: cortex
severity: warning
support_group: workload-management
playbook: docs/support/playbook/cortex/alerts/unready
annotations:
summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state"
description: >
Expand Down
Loading
Loading