---
# Prometheus alerting rules for host- and container-level health.
#
# A single rule group ("system") holding three alerts. Each alert fires once
# its `expr` has returned at least one series continuously for the `for`
# duration; `labels` are attached to the resulting alert and `annotations`
# carry the human-readable text.
groups:
  - name: system
    rules:
      # Fires when the average non-idle CPU share of an instance, measured
      # over a 2-minute rate window, stays above 90% for 2 minutes.
      - alert: HostHighCPU
        # Previous expression, kept for reference. It compares each individual
        # non-idle series against 0.9 instead of aggregating them, so it does
        # not express overall utilisation of the instance.
        # expr: rate(node_cpu_seconds_total{mode!="idle"}[2m]) > 0.9
        #
        # Current expression: 100 * (1 - mean idle fraction per instance).
        expr: 100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on host"

      # Fires when the restart counter grows by more than 3 within a
      # 10-minute window and that stays true for 1 minute.
      - alert: ContainerRestarting
        # Previous expression, kept for reference. increase() is intended for
        # counters; container_start_time_seconds looks like a start timestamp
        # rather than a counter -- presumably why it was replaced.
        # expr: increase(container_start_time_seconds[10m]) > 3
        #
        # NOTE(review): confirm which exporter provides
        # `container_restart_count` in this environment; if the metric is
        # absent the expression returns nothing and the alert never fires.
        expr: increase(container_restart_count[10m]) > 3
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting frequently"

      # Always-on alert: vector(1) returns a result on every evaluation, so
      # this alert becomes active after 10s and stays active.
      # NOTE(review): it is labelled `critical`; confirm the Alertmanager
      # routing handles it as intended so it does not page continuously.
      - alert: AlwaysFiring
        expr: vector(1)
        for: 10s
        labels:
          severity: critical
        annotations:
          summary: "This alert should always fire"