Rules

alerts_beta.conf

31.831s ago

18.16ms

Rule State Error Last Evaluation Evaluation Time
alert: InstanceDiskFullCrit expr: (node_filesystem_avail_bytes{fstype!~"(rpc_pipefs|rootfs)"} / node_filesystem_size_bytes) * 100 < 5 for: 15m labels: severity: crit annotations: summary: '{{$labels.instance}} FS {{$labels.mountpoint}} at {{ printf "%3.2f" $value }}% avail' ok 31.832s ago 17.47ms
alert: InstanceIOWaitCrit expr: max by(instance) (irate(node_cpu{mode="iowait"}[5m])) * 100 > 80 for: 5m labels: severity: crit annotations: summary: '{{$labels.instance}} sustained iowait at {{ printf "%3.2f" $value }}%' ok 31.815s ago 669.8us

beta_puppet_alerts

4.307s ago

12.82ms

Rule State Error Last Evaluation Evaluation Time
alert: PuppetHugeFail expr: (sum(puppet_agent_failed) / count(puppet_agent_failed)) * 100 > 7 for: 2m labels: severity: crit annotations: summary: 'Puppet widespread failure: {{ printf "%3.2f" $value }}%' ok 4.308s ago 5.005ms
alert: meta_PuppetHugeFail expr: absent((sum(puppet_agent_failed) / count(puppet_agent_failed)) * 100) for: 10m labels: severity: crit annotations: summary: Data not found for PuppetHugeFail ok 4.303s ago 3.117ms
alert: PuppetFail expr: puppet_agent_failed == 1 for: 2m labels: severity: warn annotations: logs: https://logstash-beta.wmflabs.org/app/kibana#/dashboard/66ed4030-782f-11e7-b59f-bdb74a2a8a82?_g=(time:(from:now-1h,mode:quick,to:now))&_a=(filters:!((query:(match:(host:(query:{{ reReplaceAll ":\\d+" "" $labels.instance }},type:phrase))),meta:(alias:!n,disabled:!f,index:'logstash-*',key:host,negate:!f,value:{{ reReplaceAll ":\\d+" "" $labels.instance }})),(meta:(alias:!n,disabled:!f,index:'logstash-*',key:program,negate:!f,value:puppet-agent),query:(match:(program:(query:puppet-agent,type:phrase)))),(meta:(alias:!n,disabled:!f,index:'logstash-*',key:level,negate:!f,value:ERROR),query:(match:(level:(query:ERROR,type:phrase)))))) open_task: https://phabricator.wikimedia.org/maniphest/task/create/?title=Puppet failure on {{$labels.instance}}&projects=Beta-Cluster-Infrastructure&description=Puppet has failed on {{$labels.instance}} summary: Puppet failed on {{$labels.instance}} ok 4.3s ago 1.537ms
alert: PuppetStale expr: time() - puppet_agent_last_run > (60 * 35) for: 5m labels: severity: warn annotations: logs: https://logstash-beta.wmflabs.org/app/kibana#/dashboard/puppet summary: Puppet stale on {{$labels.instance}} for {{$value | humanizeDuration}} ok 4.299s ago 3.13ms

alerts_default.conf

38.655s ago

408.7us

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusReloadFailed expr: prometheus_config_last_reload_successful == 0 for: 1h labels: severity: warn annotations: summary: Prometheus {{$labels.instance}} config reload fail ok 38.656s ago 392.8us