Alerts

InstanceDiskFullCrit (0 active)
# Critical alert: the ratio of node_filesystem_avail_bytes to
# node_filesystem_size_bytes has stayed below 5% for 15 minutes.
# Filesystems whose fstype is rpc_pipefs or rootfs are excluded.
alert: InstanceDiskFullCrit
expr: >-
  (node_filesystem_avail_bytes{fstype!~"(rpc_pipefs|rootfs)"}
  / node_filesystem_size_bytes) * 100 < 5
for: 15m
labels:
  severity: crit
annotations:
  # $value is the available percentage computed by expr, shown with two
  # decimal places.
  summary: >-
    {{$labels.instance}} FS {{$labels.mountpoint}} at
    {{ printf "%3.2f" $value }}% avail
InstanceIOWaitCrit (0 active)
# Critical alert: the highest iowait rate among an instance's CPU series has
# stayed above 80% for 5 minutes.
#
# The counter is queried as node_cpu_seconds_total, the node_exporter >= 0.16
# name of the former node_cpu metric. The filesystem alert in this file
# already relies on the >= 0.16 "*_bytes" names, and on such exporters a
# selector on the old node_cpu name matches nothing, so the alert could
# never fire.
#
# rate() is used instead of irate(): irate() only looks at the last two
# samples in the window, so a single quiet scrape would reset the 5m "for"
# timer even though iowait is high on average.
alert: InstanceIOWaitCrit
expr: >-
  max by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 80
for: 5m
labels:
  severity: crit
annotations:
  # $value is the iowait percentage computed by expr, shown with two
  # decimal places.
  summary: >-
    {{$labels.instance}} sustained iowait at {{ printf "%3.2f" $value }}%
PrometheusReloadFailed (0 active)
# Warning alert: prometheus_config_last_reload_successful has been 0 on an
# instance for one hour.
alert: PrometheusReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 1h
labels:
  severity: warn
annotations:
  summary: 'Prometheus {{$labels.instance}} config reload fail'
PuppetFail (0 active)
# Warning alert: puppet_agent_failed has been 1 on an instance for 2 minutes.
alert: PuppetFail
expr: puppet_agent_failed == 1
for: 2m
labels:
  severity: warn
annotations:
  # Kibana dashboard link filtered to this host's puppet-agent ERROR logs.
  # reReplaceAll strips the ":<port>" suffix from the instance label so the
  # filter matches on the bare host name. The line breaks below fall on
  # spaces inside the {{ ... }} template actions, so folding them into
  # single spaces does not alter the rendered URL.
  logs: https://logstash-beta.wmflabs.org/app/kibana#/dashboard/66ed4030-782f-11e7-b59f-bdb74a2a8a82?_g=(time:(from:now-1h,mode:quick,to:now))&_a=(filters:!((query:(match:(host:(query:{{
    reReplaceAll ":\\d+" "" $labels.instance }},type:phrase))),meta:(alias:!n,disabled:!f,index:'logstash-*',key:host,negate:!f,value:{{
    reReplaceAll ":\\d+" "" $labels.instance }})),(meta:(alias:!n,disabled:!f,index:'logstash-*',key:program,negate:!f,value:puppet-agent),query:(match:(program:(query:puppet-agent,type:phrase)))),(meta:(alias:!n,disabled:!f,index:'logstash-*',key:level,negate:!f,value:ERROR),query:(match:(level:(query:ERROR,type:phrase))))))
  # Pre-filled Phabricator task form. Spaces in the title and description
  # are percent-encoded (%20) and the URL is kept on one line: a raw space
  # or a folded line break makes the value an invalid URL, and receivers
  # that auto-link text cut the link at the first space.
  open_task: 'https://phabricator.wikimedia.org/maniphest/task/create/?title=Puppet%20failure%20on%20{{$labels.instance}}&projects=Beta-Cluster-Infrastructure&description=Puppet%20has%20failed%20on%20{{$labels.instance}}'
  summary: 'Puppet failed on {{$labels.instance}}'
PuppetHugeFail (0 active)
# Critical alert: the sum of puppet_agent_failed divided by the number of
# puppet_agent_failed series, as a percentage, has been above 7 for
# 2 minutes.
alert: PuppetHugeFail
expr: >-
  (sum(puppet_agent_failed) / count(puppet_agent_failed)) * 100 > 7
for: 2m
labels:
  severity: crit
annotations:
  # $value is the percentage computed by expr, shown with two decimal places.
  summary: >-
    Puppet widespread failure: {{ printf "%3.2f" $value }}%
PuppetStale (0 active)
# Warning alert: the difference between the current time and
# puppet_agent_last_run has exceeded 60 * 35 = 2100 seconds (35 minutes)
# for 5 minutes.
# NOTE(review): presumably puppet_agent_last_run is a Unix timestamp of the
# last agent run; confirm against the exporter.
alert: PuppetStale
expr: time() - puppet_agent_last_run > (60 * 35)
for: 5m
labels:
  severity: warn
annotations:
  logs: 'https://logstash-beta.wmflabs.org/app/kibana#/dashboard/puppet'
  # $value is the age computed by expr, rendered by humanizeDuration.
  summary: 'Puppet stale on {{$labels.instance}} for {{$value | humanizeDuration}}'
meta_PuppetHugeFail (0 active)
# Critical meta-alert: the expression evaluated by PuppetHugeFail (before its
# threshold comparison) has returned no data for 10 minutes, so that alert
# has nothing to evaluate.
alert: meta_PuppetHugeFail
expr: >-
  absent((sum(puppet_agent_failed) / count(puppet_agent_failed)) * 100)
for: 10m
labels:
  severity: crit
annotations:
  summary: 'Data not found for PuppetHugeFail'