prometheus监控kubernetes容器

prometheus.yaml

# Prometheus self-monitoring 普罗米修斯自我监控
groups:
    # Self-monitoring alert rules for Prometheus and Alertmanager (the restart
    # rule also matches the pushgateway job).
    #
    # Every `description` is a literal block scalar (`|-`) so that the VALUE and
    # LABELS lines render on their own lines; a quoted string broken across
    # physical lines would be folded into one line by the YAML parser.
    - name: 普罗米修斯-监控告警  # rule-group name
      rules:  # alerting rules that belong to this group
      # 1.1.1. Prometheus job missing
      # A Prometheus job has disappeared.
      - alert: PrometheusJobMissing  # alert name
        expr: absent(up{job="prometheus"})  # PromQL condition that triggers the alert
        for: 0m  # how long the condition must hold before firing (0m = immediately)
        labels:
          severity: warning  # alert severity
        annotations:  # human-readable notification text
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: |-
            A Prometheus job has disappeared
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.2. Prometheus target missing
      # A Prometheus target has disappeared. An exporter might be crashed.
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: |-
            A Prometheus target has disappeared. An exporter might be crashed.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.3. Prometheus all targets missing
      # A Prometheus job does not have living target anymore.
      # sum() is used rather than count(): count() returns no series at all
      # (never 0), so `count by (job) (up) == 0` could not fire. The sum of the
      # 0/1 `up` samples is 0 exactly when every target of the job is down.
      - alert: PrometheusAllTargetsMissing
        expr: sum by (job) (up) == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: |-
            A Prometheus job does not have living target anymore.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.4. Prometheus configuration reload failure
      # Prometheus configuration reload error.
      - alert: PrometheusConfigurationReloadFailure
        expr: prometheus_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: |-
            Prometheus configuration reload error
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.5. Prometheus too many restarts
      # Prometheus has restarted more than twice in the last 15 minutes.
      # It might be crashlooping.
      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: |-
            Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.6. Prometheus AlertManager configuration reload failure
      # AlertManager configuration reload error.
      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: alertmanager_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: |-
            AlertManager configuration reload error
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.7. Prometheus AlertManager config not synced
      # Configurations of AlertManager cluster instances are out of sync.
      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: |-
            Configurations of AlertManager cluster instances are out of sync
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.8. Prometheus AlertManager E2E dead man switch
      # Prometheus DeadManSwitch is an always-firing alert. It's used as an
      # end-to-end test of Prometheus through the Alertmanager.
      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: vector(1)
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: |-
            Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.9. Prometheus not connected to alertmanager
      # Prometheus cannot connect the alertmanager.
      - alert: PrometheusNotConnectedToAlertmanager
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: |-
            Prometheus cannot connect the alertmanager
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.10. Prometheus rule evaluation failures
      # Prometheus encountered {{ $value }} rule evaluation failures, leading to
      # potentially ignored alerts.
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.11. Prometheus template text expansion failures
      # Prometheus encountered {{ $value }} template text expansion failures.
      - alert: PrometheusTemplateTextExpansionFailures
        expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} template text expansion failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.12. Prometheus rule evaluation slow
      # Prometheus rule evaluation took more time than the scheduled interval.
      # It indicates a slower storage backend access or too complex query.
      - alert: PrometheusRuleEvaluationSlow
        expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: |-
            Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.13. Prometheus notifications backlog
      # The Prometheus notification queue has not been empty for 10 minutes.
      - alert: PrometheusNotificationsBacklog
        expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: |-
            The Prometheus notification queue has not been empty for 10 minutes
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.14. Prometheus AlertManager notification failing
      # Alertmanager is failing sending notifications.
      - alert: PrometheusAlertmanagerNotificationFailing
        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: |-
            Alertmanager is failing sending notifications
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.15. Prometheus target empty
      # Prometheus has no target in service discovery.
      - alert: PrometheusTargetEmpty
        expr: prometheus_sd_discovered_targets == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: |-
            Prometheus has no target in service discovery
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.16. Prometheus target scraping slow
      # Prometheus is scraping exporters slowly.
      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: |-
            Prometheus is scraping exporters slowly
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.17. Prometheus large scrape
      # Prometheus has many scrapes that exceed the sample limit.
      - alert: PrometheusLargeScrape
        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: |-
            Prometheus has many scrapes that exceed the sample limit
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.18. Prometheus target scrape duplicate
      # Prometheus has many samples rejected due to duplicate timestamps but
      # different values.
      - alert: PrometheusTargetScrapeDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: |-
            Prometheus has many samples rejected due to duplicate timestamps but different values
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.19. Prometheus TSDB checkpoint creation failures
      # Prometheus encountered {{ $value }} checkpoint creation failures.
      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} checkpoint creation failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.20. Prometheus TSDB checkpoint deletion failures
      # Prometheus encountered {{ $value }} checkpoint deletion failures.
      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} checkpoint deletion failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.21. Prometheus TSDB compactions failed
      # Prometheus encountered {{ $value }} TSDB compactions failures.
      - alert: PrometheusTsdbCompactionsFailed
        expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} TSDB compactions failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.22. Prometheus TSDB head truncations failed
      # Prometheus encountered {{ $value }} TSDB head truncation failures.
      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} TSDB head truncation failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.23. Prometheus TSDB reload failures
      # Prometheus encountered {{ $value }} TSDB reload failures.
      - alert: PrometheusTsdbReloadFailures
        expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} TSDB reload failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.24. Prometheus TSDB WAL corruptions
      # Prometheus encountered {{ $value }} TSDB WAL corruptions.
      - alert: PrometheusTsdbWalCorruptions
        expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} TSDB WAL corruptions
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.1.25. Prometheus TSDB WAL truncations failed
      # Prometheus encountered {{ $value }} TSDB WAL truncation failures.
      - alert: PrometheusTsdbWalTruncationsFailed
        expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: |-
            Prometheus encountered {{ $value }} TSDB WAL truncation failures
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

windows.yaml

 # 1.5. Windows Server : prometheus-community/windows_exporter (5 rules)
 groups:
    # Alert rules for Windows Server hosts, built on windows_* metrics
    # (windows_exporter, per the section header above).
    #
    # The group was previously named after the Docker group although it only
    # contains Windows rules; the name now says what the group holds.
    # Every `description` is a literal block scalar (`|-`) so that the VALUE and
    # LABELS lines render on their own lines instead of being folded into one.
    - name: Windows服务器-监控告警  # rule-group name
      rules:  # alerting rules that belong to this group
      # 1.5.1. Windows Server collector Error
      # Collector {{ $labels.collector }} was not successful.
      - alert: WindowsServerCollectorError
        expr: windows_exporter_collector_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Windows Server collector Error (instance {{ $labels.instance }})
          description: |-
            Collector {{ $labels.collector }} was not successful
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.5.2. Windows Server service Status
      # Windows Service state is not OK.
      - alert: WindowsServerServiceStatus
        expr: windows_service_status{status="ok"} != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Windows Server service Status (instance {{ $labels.instance }})
          description: |-
            Windows Service state is not OK
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.5.3. Windows Server CPU Usage
      # CPU usage is more than 80% (100 minus the average idle share).
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Windows Server CPU Usage (instance {{ $labels.instance }})
          description: |-
            CPU Usage is more than 80%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.5.4. Windows Server memory Usage
      # Memory usage is more than 90%.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Windows Server memory Usage (instance {{ $labels.instance }})
          description: |-
            Memory usage is more than 90%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.5.5. Windows Server disk Space Usage
      # Disk usage is more than 80%.
      - alert: WindowsServerDiskSpaceUsage
        expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Windows Server disk Space Usage (instance {{ $labels.instance }})
          description: |-
            Disk usage is more than 80%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

node-exporter.yaml

# Host and hardware : node-exporter (31 rules upstream; 25 are included below — 1.2.18-1.2.21 and 1.2.24-1.2.25 are omitted)
groups:
    # Host-level alert rules built on node_* metrics (node-exporter, per the
    # section header above).
    #
    # Every `description` is a literal block scalar (`|-`): the VALUE and LABELS
    # lines render on their own lines, and double quotes inside the text (for
    # example in `printf "%.0f"`) need no escaping.
    - name: 主机节点-监控告警  # rule-group name
      rules:  # alerting rules that belong to this group
      # 1.2.1. Host out of memory
      # Node memory is filling up (< 10% left).
      - alert: 主机内存  # alert name
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10  # PromQL condition
        for: 10m  # the condition must hold for 10 minutes before the alert fires
        labels:
          severity: warning  # alert severity
        annotations:  # human-readable notification text
          summary: 主机内存不足 (instance {{ $labels.instance }})
          description: |-
            节点内存已满(<10%)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.2. Host memory under memory pressure
      # The node is under heavy memory pressure. High rate of major page faults.
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: |-
            The node is under heavy memory pressure. High rate of major page faults
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.3. Host unusual network throughput in
      # Host network interfaces are probably receiving too much data (> 100 MB/s).
      - alert: 网卡接收数据
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机网络吞吐量 (instance {{ $labels.instance }})
          description: |-
            主机网络接口可能接收的数据太多 (> 100 MB/s)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.4. Host unusual network throughput out
      # Host network interfaces are probably sending too much data (> 100 MB/s).
      - alert: 网卡发送数据
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机网络吞吐量 (instance {{ $labels.instance }})
          description: |-
            主机网络接口可能发送太多数据 (> 100 MB/s)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.5. Host unusual disk read rate
      # Disk is probably reading too much data (> 50 MB/s).
      - alert: 主机磁盘异常读取
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘读取率 (instance {{ $labels.instance }})
          description: |-
            磁盘可能读取了太多数据 (> 50 MB/s)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.6. Host unusual disk write rate
      # Disk is probably writing too much data (> 50 MB/s).
      - alert: 主机异常磁盘写入
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: |-
            Disk is probably writing too much data (> 50 MB/s)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.7. Host out of disk space
      # Disk is almost full (< 20% left on a writable filesystem).
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: 主机磁盘空间
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘空间不足 (instance {{ $labels.instance }})
          description: |-
            磁盘快满了 (< 20% left)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.8. Host disk will fill in 24 hours
      # Filesystem is predicted to run out of space within the next 24 hours at
      # current write rate.
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: 主机磁盘将在24小时内填满
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘将占用24小时 (instance {{ $labels.instance }})
          description: |-
            文件系统预计将在未来24小时内以当前写入速率耗尽空间
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.9. Host out of inodes
      # The root filesystem is almost running out of free inodes (< 20% left).
      - alert: 主机inodes
        expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 20 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机已用inode(instance {{ $labels.instance }})
          description: |-
            磁盘的可用索引节点快用完了 (< 20% left)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.10. Host inodes will fill in 24 hours
      # Filesystem is predicted to run out of inodes within the next 24 hours at
      # current write rate.
      # The read-only filter must select the same mountpoint ("/") as the left
      # side: the join is ON (instance, device, mountpoint), so a different
      # mountpoint value can never match and the alert would never fire.
      - alert: 主机inode将在24小时内用完
        expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机索引节点将在24小时内用完 (instance {{ $labels.instance }})
          description: |-
            文件系统预计将在未来24小时内以当前写入速率耗尽inode
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.11. Host unusual disk read latency
      # Disk latency is growing (read operations > 100ms).
      - alert: 主机磁盘读取延迟
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘读取延迟 (instance {{ $labels.instance }})
          description: |-
            磁盘延迟正在增长 (读取操作 > 100ms)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.12. Host unusual disk write latency
      # Disk latency is growing (write operations > 100ms).
      - alert: 主机磁盘写入延迟
        expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘写入延迟 (instance {{ $labels.instance }})
          description: |-
            磁盘延迟正在增长 (写入操作 > 100ms)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.13. Host high CPU load
      # CPU load is > 80%, computed as 100 minus the average share of time spent
      # in mode="idle".
      - alert: 主机CPU高负载
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 主机高负载 (instance {{ $labels.instance }})
          description: |-
            CPU负载为 > 80%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.14. Host CPU steal noisy neighbor
      # CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot
      # instance may be out of credit.
      # mode="steal" is the time a virtualised guest could not use the CPU
      # because the hypervisor scheduled it for another OS.
      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: |-
            CPU窃取>10%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.15. Host context switching
      # Context switching is growing on node (> 15000 / s per CPU).
      # The threshold is an arbitrary number; the right value depends on the
      # nature of the application.
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      - alert: 主机上下文切换
        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 15000
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: |-
            Context switching is growing on node (> 15000 / s)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.16. Host swap is filling up
      # Swap is filling up (> 80%).
      - alert: 主机交换分区
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机交换已满 (instance {{ $labels.instance }})
          description: |-
            主机交换分区 (>80%)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.17. Host systemd service crashed
      # A systemd service crashed.
      - alert: systemd服务崩溃
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 主机systemd服务崩溃 (instance {{ $labels.instance }})
          description: |-
            systemd服务崩溃
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.22. Host kernel version deviations
      # Different kernel versions are running.
      - alert: 主机内核
        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: |-
            Different kernel versions are running
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.23. Host OOM kill detected
      # OOM kill detected.
      - alert: 检测到OOM杀死
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 检测到主机OOM终止 (instance {{ $labels.instance }})
          description: |-
            检测到OOM杀死
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.26. Host Network Receive Errors
      # Host {{ $labels.instance }} interface {{ $labels.device }} has
      # encountered receive errors in the last two minutes.
      # NOTE(review): the expression yields an error ratio (errors / packets),
      # so `printf "%.0f" $value` prints a rounded ratio, not an error count.
      - alert: 主机网络接收错误
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机网络接收错误 (instance {{ $labels.instance }})
          description: |-
            主机 {{ $labels.instance }} 接口 {{ $labels.device }} 在过去两分钟内收到错误遇到 {{ printf "%.0f" $value }} .
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.27. Host Network Transmit Errors
      # Host {{ $labels.instance }} interface {{ $labels.device }} has
      # encountered transmit errors in the last two minutes.
      # NOTE(review): the expression yields an error ratio (errors / packets),
      # so `printf "%.0f" $value` prints a rounded ratio, not an error count.
      - alert: 主机网络传输错误
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机网络传输错误 (instance {{ $labels.instance }})
          description: |-
            主机 {{ $labels.instance }} 接口 {{ $labels.device }} 在过去两分钟内收到错误遇到 {{ printf "%.0f" $value }}
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.28. Host Network Interface Saturated
      # The network interface "{{ $labels.device }}" on "{{ $labels.instance }}"
      # is getting overloaded. The interface is identified by the `device`
      # label, the same label the expression filters on.
      - alert: 主机网络接口
        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 主机网络接口饱和 (instance {{ $labels.instance }})
          description: |-
            网络接口 "{{ $labels.device }}" 在 "{{ $labels.instance }}" 已经超负荷了.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.29. Host conntrack limit
      # The number of conntrack entries is approaching the limit (> 80%).
      - alert: 连接数接近极限
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机连接数接近极限 (instance {{ $labels.instance }})
          description: |-
            主机连接数接近极限
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.30. Host clock skew
      # Clock skew detected. Clock is out of sync.
      - alert: 时钟偏移
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机时间偏移 (instance {{ $labels.instance }})
          description: |-
            检测到时钟偏移。时钟不同步.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
      # 1.2.31. Host clock not synchronising
      # Clock not synchronising.
      - alert: 主机时间不同步
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机时间不同步 (instance {{ $labels.instance }})
          description: |-
            时钟不同步。
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

docker.yaml

# Docker containers : google/cAdvisor (6 rules)
groups:
    - name: Docker容器-监控告警                                               #组名,报警规则组名称
      rules:                                                                #定义角色
# 1.3.1. Container killed
# A container has disappeared
      # Fires when a container has not been seen for more than 60 seconds.
      # The description is a literal block scalar (`|-`) so the VALUE and LABELS
      # lines render on their own lines instead of being folded into one.
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Container killed (instance {{ $labels.instance }})
          description: |-
            A container has disappeared
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
# 1.3.2. Container CPU usage
# Container CPU usage is above 80%.
# cAdvisor can sometimes consume a lot of CPU, which would make this alert fire
# constantly; the name=~".+" matcher keeps only series with a non-empty name.
# Uses container_cpu_usage_seconds_total (total CPU time) rather than the
# system-only counter, so the value matches the "CPU usage" wording.
# "instance" is part of the grouping because the summary template reads it;
# each grouping label is listed once.
      - alert: 容器cpu使用量
        expr: sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (instance,name,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器cpu使用量 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "容器cpu使用量达到80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Container memory usage: working set as a percentage of the memory limit.
      # An alert expression fires for every series it returns, so a threshold is
      # required; 80 matches the other container rules in this group.
      # The "> 0" filter drops series whose limit is 0, which would otherwise
      # divide by zero.
      - alert: 容器内存使用率
        expr: (container_memory_working_set_bytes / (container_spec_memory_limit_bytes > 0)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器内存使用率
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "容器内存使用率是\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

# 1.3.5. Container Volume IO usage
# Container Volume IO usage is above 80%.
# Fires when the sum of container_fs_io_current per (instance, name),
# multiplied by 100, stays above 80 for 2 minutes.
# NOTE(review): the alert name refers to disk usage while the expression reads
# an I/O metric - confirm which one is intended.
      - alert: 容器磁盘使用量
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Volume IO usage (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.3.6. Container high throttle rate
# Container is being throttled.
# Fires when the 3m rate of container_cpu_cfs_throttled_seconds_total stays
# above 1 for 2 minutes.
# All keys of the rule are aligned with "alert" (8 columns) so they belong to
# the same mapping.
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container high throttle rate (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

blackbox.yaml

# 1.4. Blackbox : prometheus/blackbox_exporter (8 rules)
# Top level of the rule file: one rule group holding the probe alerts.
groups:
    - name: Blackbox黑匣子-监控告警                                               # group name of this alerting rule group
      rules:                                                                # list of alerting rules in the group
# 1.4.1. Blackbox probe failed
# Probe failed.
# Fires immediately (for: 0m) for every series where probe_success is 0.
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe failed (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Probe failed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.2. Blackbox slow probe
# Blackbox probe took more than 1s to complete.
# Fires when the 1m average of probe_duration_seconds stays above 1 for
# 1 minute.
      - alert: BlackboxSlowProbe
        expr: avg_over_time(probe_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox slow probe (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Blackbox probe took more than 1s to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.3. Blackbox probe HTTP failure
# HTTP status code is not 200-399.
# Fires immediately when probe_http_status_code is <= 199 or >= 400.
# All keys of the rule are aligned with "alert" (8 columns) so they belong to
# the same mapping.
      - alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "HTTP status code is not 200-399\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.4. Blackbox SSL certificate will expire soon
# SSL certificate expires in 30 days.
# Fires when probe_ssl_earliest_cert_expiry is less than 86400 * 30 seconds
# (30 days) ahead of the evaluation time. There is no lower bound, so
# certificates closer to expiry (or already expired) match as well.
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "SSL certificate expires in 30 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.5. Blackbox SSL certificate will expire soon
# SSL certificate expires in 3 days.
# Same alert name as the 30-day rule but with severity critical: fires when
# less than 86400 * 3 seconds (3 days) remain before
# probe_ssl_earliest_cert_expiry.
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "SSL certificate expires in 3 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.6. Blackbox SSL certificate expired
# SSL certificate has expired already.
# Fires immediately when probe_ssl_earliest_cert_expiry is at or before the
# evaluation time.
      - alert: BlackboxSslCertificateExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "SSL certificate has expired already\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.7. Blackbox probe slow HTTP
# HTTP request took more than 1s.
# Fires when the 1m average of probe_http_duration_seconds stays above 1 for
# 1 minute.
      - alert: BlackboxProbeSlowHttp
        expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "HTTP request took more than 1s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 1.4.8. Blackbox probe slow ping
# Blackbox ping took more than 1s.
# Fires when the 1m average of probe_icmp_duration_seconds stays above 1 for
# 1 minute.
      - alert: BlackboxProbeSlowPing
        expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow ping (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Blackbox ping took more than 1s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

kube-state-metrics.yaml

# 5.1. Kubernetes: kube-state-metrics (33 rules in the original list; the
# numbering below skips the ones that are not reproduced here).
# Top level of the rule file: one rule group holding the Kubernetes alerts.
groups:
    # Group name. Named after its content so it is not confused with the
    # "Docker容器-监控告警" group that holds the cAdvisor container rules.
    - name: Kubernetes-监控告警
      # List of alerting rules in the group.
      rules:
# 5.1.1. Kubernetes Node ready
# Node {{ $labels.node }} has been unready for a long time.
# Fires when the Ready/"true" series of kube_node_status_condition stays at 0
# for 10 minutes.
      - alert: 节点断开连接
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 节点断开连接 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "节点 {{ $labels.node }} 已经很长时间没有联系上了\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.2. Kubernetes memory pressure
# {{ $labels.node }} has MemoryPressure condition.
# Fires when the MemoryPressure/"true" series of kube_node_status_condition
# stays at 1 for 2 minutes.
      - alert: k8s节点内存有压力
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: k8s节点内存有压力 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "{{ $labels.node }} 是否存在内存有压力\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.3. Kubernetes disk pressure
# {{ $labels.node }} has DiskPressure condition.
# Fires when the DiskPressure/"true" series of kube_node_status_condition
# stays at 1 for 2 minutes.
      - alert: k8s节点磁盘有压力
        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: k8s节点存在磁盘有压力 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.4. Kubernetes out of disk
# {{ $labels.node }} has OutOfDisk condition.
# Fires when the OutOfDisk/"true" series of kube_node_status_condition stays
# at 1 for 2 minutes.
      - alert: k8s磁盘不足
        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: K8s磁盘空间不足 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "{{ $labels.node }} 磁盘空间不足\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.5. Kubernetes out of capacity
# {{ $labels.node }} is out of capacity.
# Counts Running pods per node and divides by the node's allocatable pod
# count; fires when the percentage stays above 90 for 2 minutes.
# The "+ on(pod, namespace) group_left(node) (0 * kube_pod_info)" term adds
# zero to each value and only serves to copy the node label onto the pod series.
      - alert: 容量不足
        expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum(kube_node_status_allocatable_pods) by (node) * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes 容量不足 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "{{ $labels.node }} 容量不足\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.6. Kubernetes container oom killer
# Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
# Left side: the restart counter grew by at least 1 compared with 10 minutes
# ago. Right side: the last terminated reason was OOMKilled for the whole
# 10m window. The alert value is the restart delta from the left side.
      - alert: 十分钟容器被kill的次数
        expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 十分钟pod被kill的次数 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "过去10分钟内容器 {{ $labels.container }} 在pod {{ $labels.namespace }}/{{ $labels.pod }} 被杀死了 {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.7. Kubernetes Job failed
# Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete.
# Fires immediately for every series where kube_job_status_failed is above 0.
      - alert: job 未能完成
        expr: kube_job_status_failed > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Job 未完成 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} 未能完成\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.9. Kubernetes PersistentVolumeClaim pending
# PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending.
# Fires when the Pending series of kube_persistentvolumeclaim_status_phase
# stays at 1 for 2 minutes.
      - alert: k8s volumeclaim 已挂起
        expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: k8s PersistentVolumeClaim 已挂起 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} 已挂起\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.12. Kubernetes PersistentVolume error
# Persistent volume is in bad state.
# Fires immediately when a Failed or Pending series of
# kube_persistentvolume_status_phase from job "kube-state-metrics" is above 0.
      - alert: 永久卷处于错误状态
        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: K8s 永久卷处于错误状态 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "永久卷处于错误状态\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.13. Kubernetes StatefulSet down
# A StatefulSet went down.
# Fires when the ratio of ready replicas to current replicas differs from 1
# for 1 minute.
# NOTE(review): with 0 current replicas the ratio is 0/0 - confirm whether a
# StatefulSet scaled to zero should alert.
      - alert: k8s 状态集
        expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes 状态集 down (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.17. Kubernetes Pod not healthy
# Pod has been in a non-ready state for longer than 15 minutes.
# The subquery samples, once per minute over 15 minutes, the per-pod sum of the
# Pending/Unknown/Failed phase series; min_over_time > 0 means every sample in
# the window was non-zero. The 15-minute wait is in the expression, hence
# for: 0m.
      - alert: POd 亚健康状态
        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: k8s Pod not healthy (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Pod已处于非就绪状态超过15分钟。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.18. Kubernetes pod crash looping
# Pod {{ $labels.pod }} is crash looping.
# Fires when the 1m increase of kube_pod_container_status_restarts_total stays
# above 3 for 2 minutes.
      - alert: K8s Pod CrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Pod {{ $labels.pod }} 崩溃循环\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.21. Kubernetes StatefulSet replicas mismatch
# A StatefulSet does not match the expected number of replicas.
# Fires when kube_statefulset_status_replicas_ready differs from
# kube_statefulset_status_replicas for 10 minutes.
      - alert: 状态集与副本的预期数量不匹配
        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes 状态集副本不匹配 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "状态集与副本的预期数量不匹配.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.23. Kubernetes StatefulSet generation mismatch
# A StatefulSet has failed but has not been rolled back.
# Fires when kube_statefulset_status_observed_generation differs from
# kube_statefulset_metadata_generation for 10 minutes.
      - alert: K8s状态集生成失配
        expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes 状态集生成失配 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "状态集已失败,但尚未被回滚。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.28. Kubernetes job slow completion
# Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
# Fires when kube_job_spec_completions exceeds kube_job_status_succeeded
# continuously for 12 hours.
      - alert: K8s Job 缓慢完成
        expr: kube_job_spec_completions - kube_job_status_succeeded > 0
        for: 12h
        labels:
          severity: critical
        annotations:
          summary: Kubernetes job 完成缓慢 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "K8s Job {{ $labels.namespace }}/{{ $labels.job_name }} 未及时完成.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.30. Kubernetes API client errors
# Kubernetes API client is experiencing high error rate.
# Per (instance, job): the share of rest_client_requests_total with a 4xx/5xx
# code in the 1m request rate, as a percentage. Fires when it stays above 1
# for 2 minutes.
      - alert: K8s API客户端错误
        expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes API客户端错误 (instance {{ $labels.instance }})
          # Kept on one line; "\n" escapes carry the intended line breaks.
          description: "Kubernetes API客户端遇到高错误率\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.31. Kubernetes client certificate expires next week
# A client certificate used to authenticate to the apiserver is expiring next week.
# Disabled rule: the whole rule, including the tail of its description, is
# commented out so that no stray text is left in the document.
#      - alert: KubernetesClientCertificateExpiresNextWeek
#        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
#        for: 0m
#        labels:
#          severity: warning
#        annotations:
#          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
#          description: "用于向apiserver进行身份验证的客户端证书将于下周过期。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.32. Kubernetes client certificate expires soon
# A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
# Disabled rule: the whole rule, including the tail of its description, is
# commented out so that no stray text is left in the document.
#      - alert: KubernetesClientCertificateExpiresSoon
#        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
#        for: 0m
#        labels:
#          severity: critical
#        annotations:
#          summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
#          description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# 5.1.33. Kubernetes API server latency
# Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
# Disabled rule: the whole rule, including the tail of its description, is
# commented out so that no stray text is left in the document.
#      - alert: KubernetesApiServerLatency
#        expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
#        for: 2m
#        labels:
#          severity: warning
#        annotations:
#          summary: Kubernetes API server latency (instance {{ $labels.instance }})
#          description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

注:未测试,谨慎使用

原文地址:https://www.cnblogs.com/fat-girl-spring/p/15045717.html