Prometheus+Grafana+Alertmanager监控告警(一)

参考

https://blog.csdn.net/yanggd1987/article/details/108807171
https://help.aliyun.com/document_detail/123394.html
https://blog.csdn.net/baidu_36943075/article/details/91829364
https://blog.csdn.net/liukuan73/article/details/78881008
https://blog.csdn.net/aixiaoyang168/article/details/98474494

安装启动Prometheus

cat prome_run.sh

# Recreate the Prometheus container: remove any existing one, then start a
# fresh one. Every line of the `docker run` command except the last must end
# with a backslash, otherwise the shell runs each line as a separate command.
# Arguments after the image name are passed to the prometheus binary itself.
# --storage.tsdb.retention.time replaces the deprecated --storage.tsdb.retention.
# --web.enable-lifecycle enables the HTTP reload/quit endpoints.
docker rm -f prometheus
docker run --name=prometheus -d \
  --restart=always \
  -p 9090:9090 \
  -v /data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /data/prometheus/rules.yml:/etc/prometheus/rules.yml \
  -v /data/prometheus/data:/data \
  -v /data/prometheus/host_discovery_data:/host_discovery_data \
  -v /data/prometheus/prometheus_rules:/prometheus_rules \
  -v /etc/localtime:/etc/localtime \
  prom/prometheus:v2.30.2 \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/data \
  --storage.tsdb.retention.time=30d \
  --web.external-url=http://10.5.250.10 \
  --web.enable-lifecycle

注意:宿主机上的 /data/prometheus/rules.yml 是个目录;告警规则实际放在 prometheus_rules 目录里(对应配置中的 /prometheus_rules/*.rules)。

cat prometheus.yml

global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  scrape_timeout:      15s # Per-scrape timeout, set explicitly to 15s here (the global default is 10s).
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# Rule files to load; every *.rules file under /prometheus_rules is matched.
rule_files:
- /prometheus_rules/*.rules
scrape_configs:
# Prometheus scrapes its own metrics endpoint.
- job_name: prometheus
  static_configs:
  - targets:
    - localhost:9090
# Hosts discovered from the JSON target files under /host_discovery_data.
- job_name: 'host_discovery'
  file_sd_configs:
    - files:
      - "/host_discovery_data/*.json"
      # Re-read the target files every 3 seconds.
      refresh_interval: 3s

# Scrape kube-state-metrics through the Kubernetes API-server pod proxy.
- job_name: "kube-state-metrics"
  scheme: https
  tls_config:
    insecure_skip_verify: true
  # Bearer token used to authenticate against the API server (the decoded
  # token obtained from the API-server authorization step), stored as a file
  bearer_token_file: /data/xn-secret
  # Kubernetes service-discovery settings
  kubernetes_sd_configs:
  # Discover targets at the endpoints level
  - role: endpoints
    api_server: "https://10.3.218.10:16443"
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /data/xn-secret
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_name]
    # Keep only targets whose service name matches the regex; all others are dropped
    action: keep
    #regex: '^(kube-state-metrics)$'
    regex: '^(prometheus-operator-kube-state-metrics)$'
#  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    # Keep only targets whose annotation matches the regex; all others are dropped
#    action: keep
#    regex: true
  # Preserve the discovered address as the instance label before it is overwritten below
  - source_labels: [__address__]
    action: replace
    target_label: instance
  - target_label: __address__
    # Overwrite the default __address__ with the replacement value (the API server)
    replacement: 10.3.218.10:16443
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_name, __meta_kubernetes_pod_container_port_number]
    # Regex applied to the source labels joined by ';': namespace;pod;port
    regex: ([^;]+);([^;]+);([^;]+)
    # Overwrite the default __metrics_path__ with the replacement value
    target_label: __metrics_path__
    # Hand-built API-server proxy URL
    replacement: /api/v1/namespaces/${1}/pods/http:${2}:${3}/proxy/metrics
  # Copy every Kubernetes service label onto the target
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    # Copy __meta_kubernetes_namespace into the kubernetes_namespace label
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    # Copy __meta_kubernetes_service_name into the service_name label
    target_label: service_name

# Scrape node-exporter (port 9100 on each node) through the Kubernetes API-server node proxy.
- job_name: "kube-node-exporter"
  scheme: https
  tls_config:
    insecure_skip_verify: true
  # Bearer token used to authenticate against the API server (the decoded
  # token obtained from the API-server authorization step), stored as a file
  bearer_token_file: /data/xn-secret
  # Kubernetes service-discovery settings
  kubernetes_sd_configs:
  # Discover targets at the endpoints level
  - role: endpoints
    api_server: "https://10.3.218.10:16443"
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /data/xn-secret
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_name]
    # Keep only targets whose service name matches the regex; all others are dropped
    action: keep
    regex: '^(prometheus-operator-prometheus-node-exporter)$'
  # Preserve the discovered address as the instance label before it is overwritten below
  - source_labels: [__address__]
    action: replace
    target_label: instance
  - target_label: __address__
    # Overwrite the default __address__ with the replacement value (the API server)
    replacement: 10.3.218.10:16443
  - source_labels: [__meta_kubernetes_endpoint_node_name]
    # Regex match: capture the node name
    regex: (.+)
    # Overwrite the default __metrics_path__ with the replacement value
    target_label: __metrics_path__
    # Hand-built API-server proxy URL
    replacement: /api/v1/nodes/${1}:9100/proxy/metrics
  # Copy every Kubernetes service label onto the target
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    # Copy __meta_kubernetes_namespace into the kubernetes_namespace label
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    # Copy __meta_kubernetes_service_name into the service_name label
    target_label: service_name

#pods
#- job_name: "kube-pods"
#  scheme: https
#  tls_config:
#    insecure_skip_verify: true
#  #使用apiserver授权部分解密的token值,以文件形式存储
#  bearer_token_file: /data/xn-secret
#  # k8s自动发现具体配置
#  kubernetes_sd_configs:
#  # 使用endpoint级别自动发现
#  - role: pod
#    api_server: "https://10.3.218.10:16443"
#    tls_config:
#      insecure_skip_verify: true
#    bearer_token_file: /data/xn-secret
#  relabel_configs:
#  - source_labels: [__address__]
#    action: replace
#    target_label: instance
#  - target_label: __address__
#    # 使用replacement值替换__address__默认值
#    replacement: 10.3.218.10:16443
#  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_name, __meta_kubernetes_pod_container_port_number]
#    # 正则匹配
#    regex: ([^;]+);([^;]+);([^;]+)
#    # 使用replacement值替换__metrics_path__默认值
#    target_label: __metrics_path__
#    # 自行构建的apiserver proxy url
#    replacement: /api/v1/namespaces/${1}/pods/http:${2}:${3}/proxy/metrics
#  - action: labelmap
#    regex: __meta_kubernetes_service_label_(.+)
#  - source_labels: [__meta_kubernetes_namespace]
#    action: replace
#    # 将标签__meta_kubernetes_namespace修改为kubernetes_namespace
#    target_label: kubernetes_namespace
#  - source_labels: [__meta_kubernetes_service_name]
#    action: replace
#    # 将标签__meta_kubernetes_service_name修改为service_name
#    target_label: service_name


# kubelet: scrape each node's kubelet metrics (port 10250) through the
# Kubernetes API-server node proxy.
- job_name: "kube-node-kubelet"
  scheme: https
  tls_config:
    insecure_skip_verify: true
  bearer_token_file: /data/xn-secret
  kubernetes_sd_configs:
  # Discover one target per cluster node
  - role: node
    api_server: "https://10.3.218.10:16443"
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /data/xn-secret
  relabel_configs:
  - target_label: __address__
    # Overwrite the default __address__ with the replacement value (the API server)
    replacement: 10.3.218.10:16443
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    # Overwrite the default __metrics_path__ with the replacement value
    target_label: __metrics_path__
    replacement: /api/v1/nodes/${1}:10250/proxy/metrics
  # NOTE(review): the Prometheus docs list only __meta_kubernetes_node_* labels
  # for role: node, so the three service/namespace rules below look like
  # no-ops copied from the endpoints jobs; __meta_kubernetes_node_label_(.+)
  # may have been intended - confirm before changing.
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: service_name
  # Expose the node's InternalIP address as the IP label
  - source_labels: [__meta_kubernetes_node_address_InternalIP]
    separator: ;
    regex: (.*)
    target_label: IP
    replacement: $1
    action: replace

# cAdvisor: scrape each node's kubelet /metrics/cadvisor endpoint (port 10250)
# through the Kubernetes API-server node proxy.
- job_name: "kube-node-cadvisor"
  scheme: https
  tls_config:
    insecure_skip_verify: true
  bearer_token_file: /data/xn-secret
  kubernetes_sd_configs:
  # Discover one target per cluster node
  - role: node
    api_server: "https://10.3.218.10:16443"
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /data/xn-secret
  relabel_configs:
  - target_label: __address__
    # Overwrite the default __address__ with the replacement value (the API server)
    replacement: 10.3.218.10:16443
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    # Overwrite the default __metrics_path__ with the replacement value
    target_label: __metrics_path__
    replacement: /api/v1/nodes/${1}:10250/proxy/metrics/cadvisor
  # NOTE(review): the Prometheus docs list only __meta_kubernetes_node_* labels
  # for role: node, so the three service/namespace rules below look like
  # no-ops copied from the endpoints jobs; __meta_kubernetes_node_label_(.+)
  # may have been intended - confirm before changing.
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: service_name

# Alertmanager endpoint(s) that alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
      - targets: ['10.5.250.10:9093']
View Code

host_discovery_data自动将list转化成json

cat redis.list

10.3.238.75     xinxi-prod-redis-caiwujihe01
10.3.238.76     xinxi-prod-redis-caiwujihe02
10.5.250.29     xinxi-prod-redis-caiwushoudan01
10.5.250.188    xinxi-prod-redis-caiwushoudan02
10.5.250.8      xinxi-prod-redis-caigou
10.5.250.133    xinxi-prod-redis-tanxiao
10.5.250.133    xinxi-prod-redis-caigoufentan
10.5.250.175    xinxi-prod-redis-fawuhetong
10.3.238.86     xinxi-prod-redis-fawushoudan01
10.3.238.87     xinxi-prod-redis-fawushoudan02
10.3.238.173    xinxi-prod-redis-shucang01
10.3.238.174    xinxi-prod-redis-shucang02
10.3.238.186    xinxi-prod-redis-touzi
10.3.238.55     xinxi-prod-redis-rencai01
10.3.238.56     xinxi-prod-redis-rencai02
10.3.215.182    xinxi-prod-redis-changqijili01
10.3.215.195    xinxi-prod-redis-changqijili02
10.3.238.245    xinxi-prod-redis-zijinjianguan01
10.3.238.246    xinxi-prod-redis-zijinjianguan02
10.3.248.6      xinxi-prod-redis-caigouToC01
10.3.248.7      xinxi-prod-redis-caigouToC02
cat redis.json
[{"targets": ["10.3.238.75:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.75", "alias": "xinxi-prod-redis-caiwujihe01"}}, {"targets": ["10.3.238.76:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.76", "alias": "xinxi-prod-redis-caiwujihe02"}}, {"targets": ["10.5.250.29:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.29", "alias": "xinxi-prod-redis-caiwushoudan01"}}, {"targets": ["10.5.250.188:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.188", "alias": "xinxi-prod-redis-caiwushoudan02"}}, {"targets": ["10.5.250.8:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.8", "alias": "xinxi-prod-redis-caigou"}}, {"targets": ["10.5.250.133:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.133", "alias": "xinxi-prod-redis-tanxiao"}}, {"targets": ["10.5.250.133:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.133", "alias": "xinxi-prod-redis-caigoufentan"}}, {"targets": ["10.5.250.175:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.175", "alias": "xinxi-prod-redis-fawuhetong"}}, {"targets": ["10.3.238.86:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.86", "alias": "xinxi-prod-redis-fawushoudan01"}}, {"targets": ["10.3.238.87:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.87", "alias": "xinxi-prod-redis-fawushoudan02"}}, {"targets": ["10.3.238.173:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.173", "alias": "xinxi-prod-redis-shucang01"}}, {"targets": ["10.3.238.174:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.174", "alias": "xinxi-prod-redis-shucang02"}}, {"targets": ["10.3.238.186:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.186", "alias": "xinxi-prod-redis-touzi"}}, {"targets": ["10.3.238.55:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.55", "alias": "xinxi-prod-redis-rencai01"}}, {"targets": ["10.3.238.56:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.56", "alias": 
"xinxi-prod-redis-rencai02"}}, {"targets": ["10.3.215.182:9100"], "labels": {"cluster": "redis", "instance": "10.3.215.182", "alias": "xinxi-prod-redis-changqijili01"}}, {"targets": ["10.3.215.195:9100"], "labels": {"cluster": "redis", "instance": "10.3.215.195", "alias": "xinxi-prod-redis-changqijili02"}}, {"targets": ["10.3.238.245:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.245", "alias": "xinxi-prod-redis-zijinjianguan01"}}, {"targets": ["10.3.238.246:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.246", "alias": "xinxi-prod-redis-zijinjianguan02"}}, {"targets": ["10.3.248.6:9100"], "labels": {"cluster": "redis", "instance": "10.3.248.6", "alias": "xinxi-prod-redis-caigouToC01"}}, {"targets": ["10.3.248.7:9100"], "labels": {"cluster": "redis", "instance": "10.3.248.7", "alias": "xinxi-prod-redis-caigouToC02"}}]

转换脚本

# -*- coding: utf-8 -*-
import os
import logging
import json
import time


def log_level(level):
    """Configure the root logger for this script.

    Args:
        level: Name of the desired verbosity. ``"DEBUG"`` selects debug
            logging and emits two informational start-up messages; any
            other value selects INFO.
    """
    debug = level == "DEBUG"
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO,
                        format='%(asctime)s - %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d,%H:%M:%S',
                        )
    if debug:
        # Log the requested level itself. The previous code passed the
        # function object ``log_level`` here, so its repr was printed.
        logging.info("log_level:%s", level)
        logging.info("Debug mode")


def read_list_file(data_dir='host_discovery_data', port=9100):
    """Convert every ``*.list`` host file in *data_dir* into a JSON target file.

    Each line of ``<cluster>.list`` is expected to hold an IP address followed
    by an alias, separated by whitespace. For every such file a
    ``<cluster>.json`` is written into the same directory, containing one
    entry per host of the form::

        {"targets": ["<ip>:<port>"],
         "labels": {"cluster": "<cluster>", "instance": "<ip>",
                    "alias": "<alias>"}}

    Blank lines are ignored; lines with only one field are skipped with a
    warning instead of aborting the whole conversion.

    Args:
        data_dir: Directory that is scanned for ``.list`` files and that
            receives the ``.json`` files. Defaults to ``host_discovery_data``
            relative to the current working directory.
        port: Port appended to every IP address to build the target.
    """
    suffix = '.list'
    for list_name in os.listdir(data_dir):
        if not list_name.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace would also remove
        # any other ".list" occurring inside the file name.
        cluster = list_name[:-len(suffix)]

        nodes = []
        # "with" guarantees the handle is closed even if a line fails to parse.
        with open(os.path.join(data_dir, list_name), "r") as fo:
            for line_no, fo_line in enumerate(fo, 1):
                # split() without arguments discards the trailing newline and
                # any run of spaces/tabs between the columns.
                fields = fo_line.split()
                if not fields:
                    continue
                if len(fields) < 2:
                    logging.warning("%s:%d: expected '<ip> <alias>', got %r",
                                    list_name, line_no, fo_line)
                    continue
                node_ip, node_alias = fields[0], fields[1]
                nodes.append({
                    'targets': ['%s:%s' % (node_ip, port)],
                    'labels': {
                        'cluster': cluster,
                        'instance': node_ip,
                        'alias': node_alias,
                    },
                })

        node_json = json.dumps(nodes)
        logging.debug("node_json: %s - type %s", node_json, type(node_json))
        # Write the JSON next to the source list; closing the file flushes it.
        with open(os.path.join(data_dir, cluster + '.json'), "w") as fw:
            fw.write(node_json)


def run():
    """Set up INFO-level logging, then convert the list files in an endless loop.

    After configuring logging, this calls ``read_list_file()`` and sleeps for
    10 seconds, repeating indefinitely; the function never returns normally.
    """
    # Configure the log level once, before entering the loop.
    log_level("INFO")
    # Keep running: one conversion pass, then a 10-second pause.
    while True:
        read_list_file()
        time.sleep(10)

# Start the conversion loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run()
View Code
原文地址:https://www.cnblogs.com/litzhiai/p/15429664.html