【监控】prometheus监控安装

部署

wget https://github.com/prometheus/prometheus/releases/download/v2.28.0/prometheus-2.28.0.linux-amd64.tar.gz

tar xf prometheus-2.28.0.linux-amd64.tar.gz

mv prometheus-2.28.0.linux-amd64 /usr/local/prometheus-2.28.0

vim /usr/local/prometheus-2.28.0/prometheus.yml 

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
# (targets intentionally left empty here; they are filled in later in this
# guide, after Alertmanager has been installed.)
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']

vim /usr/lib/systemd/system/prometheus.service

# systemd unit for the Prometheus server
# (saved as /usr/lib/systemd/system/prometheus.service).
[Unit]
Description=Prometheus Services
After=network.target remote-fs.target

[Service]
# 'simple': prometheus runs in the foreground and systemd tracks it directly.
Type=simple
ExecStart=/usr/local/prometheus-2.28.0/prometheus --config.file=/usr/local/prometheus-2.28.0/prometheus.yml --storage.tsdb.path=/usr/local/prometheus-2.28.0/
# Restart automatically 5 seconds after any abnormal exit.
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target

  systemctl daemon-reload
  systemctl restart prometheus.service

监控

wget https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz

tar xf node_exporter-1.1.2.linux-amd64.tar.gz

mv node_exporter-1.1.2.linux-amd64 /usr/local/node_exporter

# Install node_exporter as a systemd service.
# The here-doc delimiter is quoted ('EOF') so the unit file content is written
# literally, with no shell parameter/command expansion; the stray leading
# blank line from the original was dropped.
cat > /usr/lib/systemd/system/node_exporter.service << 'EOF'
[Unit]
Description=Prometheus Node Exporter Services
After=network.target remote-fs.target

[Service]
Type=simple
ExecStart=/usr/local/node_exporter/node_exporter
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload

systemctl start node_exporter

telegram报警

git clone https://github.com/nopp/alertmanager-webhook-telegram-python.git

yum install -y python3 python3-pip

cd alertmanager-webhook-telegram-python/

pip3 install -r requirements.txt

pip3 install python-dateutil

vim flaskAlert.py

import json
import logging
from time import sleep

import telegram
from dateutil import parser
from flask import Flask, request
from flask_basicauth import BasicAuth
# BUG FIX: these exception classes (and sleep above) were used but never
# imported in the original, so every except-clause raised NameError.
from telegram.error import NetworkError, RetryAfter, TimedOut

app = Flask(__name__)
app.secret_key = 'lAlAlA123'
basic_auth = BasicAuth(app)

# Telegram chat id; group chat ids start with '-'. Change it!
chatID = ""  # change it

# Authentication conf, change it!
app.config['BASIC_AUTH_FORCE'] = True
app.config['BASIC_AUTH_USERNAME'] = ''  # change it
app.config['BASIC_AUTH_PASSWORD'] = ''  # change it

# Bot token, change it!
bot = telegram.Bot(token="")  # change it


@app.route('/alert', methods=['POST'])
def postAlertmanager():
    """Alertmanager webhook endpoint: forwards every alert in the POSTed
    payload to Telegram as one formatted message per alert.

    Always answers HTTP 200 (even on failure) so Alertmanager does not
    retry endlessly; transient Telegram errors are retried inline.
    """
    message = ""  # pre-bind so the retry handlers below can resend it safely
    try:
        content = json.loads(request.get_data())
        for alert in content['alerts']:
            # BUG FIX: the original string literals contained raw line
            # breaks (a SyntaxError); they are "\n" escapes here.
            message = "Status: " + alert['status'] + "\n"
            if 'name' in alert['labels']:
                message += ("Instance: " + alert['labels']['instance'] +
                            "(" + alert['labels']['name'] + ")\n")
            else:
                message += "Instance: " + alert['labels']['instance'] + "\n"
            if 'info' in alert['annotations']:
                message += "Info: " + alert['annotations']['info'] + "\n"
            if 'summary' in alert['annotations']:
                message += "Summary: " + alert['annotations']['summary'] + "\n"
            if 'description' in alert['annotations']:
                message += "Description: " + alert['annotations']['description'] + "\n"
            if alert['status'] == "resolved":
                correctDate = parser.parse(alert['endsAt']).strftime('%Y-%m-%d %H:%M:%S')
                message += "Resolved: " + correctDate
            elif alert['status'] == "firing":
                correctDate = parser.parse(alert['startsAt']).strftime('%Y-%m-%d %H:%M:%S')
                message += "Started: " + correctDate
            bot.sendMessage(chat_id=chatID, text=message)
        # BUG FIX: the original returned inside the loop, so only the first
        # alert of each webhook payload was ever delivered.
        return "Alert OK", 200
    except RetryAfter:
        # Telegram flood control: back off, then resend the last message.
        sleep(30)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except TimedOut:
        sleep(60)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except NetworkError:
        sleep(60)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except Exception as error:
        bot.sendMessage(chat_id=chatID, text="Error: " + str(error))
        app.logger.info("\t%s", error)
        return "Alert fail", 200


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    app.run(host='0.0.0.0', port=9119)

  

nohup python3 flaskAlert.py &

测试

curl -XPOST --data '{"status":"resolved","groupLabels":{"alertname":"instance_down"},"commonAnnotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"},"alerts":[{"status":"resolved","labels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"},"endsAt":"2019-07-01T16:16:19.376244942-03:00","generatorURL":"http://pmts.io:9090","startsAt":"2019-07-01T16:02:19.376245319-03:00","annotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"}}],"version":"4","receiver":"infra-alert","externalURL":"http://alm.io:9093","commonLabels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"}}' http://username:password@flaskAlert:9119/alert

  

安装alertmanager

wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz

tar xf alertmanager-0.22.2.linux-amd64.tar.gz

mv alertmanager-0.22.2.linux-amd64 /usr/local/alertmanager

cd /usr/local/alertmanager/

vim alertmanager.yml

# alertmanager.yml — route every alert to the Telegram webhook bridge.
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  # NOTE(review): 'alertmananger' is misspelled, but consistently so in both
  # the route and the receiver definition, so it works as-is.
  receiver: 'alertmananger-bot'
receivers:
- name: 'alertmananger-bot'
  webhook_configs:
  - send_resolved: true
    # flaskAlert.py from the previous section listens on this port.
    url: http://127.0.0.1:9119/alert
    http_config:
      basic_auth:
        # Must match BASIC_AUTH_USERNAME / BASIC_AUTH_PASSWORD in flaskAlert.py.
        username: 'goroutine'
        password: 'goroutine-12345'
templates:
  - '/usr/local/alertmanager/test.tmpl'


########### /usr/local/alertmanager/test.tmpl ############
{{/* test.tmpl — notification template "test.html": renders one <pre> section
     per alert. Referenced from the templates: list in alertmanager.yml. */}}
{{ define "test.html" }}
  {{ range .Alerts }}
 <pre>
故障实例: {{ .Labels.instance }}
故障概要: {{ .Annotations.summary }}
故障描述: {{ .Annotations.description }}
告警级别: {{ .Labels.severity }}
告警时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} 
 </pre>
  {{ end }}
{{ end }}

  

nohup /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data > /usr/local/alertmanager/alertmanager.log &

测试

#!/usr/bin/env bash
# Push two synthetic alerts straight into Alertmanager's v1 API to verify the
# route -> webhook -> Telegram chain without waiting for Prometheus to fire.
set -euo pipefail

alerts_message='[
  {
    "labels": {
       "alertname": "DiskRunningFull",
       "dev": "sda1",
       "instance": "example1",
       "msgtype": "testing"
     },
     "annotations": {
        "info": "The disk sda1 is running full",
        "summary": "please check the instance example1"
      }
  },
  {
    "labels": {
       "alertname": "DiskRunningFull",
       "dev": "sda2",
       "instance": "example1",
       "msgtype": "testing"
     },
     "annotations": {
        "info": "The disk sda2 is running full",
        "summary": "please check the instance example1",
        "runbook": "the following link http://test-url should be clickable"
      }
  }
]'

# -f: fail (non-zero exit) on HTTP errors; -sS: quiet, but still show errors.
curl -fsS -XPOST -d"$alerts_message" http://127.0.0.1:9093/api/v1/alerts

  

prometheus修改

/usr/local/prometheus-2.28.0/prometheus.yml

alerting:
  alertmanagers:
  - static_configs:
    - targets:
       # NOTE(review): 'alertmanager' must resolve via DNS or /etc/hosts; in
       # this single-host walkthrough 127.0.0.1:9093 is likely what is meant
       # (the curl test above targets 127.0.0.1:9093) — confirm.
       - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"

  

rules/base_rules.yml

# rules/base_rules.yml — node_exporter alerting rules.
# All expressions reference recording rules (node_exporter:*) that must be
# defined elsewhere; each alert carries a Grafana deep-link annotation
# (replace "IP" with the Grafana host).
groups:
  - name: node-exporter-alert
    rules:
    - alert: node-exporter-down
      expr: node_exporter:up == 0
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 宕机了"
        # BUG FIX: this string was accidentally broken across two physical
        # lines in the original; it is one line here.
        description: "instance: {{ $labels.instance }} - job: {{ $labels.job }} 关机了, 时间已经1分钟了。"
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-cpu-high
      expr:  node_exporter:cpu:total:percent > 80
      for: 3m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} cpu 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-cpu-iowait-high
      expr:  node_exporter:cpu:iowait:percent >= 12
      for: 3m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} cpu iowait 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-load-load1-high
      expr:  (node_exporter:load:load1) > (node_exporter:cpu:count) * 1.2
      for: 3m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} load1 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-memory-high
      expr:  node_exporter:memory:used:percent > 85
      for: 3m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} memory 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-disk-high
      expr:  node_exporter:disk:used:percent > 88
      for: 10m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} disk 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    # CONSISTENCY FIX: renamed from "node-exporter-disk-read:count-high";
    # the ':' was inconsistent with every sibling alert name.
    - alert: node-exporter-disk-read-count-high
      expr:  node_exporter:disk:read:count:rate > 3000
      for: 2m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} iops read 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-disk-write-count-high
      expr:  node_exporter:disk:write:count:rate > 3000
      for: 2m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} iops write 使用率高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-disk-read-mb-high
      expr:  node_exporter:disk:read:mb:rate > 60
      for: 2m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 读取字节数 高于 {{ $value }}"
        description: ""
        instance: "{{ $labels.instance }}"
        value: "{{ $value }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-disk-write-mb-high
      expr:  node_exporter:disk:write:mb:rate > 60
      for: 2m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 写入字节数 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-filefd-allocated-percent-high
      expr:  node_exporter:filefd_allocated:percent > 80
      for: 10m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 打开文件描述符 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-network-netin-error-rate-high
      expr:  node_exporter:network:netin:error:rate > 4
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 包进入的错误速率 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-network-netin-packet-rate-high
      expr:  node_exporter:network:netin:packet:rate > 35000
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 包进入速率 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-network-netout-packet-rate-high
      expr:  node_exporter:network:netout:packet:rate > 35000
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 包流出速率 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-network-tcp-total-count-high
      expr:  node_exporter:network:tcp:total:count > 40000
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} tcp连接数量 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-process-zoom-total-count-high
      expr:  node_exporter:process:zoom:total:count > 10
      for: 10m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} 僵死进程数量 高于 {{ $value }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

    - alert: node-exporter-time-offset-high
      expr:  node_exporter:time:offset > 0.03
      for: 2m
      labels:
        severity: info
      annotations:
        summary: "instance: {{ $labels.instance }} {{ $labels.desc }}  {{ $value }} {{ $labels.unit }}"
        description: ""
        value: "{{ $value }}"
        instance: "{{ $labels.instance }}"
        grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
        type: "google-cloud"

  

systemctl restart prometheus

原文地址:https://www.cnblogs.com/wangshuyang/p/14984258.html