kubernetes监控--Prometheus

本文基于kubernetes 1.5.2版本编写

kube-state-metrics

kubectl create ns monitoring
kubectl create sa -n monitoring kube-state-metrics

cat << EOF > kube-state-metrics.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: quay.io/coreos/kube-state-metrics
        ports:
        - containerPort: 8080
EOF
kubectl create -f kube-state-metrics.yaml

cat << EOF > kube-state-metrics-svc.yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app: kube-state-metrics
spec:
  ports:
  - name: kube-state-metrics
    port: 8080
    protocol: TCP
  selector:
    app: kube-state-metrics
EOF
kubectl create -f kube-state-metrics-svc.yaml

prom-node-exporter

cat << EOF > prom-node-exporter.yaml 
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      containers:
      - image: docker.io/prom/node-exporter:v0.14.0
        name: prometheus-node-exporter
        ports:
        - name: prom-node-exp
          #^ must be an IANA_SVC_NAME (at most 15 characters, ..)
          containerPort: 9100
          hostPort: 9100
      hostNetwork: true
      hostPID: true
EOF
kubectl create -f prom-node-exporter.yaml 


cat << EOF > prom-node-exporter-svc.yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  #clusterIP: None
  ports:
    - name: prometheus-node-exporter
      port: 9100
      protocol: TCP
  selector:
    app: prometheus
    component: node-exporter
  type: ClusterIP
EOF
kubectl create -f prom-node-exporter-svc.yaml 

node-directory-size-metrics

cat node-directory-size-metrics.yaml 
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-directory-size-metrics
  namespace: monitoring
  annotations:
    description: |
      This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes.
      The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now.
      The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus.
      These are scheduled on every node in the Kubernetes cluster.
      To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`.
spec:
  template:
    metadata:
      labels:
        app: node-directory-size-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '9102'
        description: |
          This `Pod` provides metrics in Prometheus format about disk usage on the node.
          The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now.
          The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus.
          This `Pod` is scheduled on every node in the Kubernetes cluster.
          To choose directories from the node to check just mount them on `read-du` below `/mnt`.
    spec:
      containers:
      - name: read-du
        image: giantswarm/tiny-tools
        imagePullPolicy: Always
        # FIXME threshold via env var
        # The
        command:
        - fish
        - --command
        - |
          touch /tmp/metrics-temp
          while true
            for directory in (du --bytes --separate-dirs --threshold=100M /mnt)
              echo $directory | read size path
              echo "node_directory_size_bytes{path="$path"} $size" 
                >> /tmp/metrics-temp
            end
            mv /tmp/metrics-temp /tmp/metrics
            sleep 300
          end
        volumeMounts:
        - name: host-fs-var
          mountPath: /mnt/var
          readOnly: true
        - name: metrics
          mountPath: /tmp
      - name: caddy
        image: dockermuenster/caddy:latest
        command:
        - "caddy"
        - "-port=9102"
        - "-root=/var/www"
        ports:
        - containerPort: 9102
        volumeMounts:
        - name: metrics
          mountPath: /var/www
      volumes:
      - name: host-fs-var
        hostPath:
          path: /var
      - name: metrics
        emptyDir:
          medium: Memory
kubectl create -f node-directory-size-metrics.yaml

prometheus

cat prometheus-configmap.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-core
  namespace: monitoring
data:
  prometheus.yaml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
      evaluation_interval: 30s
    rule_files:
      - "/etc/prometheus-rules/*.rules"
    scrape_configs:

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37
      - job_name: 'kubernetes-nodes'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:10255'
            target_label: __address__

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79
      - job_name: 'kubernetes-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: (.+)(?::d+);(d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: (.+):(?:d+);(d+)
            replacement: ${1}:${2}
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: 9d{3}
kubectl create -f prometheus-configmap.yaml 


cat prometheus-rules.yaml 
apiVersion: v1
data:
  cpu-usage.rules: |
    ALERT NodeCPUUsage
      IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High CPU usage detected",
        DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
      }
  instance-availability.rules: |
    ALERT InstanceDown
      IF up == 0
      FOR 1m
      LABELS { severity = "page" }
      ANNOTATIONS {
        summary = "Instance {{ $labels.instance }} down",
        description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
      }
  low-disk-space.rules: |
    ALERT NodeLowRootDisk
      IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low root disk space",
        DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
      }

    ALERT NodeLowDataDisk
      IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low data disk space",
        DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
      }
  mem-usage.rules: |
    ALERT NodeSwapUsage
      IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Swap usage detected",
        DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
      }

    ALERT NodeMemoryUsage
      IF (((node_memory_MemTotal-node_memory_MemFree-node_memory_Cached)/(node_memory_MemTotal)*100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High memory usage detected",
        DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
      }
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-rules
  namespace: monitoring
kubectl create -f prometheus-rules.yaml

kubectl create sa prometheus-k8s -n monitoring


cat << EOF > prometheus-core-deploy.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: prometheus-core
  namespace: monitoring
  labels:
    app: prometheus
    component: core
spec:
  replicas: 1
  template:
    metadata:
      name: prometheus-main
      labels:
        app: prometheus
        component: core
    spec:
      serviceAccountName: prometheus-k8s
      containers:
      - name: prometheus
        image: prom/prometheus:v1.7.1
        args:
          - '-storage.local.retention=12h'
          - '-storage.local.memory-chunks=500000'
          - '-config.file=/etc/prometheus/prometheus.yaml'
          - '-alertmanager.url=http://alertmanager:9093/'
        ports:
        - name: webui
          containerPort: 9090
        resources:
          requests:
            cpu: 500m
            memory: 500M
          limits:
            cpu: 500m
            memory: 500M
        volumeMounts:
        - name: config-volume
          mountPath: /etc/prometheus
        - name: rules-volume
          mountPath: /etc/prometheus-rules
      volumes:
      - name: config-volume
        configMap:
          name: prometheus-core
      - name: rules-volume
        configMap:
          name: prometheus-rules
EOF
kubectl create -f prometheus-core-deploy.yaml 


cat << EOF > prometheus-core-service.yaml 
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
    component: core
  annotations:
    prometheus.io/scrape: 'true'
spec:
  type: NodePort
  ports:
    - port: 9090
      protocol: TCP
      name: webui
  selector:
    app: prometheus
    component: core
EOF
kubectl create -f prometheus-core-service.yaml

grafana

cat << EOF > grafana-core-deployment.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: grafana-core
  namespace: monitoring
  labels:
    app: grafana
    component: core
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: grafana
        component: core
    spec:
      containers:
        - image: docker.io/grafana/grafana:latest
          name: grafana-core
          # env:
          resources:
            # keep request = limit to keep this container in guaranteed class
            limits:
              cpu: 100m
              memory: 100Mi
            requests:
              cpu: 100m
              memory: 100Mi
          ports:
            - name: grafana
              containerPort: 3000
          env:
            # This variable is required to setup templates in Grafana.
              # The following env variables are required to make Grafana accessible via
              # the kubernetes api-server proxy. On production clusters, we recommend
              # removing these env variables, setup auth for grafana, and expose the grafana
              # service using a LoadBalancer or a public IP.
            - name: GF_AUTH_BASIC_ENABLED
              value: "false"
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ORG_ROLE
              value: Admin
            # - name: GF_SERVER_ROOT_URL
            #   value: /api/v1/proxy/namespaces/monitoring/services/grafana/
          volumeMounts:
          - name: grafana-persistent-storage
            mountPath: /var
      volumes:
      - name: grafana-persistent-storage
        hostPath:
          emptyDir: {}
          #path: /grafanaData
EOF
kubectl create -f grafana-core-deployment.yaml 


cat << EOF > grafana-core-service.yaml 
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
    component: core
  # annotations:
  #   prometheus.io/scrape: 'true'
spec:
  type: NodePort
  ports:
    - port: 3000
      nodePort: 31000
  selector:
    app: grafana
    component: core
EOF
kubectl create -f grafana-core-service.yaml 

grafana模板

{
  "annotations": {
    "list": []
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "hideControls": false,
  "id": 21,
  "links": [],
  "refresh": false,
  "rows": [
    {
      "collapse": false,
      "height": 282,
      "panels": [
        {
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": null,
          "fill": 0,
          "height": "",
          "hideTimeOverride": false,
          "id": 1,
          "legend": {
            "alignAsTable": true,
            "avg": false,
            "current": true,
            "hideEmpty": false,
            "hideZero": false,
            "max": true,
            "min": false,
            "rightSide": true,
            "show": true,
            "sideWidth": null,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "minSpan": null,
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "repeat": null,
          "seriesOverrides": [],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(container_memory_usage_bytes{pod_name="$pod", namespace="$namespace"}) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "total",
              "refId": "B",
              "step": 30
            },
            {
              "expr": "sum(container_memory_rss{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "hide": false,
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "rss",
              "metric": "",
              "refId": "A",
              "step": 30
            },
            {
              "expr": "sum(container_memory_cache{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "cache",
              "refId": "D",
              "step": 30
            },
            {
              "expr": "sum(container_memory_swap{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "swap",
              "refId": "C",
              "step": 30
            },
            {
              "expr": "sum(container_memory_failures_total{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "hide": true,
              "intervalFactor": 2,
              "legendFormat": "failures_total",
              "refId": "E",
              "step": 20
            },
            {
              "expr": "sum(container_memory_failcnt{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "hide": true,
              "intervalFactor": 2,
              "legendFormat": "failcnt",
              "refId": "F",
              "step": 20
            },
            {
              "expr": "sum(container_memory_working_set_bytes{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "hide": true,
              "intervalFactor": 2,
              "legendFormat": "working_set",
              "refId": "G",
              "step": 20
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
          "title": "内存使用量",
          "tooltip": {
            "shared": false,
            "sort": 0,
            "value_type": "individual"
          },
          "transparent": false,
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [
              "total"
            ]
          },
          "yaxes": [
            {
              "format": "bytes",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "bytes",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        },
        {
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": null,
          "decimals": null,
          "description": "获取CPU资源使用情况,判断方式现在时刻和一分钟前的数据进行对比。",
          "fill": 0,
          "id": 2,
          "legend": {
            "alignAsTable": true,
            "avg": false,
            "current": true,
            "hideEmpty": false,
            "hideZero": false,
            "max": true,
            "min": false,
            "rightSide": true,
            "show": true,
            "sideWidth": null,
            "total": false,
            "values": true
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "minSpan": null,
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "repeat": null,
          "seriesOverrides": [],
          "spaceLength": 10,
          "span": 6,
          "stack": true,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name)",
              "format": "time_series",
              "hide": false,
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "system",
              "metric": "",
              "refId": "A",
              "step": 30
            },
            {
              "expr": "sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "user",
              "refId": "C",
              "step": 30
            },
            {
              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "total",
              "refId": "B",
              "step": 30
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
          "title": "CPU使用量(核)",
          "tooltip": {
            "shared": true,
            "sort": 0,
            "value_type": "individual"
          },
          "transparent": false,
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [
              "total"
            ]
          },
          "yaxes": [
            {
              "format": "short",
              "label": "",
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        }
      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
      "showTitle": false,
      "title": "所有pod",
      "titleSize": "h6"
    },
    {
      "collapse": false,
      "height": 199,
      "panels": [
        {
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": null,
          "fill": 1,
          "id": 6,
          "legend": {
            "alignAsTable": true,
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [],
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "1m",
              "intervalFactor": 2,
              "legendFormat": "in",
              "refId": "A",
              "step": 120
            },
            {
              "expr": "sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "1m",
              "intervalFactor": 2,
              "legendFormat": "out",
              "refId": "B",
              "step": 120
            },
            {
              "expr": "sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name) + sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name) + sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name) + sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name="$pod"}[1m])) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "1m",
              "intervalFactor": 1,
              "legendFormat": "error",
              "refId": "C",
              "step": 60
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
          "title": "数据包",
          "tooltip": {
            "shared": true,
            "sort": 0,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": []
          },
          "yaxes": [
            {
              "format": "pps",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "pps",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        },
        {
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": null,
          "fill": 0,
          "id": 5,
          "legend": {
            "alignAsTable": true,
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(rate(container_network_receive_bytes_total{pod_name="$pod", namespace="$namespace"}[1m])*1) by (namespace,container_name)",
              "format": "time_series",
              "interval": "1m",
              "intervalFactor": 1,
              "legendFormat": "in",
              "metric": "",
              "refId": "A",
              "step": 60
            },
            {
              "expr": "sum(rate(container_network_transmit_bytes_total{pod_name="$pod", namespace="$namespace"}[1m])*1) by (namespace,container_name)",
              "format": "time_series",
              "interval": "1m",
              "intervalFactor": 1,
              "legendFormat": "out",
              "refId": "B",
              "step": 60
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
          "title": "网络流量",
          "tooltip": {
            "shared": true,
            "sort": 0,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": []
          },
          "yaxes": [
            {
              "format": "bps",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "bps",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        }
      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
      "showTitle": false,
      "title": "Dashboard Row",
      "titleSize": "h6"
    },
    {
      "collapse": false,
      "height": 163,
      "panels": [
        {
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": null,
          "fill": 0,
          "hideTimeOverride": false,
          "id": 3,
          "legend": {
            "alignAsTable": true,
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "rightSide": true,
            "show": true,
            "sortDesc": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "repeat": null,
          "seriesOverrides": [],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(container_fs_limit_bytes{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "intervalFactor": 2,
              "legendFormat": "total",
              "refId": "A",
              "step": 4
            },
            {
              "expr": "sum(container_fs_usage_bytes{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "intervalFactor": 2,
              "legendFormat": "usage",
              "refId": "B",
              "step": 4
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
          "title": "硬盘使用量",
          "tooltip": {
            "shared": false,
            "sort": 0,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": []
          },
          "yaxes": [
            {
              "format": "decbytes",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        },
        {
          "aliasColors": {},
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": null,
          "fill": 0,
          "id": 4,
          "legend": {
            "alignAsTable": true,
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "rightSide": true,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 2,
          "links": [],
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(container_fs_reads_total{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "read",
              "refId": "A",
              "step": 30
            },
            {
              "expr": "sum(container_fs_writes_total{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "interval": "30s",
              "intervalFactor": 1,
              "legendFormat": "write",
              "refId": "B",
              "step": 30
            },
            {
              "expr": "sum(container_fs_io_current{namespace="$namespace",pod_name="$pod"}) by (namespace,pod_name)",
              "format": "time_series",
              "intervalFactor": 2,
              "legendFormat": "current",
              "refId": "C",
              "step": 4
            }
          ],
          "thresholds": [],
          "timeFrom": null,
          "timeShift": null,
          "title": "IO",
          "tooltip": {
            "shared": true,
            "sort": 0,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": []
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": true
            }
          ]
        }
      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
      "showTitle": false,
      "title": "Dashboard Row",
      "titleSize": "h6"
    }
  ],
  "schemaVersion": 14,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": [
      {
        "allValue": null,
        "current": {
          "selected": true,
          "text": "monitoring",
          "value": "monitoring"
        },
        "datasource": "prometheus",
        "hide": 0,
        "includeAll": false,
        "label": "namespace",
        "multi": false,
        "name": "namespace",
        "options": [],
        "query": "label_values(kube_pod_info{namespace=~".+"},namespace)",
        "refresh": 1,
        "regex": "",
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "allValue": null,
        "current": {
          "selected": true,
          "text": "kube-state-metrics-2802505745",
          "value": "kube-state-metrics-2802505745"
        },
        "datasource": "prometheus",
        "hide": 0,
        "includeAll": false,
        "label": "创建者",
        "multi": false,
        "name": "created_by_name",
        "options": [],
        "query": "label_values(kube_pod_info{created_by_name=~".+",namespace="$namespace"} ,created_by_name)",
        "refresh": 1,
        "regex": "",
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "allValue": null,
        "current": {
          "selected": true,
          "text": "kube-state-metrics-2802505745-k24zk",
          "value": "kube-state-metrics-2802505745-k24zk"
        },
        "datasource": "prometheus",
        "hide": 0,
        "includeAll": false,
        "label": "pod",
        "multi": false,
        "name": "pod",
        "options": [],
        "query": "label_values(kube_pod_info{created_by_name="$created_by_name",namespace="$namespace",pod=~".+"} ,pod)",
        "refresh": 1,
        "regex": "",
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "now-30m",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "",
  "title": "基于资源对象对pod进行监控",
  "version": 5
}
原文地址:https://www.cnblogs.com/lykops/p/8263120.html