Zabbix 监控 OpenShift Pod 状态

需求:

pod中的容器重启一次则报警通知

pod非Running状态则报警

pod中的容器非true状态则报警

三个需求其实是有点重叠的

pod重启期间肯定会出现非Running状态:只要有重启报警,pod非Running也会报警;而pod非Running时,容器状态肯定也非true,同样会报警

所以报警设置为:

pod重启一次就报警

pod非Running and 容器非true (#3) and pod非删除 = 报警

zabbix server中建一个模板

<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
    <version>3.2</version>
    <date>2017-11-23T07:48:53Z</date>
    <groups>
        <group>
            <name>OpenShift</name>
        </group>
    </groups>
    <templates>
        <template>
            <template>OC Pods</template>
            <name>OC Pods</name>
            <description/>
            <groups>
                <group>
                    <name>OpenShift</name>
                </group>
            </groups>
            <applications>
                <application>
                    <name>restartCount</name>
                </application>
                <application>
                    <name>RunningStatus</name>
                </application>
            </applications>
            <items/>
            <discovery_rules>
                <discovery_rule>
                    <name>OC Pods Discover</name>
                    <type>0</type>
                    <snmp_community/>
                    <snmp_oid/>
                    <key>oc.pod.discover</key>
                    <delay>300</delay>
                    <status>1</status>
                    <allowed_hosts/>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <filter>
                        <evaltype>0</evaltype>
                        <formula/>
                        <conditions/>
                    </filter>
                    <lifetime>7</lifetime>
                    <description/>
                    <item_prototypes>
                        <item_prototype>
                            <name>Pod {#POD_NAME} Restarts</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},restarts]</key>
                            <delay>30</delay>
                            <history>30</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>restartCount</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Pod {#POD_NAME} Running</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},running]</key>
                            <delay>30</delay>
                            <history>30</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>RunningStatus</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Pod {#POD_NAME} Running True</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>1</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},running_true]</key>
                            <delay>30</delay>
                            <history>30</history>
                            <trends>365</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>RunningStatus</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                    </item_prototypes>
                    <trigger_prototypes>
                        <!-- Trigger "Pod {#POD_NAME} No Running".
                             The expression ANDs three conditions:
                               1. the string item "running" does not contain "Running_true";
                               2. the string item "running" does not contain "Pod deleted";
                               3. last(#5) of the numeric item "running_true" equals 0.
                             NOTE(review): in Zabbix, last(#5) addresses the fifth most recent
                             value only, not all of the five latest values. Confirm this is
                             the intended "container not ready for several checks" test. -->
                        <trigger_prototype>
                            <expression>{OC Pods:oc.pod.status[{#POD_NAME},running].str(Running_true)}=0&#13;
and&#13;
{OC Pods:oc.pod.status[{#POD_NAME},running].str(Pod deleted)}=0&#13;
and&#13;
{OC Pods:oc.pod.status[{#POD_NAME},running_true].last(#5)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Pod {#POD_NAME} No Running</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <!-- Trigger "Pod {#POD_NAME} restarted Warning".
                             Problem: the string item "restarts" contains "Warning".
                             A separate recovery expression is configured: str(Warning,#3)=0,
                             presumably meaning none of the three latest values contains
                             "Warning" (confirm against the Zabbix str() documentation). -->
                        <trigger_prototype>
                            <expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning)}=1</expression>
                            <recovery_mode>1</recovery_mode>
                            <recovery_expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning,#3)}=0</recovery_expression>
                            <name>Pod {#POD_NAME} restarted Warning</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                    </trigger_prototypes>
                    <graph_prototypes/>
                    <host_prototypes/>
                </discovery_rule>
            </discovery_rules>
            <httptests/>
            <macros/>
            <templates/>
            <screens/>
        </template>
    </templates>
</zabbix_export>
模板文件

模板中新建了一个自动发现规则,其中的三个监控项分别对应上面说的三个需求

zabbix agent

在配置文件末尾加入

# vim zabbix_agentd.conf

# Key oc.pod.discover: runs the pod discovery script without arguments.
UserParameter=oc.pod.discover,/data/app/zabbix/etc/oc_pod_discover.sh
# Key oc.pod.status[<pod name>,<type>]: the two key parameters are handed to
# the monitor script as its first and second argument ($1 and $2).
UserParameter=oc.pod.status[*],/data/app/zabbix/etc/oc_pod_monitor.sh $1 $2

自动发现脚本

# vim oc_pod_discover.sh

#!/bin/bash
#
# Zabbix low-level discovery script for OpenShift pods.
#
# Downloads the cluster-wide pod list from the OpenShift REST API, caches it
# in $WORKSPACE/all_pods.json and prints the discovery document
#   {"data":[{"{#POD_NAME}":"<pod name>"}, ...]}
# on stdout. Pods whose name contains "build" or "deploy" are left out.
#
# Requires: curl, jq.

TOKEN="123456"                      # placeholder: put the real bearer token here
ENDPOINT="www.oc.domain.cn:8443"
WORKSPACE="/data/tmp/oc_monitor"
mkdir -p "$WORKSPACE"

# Download into a temporary file and swap it in only when the answer is valid
# JSON with an "items" member. A failed request therefore keeps the previous
# cache instead of truncating it (an empty cache would make every pod look
# deleted to anything that reads it).
tmp_file="$(mktemp "$WORKSPACE/all_pods.json.XXXXXX")" || exit 1
trap 'rm -f -- "$tmp_file"' EXIT

# NOTE: -k disables TLS certificate verification; supply a CA bundle with
# --cacert instead if one is available.
if curl -k -s -f \
     -H "Authorization: Bearer $TOKEN" \
     -H 'Accept: application/json' \
     "https://$ENDPOINT/api/v1/pods" > "$tmp_file" 2>/dev/null \
   && jq -e '.items' "$tmp_file" > /dev/null 2>&1; then
  chmod 644 "$tmp_file"             # mktemp creates the file with mode 0600
  mv -f -- "$tmp_file" "$WORKSPACE/all_pods.json"
fi

# Let jq build the discovery JSON so that names are always quoted and escaped
# correctly. If the cache is missing or unreadable, report an empty list.
jq -c '{data: [ (.items // [])[]
                | .metadata.name
                | select(type == "string")
                | select((contains("build") or contains("deploy")) | not)
                | {"{#POD_NAME}": .} ]}' \
  "$WORKSPACE/all_pods.json" 2>/dev/null \
  || printf '{"data":[]}\n'

监控脚本

# vim oc_pod_monitor.sh

#!/bin/bash
#
# Zabbix item script: report the state of one OpenShift pod.
#
# Usage: oc_pod_monitor.sh <pod-name> <restarts|running|running_true>
#
#   restarts      refreshes the cached pod status from the API, then prints
#                 "Warning restart_count=N" if the summed restartCount of all
#                 containers grew since the previous call, otherwise
#                 "Normal restart_count=N"
#   running       prints "<phase>_true" when no container reports
#                 ready=false, otherwise "<phase>_false"
#   running_true  prints 1 when no container reports ready=false, else 0
#
# Special answers (running_true always answers with the number in brackets):
#   "Pod deleted" [1]  pod is not listed in $WORKSPACE/all_pods.json
#   "New Pod"     [1]  no status snapshot has been cached yet
#   "Pending"     [0]  cached pod phase is Pending
#   "Error parameters" unknown type, or empty/invalid pod name
#
# Requires: curl, jq, awk, find.

TOKEN="123456"                      # placeholder: put the real bearer token here
ENDPOINT="www.oc.domain.cn:8443"
POD_NAME="${1:-}"
Monitoring_type="${2:-}"
WORKSPACE="/data/tmp/oc_monitor"
mkdir -p "$WORKSPACE"

# Reject unknown item types up front.
case "$Monitoring_type" in
  restarts | running | running_true) ;;
  *)
    echo "Error parameters"
    exit 0
    ;;
esac

# The pod name becomes part of file names below, so it must not be empty and
# must not contain a path separator.
if [[ -z "$POD_NAME" || "$POD_NAME" == */* ]]; then
  echo "Error parameters"
  exit 0
fi

status_file="$WORKSPACE/${POD_NAME}.status"
count_file="$WORKSPACE/${POD_NAME}.restartCount"

#######################################
# Print the answer for the current item type and exit.
# Arguments: $1 - text for the string items (restarts, running)
#            $2 - number for the numeric item (running_true)
#######################################
reply() {
  if [[ "$Monitoring_type" == "running_true" ]]; then
    printf '%s\n' "$2"
  else
    printf '%s\n' "$1"
  fi
  exit 0
}

#######################################
# Fetch the pod status from the API into $status_file.
# The file is replaced only after a successful, non-empty download.
# Returns: 0 on success, 1 if the snapshot could not be refreshed
#######################################
refresh_status() {
  local tmp
  tmp="$(mktemp "${status_file}.XXXXXX")" || return 1
  # NOTE: -k disables TLS certificate verification.
  if curl -k -s -f \
       -H "Authorization: Bearer $TOKEN" \
       -H 'Accept: application/json' \
       "https://${ENDPOINT}/api/v1/namespaces/${NAMESPACE}/pods/${POD_NAME}/status" \
       > "$tmp" 2>/dev/null \
     && [[ -s "$tmp" ]]; then
    chmod 644 "$tmp"                # mktemp creates the file with mode 0600
    mv -f -- "$tmp" "$status_file"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

#######################################
# Sum the numbers stored one per line in a file.
# Arguments: $1 - file to read
# Outputs:   the integer sum on stdout
#######################################
sum_lines() {
  awk '{ total += $1 } END { printf "%d\n", total }' "$1"
}

# Find the namespace by exact pod-name match in the cached pod list. A grep on
# the name would also hit other pods that merely contain it (web-1 / web-10).
NAMESPACE="$(jq -r --arg name "$POD_NAME" \
  '[ (.items // [])[] | select(.metadata.name == $name) | .metadata.namespace ]
   | .[0] // empty' \
  "$WORKSPACE/all_pods.json" 2>/dev/null)"

[[ -n "$NAMESPACE" ]] || reply "Pod deleted" 1

# Only the "restarts" item talks to the API; the other two item types read the
# snapshot it leaves behind. The refresh has to happen BEFORE the "New Pod" and
# "Pending" checks, otherwise a new pod never gets a snapshot and a pending
# pod never leaves the Pending state.
if [[ "$Monitoring_type" == "restarts" ]]; then
  refresh_status                    # on failure the previous snapshot is kept
  # Drop cache files that have not been touched for more than three days.
  find "$WORKSPACE" -type f -mtime +3 -exec rm -f -- {} +
fi

[[ -s "$status_file" ]] || reply "New Pod" 1
Pod_Status="$(cat "$status_file")"

phase="$(jq -r '.status.phase' <<< "$Pod_Status" 2>/dev/null)"
[[ "$phase" != "Pending" ]] || reply "Pending" 0

#######################################
# Succeed when no container in the snapshot reports ready=false.
#######################################
containers_ready() {
  ! jq -r '(.status.containerStatuses // [])[] | .ready' <<< "$Pod_Status" \
      2>/dev/null | grep -qx false
}

case "$Monitoring_type" in
  restarts)
    # Total restart count over all containers of the pod.
    current="$(jq -r \
      '[ (.status.containerStatuses // [])[] | .restartCount // 0 ] | add // 0' \
      <<< "$Pod_Status" 2>/dev/null)"
    [[ "$current" =~ ^[0-9]+$ ]] || current=0

    # Value seen on the previous call. On the very first call there is nothing
    # to compare with, so the current value becomes the baseline.
    if [[ -f "$count_file" ]]; then
      previous="$(sum_lines "$count_file")"
    else
      previous="$current"
    fi
    printf '%s\n' "$current" > "$count_file"

    if ((current > previous)); then
      echo "Warning restart_count=$current"
    else
      echo "Normal restart_count=$current"
    fi
    ;;

  running)
    if containers_ready; then
      echo "${phase}_true"
    else
      echo "${phase}_false"
    fi
    ;;

  running_true)
    if containers_ready; then
      echo "1"
    else
      echo "0"
    fi
    ;;
esac
原文地址:https://www.cnblogs.com/37yan/p/7885404.html