openshift 容器云从入门到崩溃之九《容器监控-报警》

容器状态监控

主要是监控POD的状态,包括重启、不健康等等。这些状态k8s api本身会报出来,再配合zabbix报警

导入下面的zabbix模板,并关联到oc master主机

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Zabbix template export (format version 3.2) defining the template "OC Pods".
  It contains no static items: a single discovery rule creates three item
  prototypes and two trigger prototypes per discovered {#POD_NAME}.
  All keys use the oc.pod.status[...] item key.
-->
<zabbix_export>
    <version>3.2</version>
    <date>2019-02-27T07:33:05Z</date>
    <groups>
        <group>
            <name>Templates</name>
        </group>
    </groups>
    <templates>
        <template>
            <template>OC Pods</template>
            <name>OC Pods</name>
            <description/>
            <groups>
                <group>
                    <name>Templates</name>
                </group>
            </groups>
            <!-- Applications used to group the item prototypes below. -->
            <applications>
                <application>
                    <name>restartCount</name>
                </application>
                <application>
                    <name>RunningStatus</name>
                </application>
            </applications>
            <items/>
            <discovery_rules>
                <!--
                  Discovery rule: evaluates oc.pod.status[discover,discover] with a
                  delay of 300 and a lifetime of 7 for entities that are no longer
                  discovered. No filter conditions are configured.
                -->
                <discovery_rule>
                    <name>OC Pods Discover</name>
                    <type>0</type>
                    <snmp_community/>
                    <snmp_oid/>
                    <key>oc.pod.status[discover,discover]</key>
                    <delay>300</delay>
                    <status>0</status>
                    <allowed_hosts/>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <filter>
                        <evaltype>0</evaltype>
                        <formula/>
                        <conditions/>
                    </filter>
                    <lifetime>7</lifetime>
                    <description/>
                    <item_prototypes>
                        <!--
                          Item prototype with key oc.pod.status[{#POD_NAME},get_status],
                          delay 300, grouped under the RunningStatus application.
                          No trigger prototype in this template references it.
                        -->
                        <item_prototype>
                            <name>Pod {#POD_NAME} Get Status</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},get_status]</key>
                            <delay>300</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>RunningStatus</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <!--
                          Item prototype with key oc.pod.status[{#POD_NAME},restarts],
                          delay 300, grouped under the restartCount application.
                          Its value is matched against "Warning" by the restart trigger
                          prototype below.
                        -->
                        <item_prototype>
                            <name>Pod {#POD_NAME} Restarts</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},restarts]</key>
                            <delay>300</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>restartCount</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <!--
                          Item prototype with key oc.pod.status[{#POD_NAME},running],
                          delay 300, grouped under the RunningStatus application.
                          Its value is matched against "Running_true" and "Pod deleted"
                          by the "Not Running" trigger prototype below.
                        -->
                        <item_prototype>
                            <name>Pod {#POD_NAME} Running</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.pod.status[{#POD_NAME},running]</key>
                            <delay>300</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>RunningStatus</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                    </item_prototypes>
                    <trigger_prototypes>
                        <!--
                          "Not Running" trigger: the expression is true when the running
                          item's value matches neither "Running_true" nor "Pod deleted"
                          (both str() checks equal 0). The expression spans three lines;
                          the &#13; entities are carriage returns kept from the export.
                        -->
                        <trigger_prototype>
                            <expression>{OC Pods:oc.pod.status[{#POD_NAME},running].str(Running_true)}=0&#13;
and&#13;
{OC Pods:oc.pod.status[{#POD_NAME},running].str(Pod deleted)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Pod {#POD_NAME} Not Running</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <!--
                          Restart trigger: problem when the restarts item's value matches
                          "Warning". A separate recovery expression is configured
                          (recovery_mode 1): str(Warning,#3)=0, i.e. "Warning" is not
                          found when checking the last 3 values.
                        -->
                        <trigger_prototype>
                            <expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning)}=1</expression>
                            <recovery_mode>1</recovery_mode>
                            <recovery_expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning,#3)}=0</recovery_expression>
                            <name>Pod {#POD_NAME} restarted Warning</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                    </trigger_prototypes>
                    <graph_prototypes/>
                    <host_prototypes/>
                </discovery_rule>
            </discovery_rules>
            <httptests/>
            <macros/>
            <templates/>
            <screens/>
        </template>
    </templates>
</zabbix_export>

zabbix客户端配置

修改zabbix_agentd.conf

# Agent check timeout set to 30 (seconds); the monitor script below performs
# HTTPS requests and jq processing on every check.
Timeout=30
# Maps every oc.pod.status[<arg1>,<arg2>] item key to the monitor script;
# $1 and $2 are replaced by the first and second key parameter.
UserParameter=oc.pod.status[*],/data/app/zabbix/etc/oc_pod_monitor.sh $1 $2

oc_pod_monitor.sh内容

#!/bin/bash
#
# oc_pod_monitor.sh - Zabbix UserParameter backend for OpenShift pod checks.
#
# Usage: oc_pod_monitor.sh <pod-argument> <monitoring-type>
#   <pod-argument>     "discover" for low-level discovery, otherwise the
#                      {#POD_NAME} value in the form "namespace=pod-name".
#   <monitoring-type>  discover   - refresh the pod snapshot, print LLD JSON
#                      get_status - refresh the cached status of one pod
#                      restarts   - compare restart counts with the last run
#                      running    - print "<phase>_true" / "<phase>_false"
#
# Requires: curl, jq, awk. Always exits 0; the result is the printed string.
# Fill in TOKEN (service-account bearer token) and ENDPOINT (host[:port] of
# the API server) before deploying.
TOKEN=""
ENDPOINT=""
WORKSPACE="/data/tmp/oc_monitor"

#######################################
# Prints the pod name contained in a "namespace=pod" argument.
# A value without "=" (e.g. "discover") is printed unchanged.
# Arguments: $1 - raw first script argument
#######################################
pod_name_from_arg() {
  printf '%s\n' "${1##*=}"
}

#######################################
# Prints the namespace part of a "namespace=pod" argument, or nothing when
# the argument carries no "=".
# Arguments: $1 - raw first script argument
#######################################
namespace_hint_from_arg() {
  case "$1" in
    *=*) printf '%s\n' "${1%=*}" ;;
  esac
}

#######################################
# Sums the integers read from stdin (one per line); prints 0 for empty input.
#######################################
sum_counts() {
  awk '{ total += $1 } END { print total + 0 }'
}

#######################################
# Succeeds when the JSON document on stdin has "code" equal to 404.
#######################################
is_not_found() {
  jq -e '.code == 404' >/dev/null 2>&1
}

#######################################
# GETs an API path and stores the response body in a file. The body is
# written to a temporary file first and renamed on success, so checks running
# at the same time never read a half-written file.
# Globals:   TOKEN, ENDPOINT (read)
# Arguments: $1 - API path starting with "/"
#            $2 - destination file
#            $3.. - extra curl options (optional)
# Returns:   0 when curl succeeded and the file was replaced, non-zero
#            otherwise (the destination is then left untouched)
#######################################
api_get() {
  local path=$1 dest=$2
  shift 2
  local tmp
  tmp=$(mktemp "${dest}.XXXXXX" 2>/dev/null) || return 1
  # NOTE(review): -k disables TLS certificate verification (kept from the
  # original script); prefer --cacert with the cluster CA where possible.
  if curl -k -s "$@" \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'Accept: application/json' \
      -o "$tmp" \
      "https://${ENDPOINT}${path}" 2>/dev/null; then
    mv -f -- "$tmp" "$dest"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

#######################################
# Prints the namespace of a pod according to the cached pod snapshot, or
# nothing when the pod is not in the snapshot. Names are compared exactly.
# Globals:   WORKSPACE (read)
# Arguments: $1 - pod name
#            $2 - namespace hint; when non-empty the pod must be in it
#######################################
lookup_namespace() {
  local pod=$1 ns_hint=$2
  jq -r --arg pod "$pod" --arg ns "$ns_hint" '
    [ (.items // [])[] | .metadata
      | select(.name == $pod and ($ns == "" or .namespace == $ns))
      | .namespace ] | .[0] // empty
  ' "$WORKSPACE/all_pods.json" 2>/dev/null
}

#######################################
# Refreshes the pod snapshot and prints the Zabbix low-level discovery JSON.
# Pods whose name contains "build", "deploy" or "debug" are left out. Each
# {#POD_NAME} value has the form "namespace=pod-name".
# Globals:   WORKSPACE (read)
#######################################
discover_pods() {
  local snapshot="$WORKSPACE/all_pods.json"
  # --fail: an HTTP error response must not replace the last good snapshot.
  api_get /api/v1/pods "$snapshot" --fail
  if ! jq '
      {data: [ (.items // [])[] | .metadata
        | select((.name | contains("build") or contains("deploy") or contains("debug")) | not)
        | {"{#POD_NAME}": "\(.namespace)=\(.name)"} ]}
    ' "$snapshot" 2>/dev/null; then
    # No usable snapshot: still hand Zabbix a well-formed, empty result.
    printf '%s\n' '{"data":[]}'
  fi
}

#######################################
# Refreshes the cached status document of one pod for the other checks.
# Prints "Pod_Status=NotFound" when the API answers 404, otherwise "Success"
# (a failed request empties the cache and also prints "Success", matching the
# previous behaviour of this script).
# Globals:   WORKSPACE (read)
# Arguments: $1 - namespace, $2 - pod name
#######################################
fetch_pod_status() {
  local namespace=$1 pod=$2
  local status_file="$WORKSPACE/${namespace}-${pod}.status"
  if ! api_get "/api/v1/namespaces/${namespace}/pods/${pod}/status" "$status_file"; then
    : > "$status_file"
  fi
  if is_not_found < "$status_file"; then
    printf '%s\n' "Pod_Status=NotFound"
  else
    printf '%s\n' "Success"
  fi
}

#######################################
# Entry point. See the header of this file for the arguments.
# Globals:   WORKSPACE (read)
# Outputs:   one status string (or the discovery JSON) on stdout
# Returns:   always 0
#######################################
main() {
  local pod_arg=${1:-} monitoring_type=${2:-}
  local pod_name ns_hint namespace=""
  local status_file count_file pod_status
  local last_total current_counts current_total phase suffix

  pod_name=$(pod_name_from_arg "$pod_arg")
  ns_hint=$(namespace_hint_from_arg "$pod_arg")
  mkdir -p "$WORKSPACE"

  # Verify the pod still exists in the snapshot (skipped for discovery).
  if [[ "$pod_name" != "discover" ]]; then
    namespace=$(lookup_namespace "$pod_name" "$ns_hint")
    if [[ -z "$namespace" ]]; then
      printf '%s\n' "Pod deleted"
      return 0
    fi
  fi

  case "$monitoring_type" in
    discover)
      discover_pods
      return 0
      ;;
    get_status)
      fetch_pod_status "$namespace" "$pod_name"
      return 0
      ;;
  esac

  # Load the cached status document; create an empty cache when none exists.
  status_file="$WORKSPACE/${namespace}-${pod_name}.status"
  [[ -f "$status_file" ]] || echo "" > "$status_file"
  pod_status=$(cat -- "$status_file" 2>/dev/null)

  if [[ -z "$pod_status" ]]; then
    # Nothing cached yet: deliberately not reported as a problem.
    printf '%s\n' "Running_true Pod_Status=Null"
    return 0
  elif is_not_found <<< "$pod_status"; then
    # The pod is gone but the snapshot has not been refreshed yet.
    printf '%s\n' "Pod_Status=NotFound"
    return 0
  elif [[ "$(jq -r '.status.phase' <<< "$pod_status" 2>/dev/null)" == "Pending" ]]; then
    printf '%s\n' "Pending"
    return 0
  fi

  case "$monitoring_type" in
    restarts)
      # Report whether the pod restarted since the previous check.
      count_file="$WORKSPACE/${namespace}-${pod_name}.restartCount"
      if [[ ! -f "$count_file" ]]; then
        # First time this pod is seen.
        printf '%s\n' "Warning New Pod"
        echo "0" > "$count_file"
        return 0
      fi

      # Total over every container, not just the first two.
      last_total=$(sum_counts < "$count_file")
      current_counts=$(jq -r '(.status.containerStatuses // [])[] | .restartCount' \
        <<< "$pod_status" 2>/dev/null)
      printf '%s\n' "$current_counts" > "$count_file"
      current_total=$(sum_counts <<< "$current_counts")

      if (( current_total > last_total )); then
        printf '%s\n' "Warning restart_count=${current_total}"
      else
        printf '%s\n' "Normal restart_count=${current_total}"
      fi
      ;;

    running)
      # Pod phase plus "_false" when any container is not ready.
      phase=$(jq -r '.status.phase' <<< "$pod_status" 2>/dev/null)
      if jq -r '(.status.containerStatuses // [])[] | .ready' \
          <<< "$pod_status" 2>/dev/null | grep -q false; then
        suffix="_false"
      else
        suffix="_true"
      fi
      printf '%s\n' "${phase}${suffix}"
      ;;

    *)
      printf '%s\n' "Error parameters"
      ;;
  esac
  return 0
}

main "$@"

这样POD重启或者新建都会报出来

集群NODE节点监控

主要监控node节点的不健康状态,还有lvm卷容量监控

导入zabbix模板关联上oc master主机

<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
    <version>3.2</version>
    <date>2019-02-27T07:47:32Z</date>
    <groups>
        <group>
            <name>Templates</name>
        </group>
    </groups>
    <templates>
        <template>
            <template>OC Node Status</template>
            <name>OC Node Status</name>
            <description/>
            <groups>
                <group>
                    <name>Templates</name>
                </group>
            </groups>
            <applications>
                <application>
                    <name>oc_node</name>
                </application>
            </applications>
            <items/>
            <discovery_rules>
                <discovery_rule>
                    <name>OC Nodes Discover</name>
                    <type>0</type>
                    <snmp_community/>
                    <snmp_oid/>
                    <key>oc.node.status[discover,discover]</key>
                    <delay>60</delay>
                    <status>0</status>
                    <allowed_hosts/>
                    <snmpv3_contextname/>
                    <snmpv3_securityname/>
                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
                    <snmpv3_authpassphrase/>
                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
                    <snmpv3_privpassphrase/>
                    <delay_flex/>
                    <params/>
                    <ipmi_sensor/>
                    <authtype>0</authtype>
                    <username/>
                    <password/>
                    <publickey/>
                    <privatekey/>
                    <port/>
                    <filter>
                        <evaltype>0</evaltype>
                        <formula/>
                        <conditions/>
                    </filter>
                    <lifetime>7</lifetime>
                    <description/>
                    <item_prototypes>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  DiskPressure</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},DiskPressure]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>1</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} Get Status</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},get_status]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications/>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  MemoryPressure</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},MemoryPressure]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>1</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  Ready</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_ready]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} CPU Limits</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,cpu_limits]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} CPU Requests</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,cpu_requests]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} Memory Limits</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,memory_limits]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME} Memory Requests</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},node_resources,memory_requests]</key>
                            <delay>120</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>0</status>
                            <value_type>3</value_type>
                            <allowed_hosts/>
                            <units>%</units>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                        <item_prototype>
                            <name>Node {#NODE_NAME}  OutOfDisk</name>
                            <type>0</type>
                            <snmp_community/>
                            <multiplier>0</multiplier>
                            <snmp_oid/>
                            <key>oc.node.status[{#NODE_NAME},OutOfDisk]</key>
                            <delay>30</delay>
                            <history>7</history>
                            <trends>0</trends>
                            <status>1</status>
                            <value_type>4</value_type>
                            <allowed_hosts/>
                            <units/>
                            <delta>0</delta>
                            <snmpv3_contextname/>
                            <snmpv3_securityname/>
                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
                            <snmpv3_authpassphrase/>
                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
                            <snmpv3_privpassphrase/>
                            <formula>1</formula>
                            <delay_flex/>
                            <params/>
                            <ipmi_sensor/>
                            <data_type>0</data_type>
                            <authtype>0</authtype>
                            <username/>
                            <password/>
                            <publickey/>
                            <privatekey/>
                            <port/>
                            <description/>
                            <inventory_link>0</inventory_link>
                            <applications>
                                <application>
                                    <name>oc_node</name>
                                </application>
                            </applications>
                            <valuemap/>
                            <logtimefmt/>
                            <application_prototypes/>
                        </item_prototype>
                    </item_prototypes>
                    <trigger_prototypes>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_limits].last()}&gt;150</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} CPU Limits 150%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_requests].last()}&gt;100</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} CPU Requests 100%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>2</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},DiskPressure].str(DiskPressure_False)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} DiskPressure</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>1</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_limits].last()}&gt;150</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} Memory Limits 150%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>1</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},MemoryPressure].str(MemoryPressure_False)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} MemoryPressure</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>1</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_requests].last()}&gt;95</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} Memory Requests 95%</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>2</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_ready].str(Ready_True)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} Not Ready</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>0</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                        <trigger_prototype>
                            <expression>{OC Node Status:oc.node.status[{#NODE_NAME},OutOfDisk].str(OutOfDisk_False)}=0</expression>
                            <recovery_mode>0</recovery_mode>
                            <recovery_expression/>
                            <name>Node {#NODE_NAME} OutOfDisk</name>
                            <correlation_mode>0</correlation_mode>
                            <correlation_tag/>
                            <url/>
                            <status>1</status>
                            <priority>5</priority>
                            <description/>
                            <type>0</type>
                            <manual_close>1</manual_close>
                            <dependencies/>
                            <tags/>
                        </trigger_prototype>
                    </trigger_prototypes>
                    <graph_prototypes/>
                    <host_prototypes/>
                </discovery_rule>
            </discovery_rules>
            <httptests/>
            <macros/>
            <templates/>
            <screens/>
        </template>
    </templates>
</zabbix_export>

zabbix客户端配置

修改zabbix_agentd.conf

Timeout=30
UserParameter=oc.node.status[*],/data/app/zabbix/etc/oc_node_monitor.sh $1 $2 $3

oc_node_monitor.sh的内容

#!/bin/bash
# oc_node_monitor.sh - Zabbix UserParameter helper for OpenShift node checks.
#
# Usage: oc_node_monitor.sh <node_name> <monitoring_type> [resource_metric]
#   node_name        node to query (also used to build the cache file names)
#   monitoring_type  which check to run; selected by the case statements below
#   resource_metric  third argument, read as $3 by the node_resources check
#
# TOKEN and ENDPOINT are placeholders and must be filled in before deployment.
TOKEN=""      # bearer token sent in the "Authorization" header
ENDPOINT=""   # API server host[:port]; requests go to https://$ENDPOINT/api/v1/...
NODE_NAME="$1"
Monitoring_type="$2"
# Cache directory holding the <node>.status and <node>.resources files.
WORKSPACE="/data/tmp/oc_monitor"
mkdir -p -- "$WORKSPACE"

# GET an API path (e.g. /api/v1/nodes) from the cluster and write the JSON
# body to stdout. curl's own diagnostics are discarded so only the body is seen.
#   $1 - request path, starting with "/"
oc_api_get() {
  curl -k \
    -H "Authorization: Bearer $TOKEN" \
    -H 'Accept: application/json' \
    "https://${ENDPOINT}$1" 2>/dev/null
}

# Print the Zabbix low-level-discovery JSON document for the given node names.
# Every element but the last is followed by a comma so the result is valid JSON.
#   $@ - node names (may be empty, which yields an empty "data" array)
print_discovery_json() {
  local remaining=$# name sep
  printf '{\n'
  printf '\t"data":[\n'
  for name in "$@"; do
    remaining=$(( remaining - 1 ))
    if (( remaining > 0 )); then
      sep=','
    else
      sep=''
    fi
    printf '\t\t{\n'
    # The name is passed as an argument, never as part of the format string.
    printf '\t\t\t"{#NODE_NAME}":"%s"}%s\n' "$name" "$sep"
  done
  printf '\t]\n'
  printf '}\n'
}

case "$Monitoring_type" in
   discover) # auto-discover the cluster nodes
     Node_Name=()
     while IFS= read -r discovered; do
       [ -n "$discovered" ] && Node_Name+=("$discovered")
     done < <(oc_api_get /api/v1/nodes | jq -r '.items|.[]|.metadata|.name')
     print_discovery_json "${Node_Name[@]}"
     exit 0
   ;;
   get_status) # fetch the node object and cache it for the other checks
     status_file="$WORKSPACE/${NODE_NAME}.status"
     oc_api_get "/api/v1/nodes/$NODE_NAME" > "$status_file"
     if grep -q '"code": 404' "$status_file"; then
       echo "Node_Status=NotFound"
     elif [ -z "$(cat "$status_file")" ]; then
       # Empty body: the API could not be reached or returned nothing.
       echo "Node_Status=null"
     else
       echo "Success"
     fi
     exit 0
   ;;
esac

# Print "<prefix>_<status>". An empty status falls back to the healthy value,
# so a missing or unreadable cache file is reported as healthy.
#   $1 - output prefix (e.g. OutOfDisk, Ready)
#   $2 - status value that means "healthy" for this condition
#   $3 - status read from the cached node object (may be empty)
format_condition() {
  printf '%s_%s\n' "$1" "${3:-$2}"
}

# Print the status of one node condition from the cached node JSON
# ($WORKSPACE/$NODE_NAME.status). The condition is selected by its "type"
# field rather than by its position in the list, so the result does not
# depend on the order in which the API returns the conditions.
#   $1 - condition type (OutOfDisk, MemoryPressure, DiskPressure, Ready)
node_condition_status() {
  jq -r --arg type "$1" \
    '.status|.conditions|.[]|select(.type == $type)|.status' \
    "$WORKSPACE/${NODE_NAME}.status" 2>/dev/null | sed -n 1p
}

# Print the integer found in whitespace-separated column $2 of file $1,
# or 0 when the file is missing/empty or the column holds no positive number.
#   $1 - path to the <node>.resources file
#   $2 - awk column number
resource_percent() {
  local value
  value=$(awk -v col="$2" '{print $col}' "$1" 2>/dev/null \
    | grep -o '[0-9]*' | head -n 1)
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  if [[ "$value" =~ ^[0-9]+$ ]] && (( 10#$value > 0 )); then
    echo "$(( 10#$value ))"
  else
    echo 0
  fi
}

case "$Monitoring_type" in
   OutOfDisk) # is the node out of disk space
     format_condition OutOfDisk False "$(node_condition_status OutOfDisk)"
   ;;

   MemoryPressure) # is the node under memory pressure
     format_condition MemoryPressure False "$(node_condition_status MemoryPressure)"
   ;;

   DiskPressure) # is the node under disk pressure
     format_condition DiskPressure False "$(node_condition_status DiskPressure)"
   ;;

   node_ready) # is the node Ready
     format_condition Ready True "$(node_condition_status Ready)"
   ;;

   node_resources) # requested / limited CPU and memory, as percentages
     resources_file="$WORKSPACE/${NODE_NAME}.resources"
     # If the file is currently empty, wait one second before reading it
     # (presumably it is being rewritten by the cron job at that moment).
     if [ -z "$(awk '{print $2}' "$resources_file" 2>/dev/null)" ]; then
        sleep 1
     fi
     case "$3" in
        cpu_requests)    resource_percent "$resources_file" 2 ;;
        cpu_limits)      resource_percent "$resources_file" 4 ;;
        memory_requests) resource_percent "$resources_file" 6 ;;
        memory_limits)   resource_percent "$resources_file" 8 ;;
     esac
   ;;
esac

crontab -e

*/2 * * * * /data/scripts/oc_master_crontab.sh >/dev/null 2>&1

oc_master_crontab.sh内容

#!/bin/bash
# oc_master_crontab.sh - run from cron on the master; for every node, caches
# the line that precedes "Events" in `oc describe node` into
# /data/tmp/oc_monitor/<node>.resources.
OUT_DIR="/data/tmp/oc_monitor"

# Filter `oc describe node` output on stdin down to the line directly above
# the "Events" line.
allocated_summary() {
  grep -B 1 "Events" | grep -v "Events"
}

mkdir -p -- "$OUT_DIR"

# Skip the header by line number instead of by text, so a node whose name
# happens to contain "NAME" is not dropped.
node_name=()
while IFS= read -r node; do
  [ -n "$node" ] && node_name+=("$node")
done < <(oc get node | awk 'NR > 1 {print $1}')

for node in "${node_name[@]}"; do
  # Write to a temporary file and rename it, so a reader never sees a
  # half-written or truncated .resources file.
  tmp_file="$OUT_DIR/${node}.resources.tmp.$$"
  oc describe node "$node" | allocated_summary > "$tmp_file"
  mv -f -- "$tmp_file" "$OUT_DIR/${node}.resources"
done

# One pass after all files are written gives the same final permissions as
# running chmod once per node.
chmod -R 777 /data/tmp/
原文地址:https://www.cnblogs.com/37yan/p/10444009.html