记一次服务器挂掉的排查过程

最近一个月服务器每天都挂掉,

top -c 一下 发现 cpu 390%

懵逼了,

top -Hp <pid> 发现有四个线程一直在跑

jstack 了一下 发现死锁了,

排查了很久 暂时没找到原因 只能决定写个python脚本监控一下了

当服务调用不通,jstack一下,然后重新拉起服务

这里用到 psutil 模块,需要先安装一下(pip install psutil)

# coding:utf-8
import urllib
import re
import os
import time
import socket
import logging
import psutil
import re
import traceback




# Layout of each log record: timestamp - level - message.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
# Layout of the %(asctime)s timestamp.
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
# Route every record (DEBUG and above) to the file my.log.
logging.basicConfig(
    level=logging.DEBUG,
    filename='my.log',
    datefmt=DATE_FORMAT,
    format=LOG_FORMAT,
)

# Default timeout, in seconds, applied to sockets created from here on,
# which bounds how long a single health-check request may block.
socket.setdefaulttimeout(30)

# Identifiers sent with the health-check request.
hospitalId = '10300'
accountId = '19071'

# Address of the service being probed.
ipStr = '127.0.0.1'
portStr = '8081'

# NOTE(review): the original line had an unterminated string literal where the
# real endpoint path was redacted. '/xxxxxxxx/' is a placeholder -- replace it
# with the actual path; the accountId/hospitalId suffix is reconstructed from
# the surviving fragments and should be confirmed.
urlStr = 'http://' + ipStr + ':' + portStr + '/xxxxxxxx/' + accountId + '/' + hospitalId

# NOTE(review): the 'xxx' key and its value are redacted placeholders (the
# original referenced an undefined name). Replace both with the real parameter.
para = urllib.urlencode({'xxx': 'xxx', 'accountId': accountId})

# Tomcat installation directory; also the text searched for in process
# command lines when looking for the running server.
serverName = "/usr/local/servers/tomcat-mobile"

# Parent directory under which a per-PID folder of diagnostics is written.
javaDumpPath = '/home'

# Grace period, in minutes, after a restart during which no probe is sent.
startingTime = 1
# Intended pause, in seconds, between iterations of the probe loop.
testSleeptime = 5
def get_html():
    """Probe the monitored service and return the response body.

    Sends ``para`` as the request data to ``urlStr`` and reads the whole
    response.

    NOTE: the HTTP status code is not inspected; any response that can be
    read counts as success.

    Returns:
        The response body on success, or the sentinel string ``'error'``
        when the request or the read raises any exception.
    """
    try:
        page = urllib.urlopen(urlStr, para)
        try:
            html = page.read()
        finally:
            # Release the connection even if read() fails part-way through.
            page.close()
    except Exception:
        # logging.exception records the message and the full traceback.
        logging.exception('health check request to %s failed', urlStr)
        return 'error'
    return html

def test_tomcat():
    """Poll the service forever and restart it whenever a probe fails.

    Each iteration logs a heartbeat, then -- unless a restart happened less
    than ``startingTime`` minutes ago -- calls ``get_html()``. A result of
    ``'error'`` triggers ``start_dump()`` and resets the grace period.
    The loop then sleeps ``testSleeptime`` seconds. This function never
    returns normally.
    """
    # Time (epoch seconds) of the last restart. Starting at 1 puts it far in
    # the past so the very first iteration probes immediately.
    lastStartDate = 1
    while True:
        logging.info(' ')
        print('-')
        # Give a freshly restarted server time to come up before probing it.
        if lastStartDate + startingTime * 60 < time.time():
            if get_html() == 'error':
                logging.error('reboot ')
                start_dump()
                lastStartDate = time.time()
            else:
                logging.info('Web test success')
        # Use the configured interval instead of a duplicated literal.
        time.sleep(testSleeptime)

def get_threadId():
    """Return the PID of the first process whose command line mentions ``serverName``.

    Despite the name, the value returned is a process id, not a thread id.
    ``serverName`` is matched as literal text inside each command-line
    argument, so path characters are never interpreted as regex syntax.

    Returns:
        The PID of the first matching process, or -1 when none matches.
    """
    for pid in psutil.pids():
        try:
            cmdline = psutil.Process(pid).cmdline()
        except psutil.Error:
            # The process exited after pids() was taken, or it cannot be
            # inspected. Skip it so one unreadable process does not abort
            # the scan and block the restart.
            continue
        if any(serverName in arg for arg in cmdline):
            return pid
    return -1


def start_dump():
    """Capture JVM diagnostics for the running server, kill it, and start it again.

    If ``get_threadId()`` finds a process, a folder named after its PID is
    created under ``javaDumpPath`` and filled with the output of ``jstack``,
    ``jmap -heap`` and a ``jmap -dump`` heap dump, after which the process is
    killed with ``kill -9``. The Tomcat startup script is launched in the
    background whether or not a process was found.
    """
    pid = get_threadId()
    if pid > -1:
        pid_text = str(pid)
        dump_dir = javaDumpPath + '/' + pid_text
        logging.info("pid:" + pid_text + "Error")
        os.system('mkdir -p ' + dump_dir)
        # (log message, shell command) pairs, executed in order.
        steps = (
            ("start stack",
             'jstack ' + pid_text + ' > ' + dump_dir + '/jstack.txt'),
            ("start dump heap",
             'jmap -heap ' + pid_text + ' > ' + dump_dir + '/jmapheap.txt'),
            ("start dump Mem",
             ' jmap -dump:format=b,file=' + dump_dir + '/jmapdump.txt ' + pid_text),
            ("Kill Thread Id :" + pid_text,
             'kill -9 ' + pid_text),
        )
        for message, command in steps:
            logging.info(message)
            os.system(command)
    logging.info("restart tomcat")
    os.system(serverName + '/bin/startup.sh &')
# Run the monitor. test_tomcat() only comes back by raising, so every pass
# through this loop is a recovery from an unexpected error.
while True:
    try:
        test_tomcat()
    except Exception:
        # Records the message together with the full traceback.
        logging.exception('monitor loop failed; restarting it')
        # Pause before retrying so a persistent failure does not turn into
        # a busy loop that floods the log.
        time.sleep(testSleeptime)

最后的最后,把出问题的代码片段给干掉了,貌似现在服务没再挂过。先记录一波,

我的守护进程为毛会被操作系统杀?

原文地址:https://www.cnblogs.com/kongxianghao/p/9435887.html