Browse Source

Merge branch 'master' of https://github.com/lework/script

master
lework 5 years ago
parent
commit
29385d1697
  1. 50
      python/supervisor_healthCheck.py

50
python/supervisor_healthCheck.py

@ -117,8 +117,8 @@ def get_proc_rss(pid, cumulative=False):
procs = [] procs = []
for line in data.splitlines(): for line in data.splitlines():
pid, ppid, rss = map(int, line.split()) p_pid, p_ppid, p_rss = map(int, line.split())
procs.append(ProcInfo(pid=pid, ppid=ppid, rss=rss)) procs.append(ProcInfo(pid=p_pid, ppid=p_ppid, rss=p_rss))
# 计算rss # 计算rss
try: try:
@ -262,7 +262,6 @@ class HealthCheck(object):
initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds) initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds)
sendResolved = config.get('sendResolved', self.sendResolved) sendResolved = config.get('sendResolved', self.sendResolved)
action_type = config.get('action', 'restart') action_type = config.get('action', 'restart')
action_exec_cmd = config.get('execCmd')
check_type = config.get('type', 'HTTP').lower() check_type = config.get('type', 'HTTP').lower()
check_method = self.http_check check_method = self.http_check
@ -324,10 +323,10 @@ class HealthCheck(object):
check_state[program]['failure'] != 0 and check_state[program]['failure'] % ( check_state[program]['failure'] != 0 and check_state[program]['failure'] % (
(periodSeconds + initialDelaySeconds) * 2) == 0): (periodSeconds + initialDelaySeconds) * 2) == 0):
action_param = { action_param = {
'config': config,
'action_type': action_type, 'action_type': action_type,
'check_status': check_status, 'check_status': check_status,
'msg': check_result.get('msg', ''), 'msg': check_result.get('msg', '')
'action_exec_cmd': action_exec_cmd
} }
self.action(program, **action_param) self.action(program, **action_param)
check_state[program]['action'] = True check_state[program]['action'] = True
@ -485,25 +484,32 @@ class HealthCheck(object):
""" """
action_type = args.get('action_type') action_type = args.get('action_type')
msg = args.get('msg') msg = args.get('msg')
action_exec_cmd = args.get('action_exec_cmd')
check_status = args.get('check_status') check_status = args.get('check_status')
config = args.get('config')
self.log(program, 'Action: %s', action_type) self.log(program, 'Action: %s', action_type)
action_list = action_type.split(',') action_list = action_type.split(',')
if 'restart' in action_list: if 'restart' in action_list:
restart_result = self.action_supervistor_restart(program) restart_result = self.action_supervisor_restart(program)
msg += '\r\n Restart:%s' % restart_result msg += '\r\n Restart:%s' % restart_result
elif 'exec' in action_list: elif 'exec' in action_list:
action_exec_cmd = config.get('action_exec_cmd')
exec_result = self.action_exec(program, action_exec_cmd) exec_result = self.action_exec(program, action_exec_cmd)
msg += '\r\n Exec:%s' % exec_result msg += '\r\n Exec:%s' % exec_result
elif 'kill' in action_list:
pid_get = config.get('pidGet', 'supervisor')
pid_file = config.get('pidFile', )
pid, err = self.get_pid(program, pid_get, pid_file)
kill_result = self.action_kill(program, pid)
msg += '\r\n Kill:%s' % kill_result
if 'email' in action_list and self.mail_config: if 'email' in action_list and self.mail_config:
self.action_email(program, action_type, msg, check_status) self.action_email(program, action_type, msg, check_status)
if 'wechat' in action_list and self.wechat_config: if 'wechat' in action_list and self.wechat_config:
self.action_wechat(program, action_type, msg, check_status) self.action_wechat(program, action_type, msg, check_status)
def action_supervistor_restart(self, program): def action_supervisor_restart(self, program):
""" """
通过supervisor的rpc接口重启进程 通过supervisor的rpc接口重启进程
:param program: :param program:
@ -564,6 +570,30 @@ class HealthCheck(object):
return result return result
def action_kill(self, program, pid):
    """
    Kill a process by running ``kill -9 <pid>``.

    :param program: program name, used for logging and in the result message
    :param pid: pid of the process to kill (any value ``int()`` accepts)
    :return: 'success' when the kill command exits with 0, otherwise a
             message describing why the kill failed
    """
    self.log(program, 'Action: kill')

    # A missing or non-numeric pid cannot be killed; report it as a
    # failure instead of letting int() raise out of the action handler.
    try:
        pid_num = int(pid)
    except (TypeError, ValueError):
        pid_num = None

    # Refuse pids below 3 (presumably to protect pid 0/1/2 and negative,
    # process-group style targets -- confirm the intended range).
    # The failure message must format `pid`: `exitcode` is not assigned
    # yet at this point, so referencing it raised UnboundLocalError.
    if pid_num is None or pid_num < 3:
        result = 'Failed to kill %s, pid: %s ' % (program, pid)
        self.log(program, "Action: kill result %s", result)
        return result

    # `shell` is the file-level helper; only its exit code is used here.
    exitcode, _stdout, _stderr = shell("kill -9 %s" % pid_num)

    if exitcode == 0:
        self.log(program, "Action: kill result success")
        return 'success'

    result = 'Failed to kill %s, pid: %s exiting: %s' % (program, pid, exitcode)
    self.log(program, "Action: kill result %s", result)
    return result
def action_email(self, program, action_type, msg, check_status): def action_email(self, program, action_type, msg, check_status):
""" """
发送email 发送email
@ -786,7 +816,7 @@ cat1: # supervisor中配置的program名称
type: mem # 检查类型: http,tcp,mem,cpu 默认: http type: mem # 检查类型: http,tcp,mem,cpu 默认: http
maxRss: 1024 # 内存阈值, 超过则为检测失败. 单位MB, 默认: 1024 maxRss: 1024 # 内存阈值, 超过则为检测失败. 单位MB, 默认: 1024
cumulative: True # 是否统计子进程的内存, 默认: False cumulative: True # 是否统计子进程的内存, 默认: False
pidGet: supervistor # 获取pid的方式: supervistor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervistor pidGet: supervisor # 获取pid的方式: supervisor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervisor
pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用 pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用
periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5 periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5
initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1 initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1
@ -800,7 +830,7 @@ cat1: # supervisor中配置的program名称
cat2: # supervisor中配置的program名称 cat2: # supervisor中配置的program名称
type: cpu # 检查类型: http,tcp,mem,cpu 默认: http type: cpu # 检查类型: http,tcp,mem,cpu 默认: http
maxCpu: 80 # CPU阈值, 超过则为检测失败. 单位% 默认: 90% maxCpu: 80 # CPU阈值, 超过则为检测失败. 单位% 默认: 90%
pidGet: supervistor # 获取pid的方式: supervistor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervistor pidGet: supervisor # 获取pid的方式: supervisor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervisor
pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用 pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用
periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5 periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5
initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1 initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1

Loading…
Cancel
Save