Browse Source

add exec action

master
lework 5 years ago
parent
commit
2cbfcd54c6
  1. 189
      python/supervisor_healthCheck.py

189
python/supervisor_healthCheck.py

@ -17,6 +17,7 @@ import smtplib
import datetime import datetime
import platform import platform
import threading import threading
import subprocess
from xmlrpclib import ServerProxy, Fault from xmlrpclib import ServerProxy, Fault
from email.header import Header from email.header import Header
from email.mime.text import MIMEText from email.mime.text import MIMEText
@ -27,7 +28,6 @@ try:
except ImportError: except ImportError:
import http.client as httplib import http.client as httplib
PY2 = sys.version_info[0] == 2 PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3 PY3 = sys.version_info[0] == 3
@ -51,10 +51,18 @@ def shell(cmd):
""" """
执行系统命令 执行系统命令
:param cmd: :param cmd:
:return: :return: (exitcode, stdout, stderr)
""" """
with os.popen(cmd) as f: # with os.popen(cmd) as f:
return f.read() # return f.read()
env_to_pass = dict(os.environ)
proc = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_to_pass)
proc.wait()
return (proc.returncode,) + proc.communicate()
def get_proc_cpu(pid): def get_proc_cpu(pid):
@ -65,12 +73,12 @@ def get_proc_cpu(pid):
""" """
pscommand = 'ps -opcpu= -p %s' pscommand = 'ps -opcpu= -p %s'
data = shell(pscommand % pid) _, data, _ = shell(pscommand % pid)
if not data: if not data:
# 未获取到数据值,或者没有此pid信息 # 未获取到数据值,或者没有此pid信息
return None return None
try: try:
cpu_utilization = data.lstrip().rstrip() cpu_utilization = data.strip()
cpu_utilization = float(cpu_utilization) cpu_utilization = float(cpu_utilization)
except ValueError: except ValueError:
# 获取的结果不包含数据,或者无法识别cpu_utilization # 获取的结果不包含数据,或者无法识别cpu_utilization
@ -101,7 +109,7 @@ def get_proc_rss(pid, cumulative=False):
if cumulative: if cumulative:
# 统计进程的子进程rss # 统计进程的子进程rss
data = shell(pstreecommand) _, data, _ = shell(pstreecommand)
data = data.strip() data = data.strip()
procs = [] procs = []
@ -120,12 +128,12 @@ def get_proc_rss(pid, cumulative=False):
return None return None
else: else:
data = shell(pscommand % pid) _, data, _ = shell(pscommand % pid)
if not data: if not data:
# 未获取到数据值,或者没有此pid信息 # 未获取到数据值,或者没有此pid信息
return None return None
try: try:
rss = data.lstrip().rstrip() rss = data.strip()
rss = int(rss) rss = int(rss)
except ValueError: except ValueError:
# 获取的结果不包含数据,或者无法识别rss # 获取的结果不包含数据,或者无法识别rss
@ -149,7 +157,7 @@ class HealthCheck(object):
if 'config' in config: if 'config' in config:
self.mail_config = config['config'].get('mail') self.mail_config = config['config'].get('mail')
self.wechat_config = config['config'].get('wechat') self.wechat_config = config['config'].get('wechat')
self.supervisor_url = config['config'].get('supervistor_url', self.supervisor_url) self.supervisor_url = config['config'].get('supervistorUrl', self.supervisor_url)
config.pop('config') config.pop('config')
self.program_config = config self.program_config = config
@ -163,6 +171,51 @@ class HealthCheck(object):
self.cumulative = False self.cumulative = False
self.max_cpu = 90 self.max_cpu = 90
def get_pid(self, program, kind, pid_file):
"""
获取进程pid
:param program:
:param kind:
:param args:
:return:
"""
pid = 0
err = ''
if kind == 'supervisor':
try:
s = ServerProxy(self.supervisor_url)
info = s.supervisor.getProcessInfo(program)
pid = info.get('pid')
err = info.get('description')
except Exception as e:
self.log(program, "PID: Can't get pid from supervisor %s ", e)
elif kind == 'name':
pscommand = "ps -A -o pid,cmd |grep '[%s]%s' | awk '{print $1}' | head -1"
exitcode, stdout, stderr = shell(pscommand % (program[0], program[1:]))
if exitcode == 0:
pid = stdout.strip()
else:
self.log(program, "PID: Can't get pid from name %s ", stderr)
pid = 0
err = stderr
elif kind == 'file':
if pid_file:
try:
with open(pid_file) as f:
pid = f.read().strip()
except Exception as e:
self.log(program, "PID: Can't get pid from file %s ", e)
err = "Can't get pid from file"
else:
err = "PID: pid file not set"
self.log(program, err)
if not pid:
pid = 0
return pid, err
def log(self, program, msg, *args): def log(self, program, msg, *args):
""" """
写信息到 STDERR. 写信息到 STDERR.
@ -189,6 +242,7 @@ class HealthCheck(object):
successThreshold = config.get('successThreshold', self.successThreshold) successThreshold = config.get('successThreshold', self.successThreshold)
initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds) initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds)
action_type = config.get('action', 'restart') action_type = config.get('action', 'restart')
action_exec_cmd = config.get('execCmd')
check_type = config.get('type', 'HTTP').lower() check_type = config.get('type', 'HTTP').lower()
check_method = self.http_check check_method = self.http_check
@ -234,9 +288,13 @@ class HealthCheck(object):
if check_state[program]['failure'] >= failureThreshold: if check_state[program]['failure'] >= failureThreshold:
# 失败后, 只触发一次action,或者检测错误数是2倍periodSeconds的平方数时触发(避免重启失败导致服务一直不可用) # 失败后, 只触发一次action,或者检测错误数是2倍periodSeconds的平方数时触发(避免重启失败导致服务一直不可用)
if not check_state[program]['action'] or ( if not check_state[program]['action'] or (
check_state[program]['failure'] != 0 and check_state[program][ check_state[program]['failure'] != 0 and check_state[program]['failure'] % (periodSeconds * 2) == 0):
'failure'] % (periodSeconds * 2) == 0): action_param = {
self.action(program, action_type, check_result.get('msg', '')) 'action_type': action_type,
'check_result': check_result.get('msg', ''),
'action_exec_cmd': action_exec_cmd
}
self.action(program, **action_param)
check_state[program]['action'] = True check_state[program]['action'] = True
# 间隔时间清0 # 间隔时间清0
@ -333,25 +391,23 @@ class HealthCheck(object):
:return: dict :return: dict
""" """
program = config.get('program') program = config.get('program')
max_rss = config.get('max_rss', self.max_rss) max_rss = config.get('maxRss', self.max_rss)
cumulative = config.get('cumulative', self.cumulative) cumulative = config.get('cumulative', self.cumulative)
pid_get = config.get('pidGet', 'supervisor')
pid_file = config.get('pidFile', )
check_info = 'max_rss:%sMB cumulative:%s' % (max_rss, cumulative) check_info = 'max_rss:%sMB cumulative:%s' % (max_rss, cumulative)
try: pid, err = self.get_pid(program, pid_get, pid_file)
s = ServerProxy(self.supervisor_url)
info = s.supervisor.getProcessInfo(program)
pid = info.get('pid')
if pid == 0: if pid == 0:
self.log(program, 'MEM: check error, program not starting') self.log(program, 'MEM: check error, program not starting')
return {'status': 'failure', return {'status': 'failure',
'msg': '[mem_check] program not starting, message: %s' % (info.get('description')),'info': check_info} 'msg': '[mem_check] program not starting, message: %s' % err,
'info': check_info}
now_rss = get_proc_rss(pid, cumulative) now_rss = get_proc_rss(pid, cumulative)
check_info = '%s now_rss:%sMB' % (check_info, now_rss) check_info = '%s now_rss:%sMB pid:%s' % (check_info, now_rss, pid)
if now_rss >= int(max_rss): if now_rss >= int(max_rss):
return {'status': 'failure', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss), 'info': check_info} return {'status': 'failure', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss),
except Exception as e: 'info': check_info}
self.log(program, 'MEM: check error, %s', e)
return {'status': 'failure', 'msg': '[mem_check] %s' % e,'info': check_info}
return {'status': 'success', 'info': check_info} return {'status': 'success', 'info': check_info}
@ -362,28 +418,27 @@ class HealthCheck(object):
:return: dict :return: dict
""" """
program = config.get('program') program = config.get('program')
max_cpu = config.get('max_cpu', self.max_cpu) max_cpu = config.get('maxCpu', self.max_cpu)
pid_get = config.get('pidGet', 'supervisor')
pid_file = config.get('pidFile', )
check_info = 'max_cpu:{cpu}%'.format(cpu=max_cpu) check_info = 'max_cpu:{cpu}%'.format(cpu=max_cpu)
try: pid, err = self.get_pid(program, pid_get, pid_file)
s = ServerProxy(self.supervisor_url)
info = s.supervisor.getProcessInfo(program)
pid = info.get('pid')
if pid == 0: if pid == 0:
self.log(program, 'CPU: check error, program not starting') self.log(program, 'CPU: check error, program not starting')
return {'status': 'failure', return {'status': 'failure',
'msg': '[cpu_check] program not starting, message: %s' % (info.get('description')),'info': check_info} 'msg': '[cpu_check] program not starting, message: %s' % err,
'info': check_info}
now_cpu = get_proc_cpu(pid) now_cpu = get_proc_cpu(pid)
check_info = '{info} now_cpu:{now}%'.format(info=check_info, now=now_cpu) check_info = '{info} now_cpu:{now}% pid:{pid}'.format(info=check_info, now=now_cpu, pid=pid)
if now_cpu >= int(max_cpu): if now_cpu >= int(max_cpu):
return {'status': 'failure', 'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu),'info': check_info} return {'status': 'failure',
except Exception as e: 'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu),
self.log(program, 'CPU: check error, %s', e) 'info': check_info}
return {'status': 'failure', 'msg': '[cpu_check] %s' % e,'info': check_info}
return {'status': 'success', 'info': check_info} return {'status': 'success', 'info': check_info}
def action(self, program, action_type, error): def action(self, program, **args):
""" """
执行动作 执行动作
:param program: :param program:
@ -391,15 +446,24 @@ class HealthCheck(object):
:param error: :param error:
:return: :return:
""" """
action_type = args.get('action_type')
check_result = args.get('check_result')
action_exec_cmd = args.get('action_exec_cmd')
self.log(program, 'Action: %s', action_type) self.log(program, 'Action: %s', action_type)
action_list = action_type.split(',') action_list = action_type.split(',')
if 'restart' in action_list: if 'restart' in action_list:
restart_result = self.action_supervistor_restart(program) restart_result = self.action_supervistor_restart(program)
error += '\r\n Restart:%s' % restart_result check_result += '\r\n Restart:%s' % restart_result
elif 'exec' in action_list:
exec_result = self.action_exec(program, action_exec_cmd)
check_result += '\r\n Exec:%s' % exec_result
if 'email' in action_list and self.mail_config: if 'email' in action_list and self.mail_config:
self.action_email(program, action_type, error) self.action_email(program, action_type, check_result)
if 'wechat' in action_list and self.wechat_config: if 'wechat' in action_list and self.wechat_config:
self.action_wechat(program, action_type, error) self.action_wechat(program, action_type, check_result)
def action_supervistor_restart(self, program): def action_supervistor_restart(self, program):
""" """
@ -442,6 +506,26 @@ class HealthCheck(object):
return result return result
def action_exec(self, program, cmd):
"""
执行系统命令
:param program:
:param exec:
:return:
"""
self.log(program, 'Action: exec')
result = 'success'
exitcode, stdout, stderr = shell(cmd)
if exitcode == 0:
self.log(program, "Action: exec result success")
else:
result = 'Failed to exec %s, exiting: %s' % (program, exitcode)
self.log(program, "Action: exec result %s", result)
return result
def action_email(self, program, action_type, msg): def action_email(self, program, action_type, msg):
""" """
发送email 发送email
@ -616,7 +700,7 @@ if __name__ == '__main__':
if not os.path.exists(config_file): if not os.path.exists(config_file):
example_config = """ example_config = """
config: # 脚本配置名称,请勿更改 config: # 脚本配置名称,请勿更改
supervisord_url: http://localhost:9001/RPC2 # supervisor的rpc接口地址 supervisordUrl: http://localhost:9001/RPC2 # supervisor的rpc接口地址
# mail: # stmp配置 # mail: # stmp配置
# host: 'smtp.test.com' # host: 'smtp.test.com'
# port': '465' # port': '465'
@ -631,25 +715,34 @@ config: # 脚本配置名称,请勿更
# toparty: # toparty:
# totag: # totag:
# 内存方式监控
cat1: # supervisor中配置的program名称 cat1: # supervisor中配置的program名称
type: mem # 检查类型: http,tcp,mem,cpu 默认: http type: mem # 检查类型: http,tcp,mem,cpu 默认: http
max_rss: 1024 # 单位MB, 默认: 1024 maxRss: 1024 # 内存阈值, 超过则为检测失败. 单位MB, 默认: 1024
cumulative: True # 是否统计子进程的内存, 默认: False cumulative: True # 是否统计子进程的内存, 默认: False
pidGet: supervistor # 获取pid的方式: supervistor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervistor
pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用
periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5 periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5
initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1 initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3 failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,email # 触发的动作: restart,email,wechat 默认: restart action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
# cpu方式监控
cat2: # supervisor中配置的program名称 cat2: # supervisor中配置的program名称
type: cpu # 检查类型: http,tcp,mem,cpu 默认: http type: cpu # 检查类型: http,tcp,mem,cpu 默认: http
max_cpu: 80 # cpu使用百分比,单位% 默认: 90% maxCpu: 80 # CPU阈值, 超过则为检测失败. 单位% 默认: 90%
pidGet: supervistor # 获取pid的方式: supervistor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervistor
pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用
periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5 periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5
initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1 initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3 failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,wechat # 触发的动作: restart,email,wechat 默认: restart action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
# HTTP方式监控
cat3: cat3:
type: HTTP type: HTTP
mode: POST # http动作:POST,GET 默认: GET mode: POST # http动作:POST,GET 默认: GET
@ -665,7 +758,10 @@ cat3:
timeoutSeconds: 5 # 检查超时的秒数, 默认: 3 timeoutSeconds: 5 # 检查超时的秒数, 默认: 3
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3 failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,email # 触发的动作: restart,email,wechat 默认: restart action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
# TCP方式监控
cat4: cat4:
type: TCP type: TCP
host: 127.0.0.1 # 主机地址, 默认: localhost host: 127.0.0.1 # 主机地址, 默认: localhost
@ -675,7 +771,8 @@ cat4:
timeoutSeconds: 5 # 检查超时的秒数, 默认: 3 timeoutSeconds: 5 # 检查超时的秒数, 默认: 3
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3 failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,email # 触发的动作: restart,email,wechat 默认: restart action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
""" """
with open(config_file, 'w') as f: with open(config_file, 'w') as f:
f.write(example_config) f.write(example_config)

Loading…
Cancel
Save