Browse Source

add exec action

master
lework 5 years ago
parent
commit
2cbfcd54c6
  1. 229
      python/supervisor_healthCheck.py

229
python/supervisor_healthCheck.py

@ -17,6 +17,7 @@ import smtplib @@ -17,6 +17,7 @@ import smtplib
import datetime
import platform
import threading
import subprocess
from xmlrpclib import ServerProxy, Fault
from email.header import Header
from email.mime.text import MIMEText
@ -27,7 +28,6 @@ try: @@ -27,7 +28,6 @@ try:
except ImportError:
import http.client as httplib
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
@ -51,10 +51,18 @@ def shell(cmd): @@ -51,10 +51,18 @@ def shell(cmd):
"""
执行系统命令
:param cmd:
:return:
:return: (exitcode, stdout, stderr)
"""
with os.popen(cmd) as f:
return f.read()
# with os.popen(cmd) as f:
# return f.read()
env_to_pass = dict(os.environ)
proc = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_to_pass)
proc.wait()
return (proc.returncode,) + proc.communicate()
def get_proc_cpu(pid):
@ -65,12 +73,12 @@ def get_proc_cpu(pid): @@ -65,12 +73,12 @@ def get_proc_cpu(pid):
"""
pscommand = 'ps -opcpu= -p %s'
data = shell(pscommand % pid)
_, data, _ = shell(pscommand % pid)
if not data:
# 未获取到数据值,或者没有此pid信息
return None
try:
cpu_utilization = data.lstrip().rstrip()
cpu_utilization = data.strip()
cpu_utilization = float(cpu_utilization)
except ValueError:
# 获取的结果不包含数据,或者无法识别cpu_utilization
@ -101,7 +109,7 @@ def get_proc_rss(pid, cumulative=False): @@ -101,7 +109,7 @@ def get_proc_rss(pid, cumulative=False):
if cumulative:
# 统计进程的子进程rss
data = shell(pstreecommand)
_, data, _ = shell(pstreecommand)
data = data.strip()
procs = []
@ -120,12 +128,12 @@ def get_proc_rss(pid, cumulative=False): @@ -120,12 +128,12 @@ def get_proc_rss(pid, cumulative=False):
return None
else:
data = shell(pscommand % pid)
_, data, _ = shell(pscommand % pid)
if not data:
# 未获取到数据值,或者没有此pid信息
return None
try:
rss = data.lstrip().rstrip()
rss = data.strip()
rss = int(rss)
except ValueError:
# 获取的结果不包含数据,或者无法识别rss
@ -149,7 +157,7 @@ class HealthCheck(object): @@ -149,7 +157,7 @@ class HealthCheck(object):
if 'config' in config:
self.mail_config = config['config'].get('mail')
self.wechat_config = config['config'].get('wechat')
self.supervisor_url = config['config'].get('supervistor_url', self.supervisor_url)
self.supervisor_url = config['config'].get('supervistorUrl', self.supervisor_url)
config.pop('config')
self.program_config = config
@ -163,6 +171,51 @@ class HealthCheck(object): @@ -163,6 +171,51 @@ class HealthCheck(object):
self.cumulative = False
self.max_cpu = 90
def get_pid(self, program, kind, pid_file):
"""
获取进程pid
:param program:
:param kind:
:param args:
:return:
"""
pid = 0
err = ''
if kind == 'supervisor':
try:
s = ServerProxy(self.supervisor_url)
info = s.supervisor.getProcessInfo(program)
pid = info.get('pid')
err = info.get('description')
except Exception as e:
self.log(program, "PID: Can't get pid from supervisor %s ", e)
elif kind == 'name':
pscommand = "ps -A -o pid,cmd |grep '[%s]%s' | awk '{print $1}' | head -1"
exitcode, stdout, stderr = shell(pscommand % (program[0], program[1:]))
if exitcode == 0:
pid = stdout.strip()
else:
self.log(program, "PID: Can't get pid from name %s ", stderr)
pid = 0
err = stderr
elif kind == 'file':
if pid_file:
try:
with open(pid_file) as f:
pid = f.read().strip()
except Exception as e:
self.log(program, "PID: Can't get pid from file %s ", e)
err = "Can't get pid from file"
else:
err = "PID: pid file not set"
self.log(program, err)
if not pid:
pid = 0
return pid, err
def log(self, program, msg, *args):
"""
写信息到 STDERR.
@ -189,6 +242,7 @@ class HealthCheck(object): @@ -189,6 +242,7 @@ class HealthCheck(object):
successThreshold = config.get('successThreshold', self.successThreshold)
initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds)
action_type = config.get('action', 'restart')
action_exec_cmd = config.get('execCmd')
check_type = config.get('type', 'HTTP').lower()
check_method = self.http_check
@ -234,9 +288,13 @@ class HealthCheck(object): @@ -234,9 +288,13 @@ class HealthCheck(object):
if check_state[program]['failure'] >= failureThreshold:
# 失败后, 只触发一次action,或者检测错误数是2倍periodSeconds的平方数时触发(避免重启失败导致服务一直不可用)
if not check_state[program]['action'] or (
check_state[program]['failure'] != 0 and check_state[program][
'failure'] % (periodSeconds * 2) == 0):
self.action(program, action_type, check_result.get('msg', ''))
check_state[program]['failure'] != 0 and check_state[program]['failure'] % (periodSeconds * 2) == 0):
action_param = {
'action_type': action_type,
'check_result': check_result.get('msg', ''),
'action_exec_cmd': action_exec_cmd
}
self.action(program, **action_param)
check_state[program]['action'] = True
# 间隔时间清0
@ -287,7 +345,7 @@ class HealthCheck(object): @@ -287,7 +345,7 @@ class HealthCheck(object):
self.log(program, 'HTTP: config_json not loads: %s , %s', json, e)
check_info = '%s %s %s %s %s %s' % (config_host, config_port, config_path, config_method,
config_body, headers)
config_body, headers)
try:
httpClient = httplib.HTTPConnection(config_host, config_port, timeout=config_timeoutSeconds)
@ -303,7 +361,7 @@ class HealthCheck(object): @@ -303,7 +361,7 @@ class HealthCheck(object):
if res.status != httplib.OK:
return {'status': 'failure', 'msg': '[http_check] return code %s' % res.status, 'info': check_info}
return {'status': 'success','info': check_info}
return {'status': 'success', 'info': check_info}
def tcp_check(self, config):
"""
@ -324,7 +382,7 @@ class HealthCheck(object): @@ -324,7 +382,7 @@ class HealthCheck(object):
except Exception as e:
self.log(program, 'TCP: conn error, %s', e)
return {'status': 'failure', 'msg': '[tcp_check] %s' % e, 'info': check_info}
return {'status': 'success','info': check_info}
return {'status': 'success', 'info': check_info}
def mem_check(self, config):
"""
@ -333,27 +391,25 @@ class HealthCheck(object): @@ -333,27 +391,25 @@ class HealthCheck(object):
:return: dict
"""
program = config.get('program')
max_rss = config.get('max_rss', self.max_rss)
max_rss = config.get('maxRss', self.max_rss)
cumulative = config.get('cumulative', self.cumulative)
check_info = 'max_rss:%sMB cumulative:%s' % (max_rss,cumulative)
try:
s = ServerProxy(self.supervisor_url)
info = s.supervisor.getProcessInfo(program)
pid = info.get('pid')
if pid == 0:
self.log(program, 'MEM: check error, program not starting')
return {'status': 'failure',
'msg': '[mem_check] program not starting, message: %s' % (info.get('description')),'info': check_info}
now_rss = get_proc_rss(pid, cumulative)
check_info = '%s now_rss:%sMB' % (check_info, now_rss)
if now_rss >= int(max_rss):
return {'status': 'failure', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss), 'info': check_info}
except Exception as e:
self.log(program, 'MEM: check error, %s', e)
return {'status': 'failure', 'msg': '[mem_check] %s' % e,'info': check_info}
return {'status': 'success','info': check_info}
pid_get = config.get('pidGet', 'supervisor')
pid_file = config.get('pidFile', )
check_info = 'max_rss:%sMB cumulative:%s' % (max_rss, cumulative)
pid, err = self.get_pid(program, pid_get, pid_file)
if pid == 0:
self.log(program, 'MEM: check error, program not starting')
return {'status': 'failure',
'msg': '[mem_check] program not starting, message: %s' % err,
'info': check_info}
now_rss = get_proc_rss(pid, cumulative)
check_info = '%s now_rss:%sMB pid:%s' % (check_info, now_rss, pid)
if now_rss >= int(max_rss):
return {'status': 'failure', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss),
'info': check_info}
return {'status': 'success', 'info': check_info}
def cpu_check(self, config):
"""
@ -362,28 +418,27 @@ class HealthCheck(object): @@ -362,28 +418,27 @@ class HealthCheck(object):
:return: dict
"""
program = config.get('program')
max_cpu = config.get('max_cpu', self.max_cpu)
max_cpu = config.get('maxCpu', self.max_cpu)
pid_get = config.get('pidGet', 'supervisor')
pid_file = config.get('pidFile', )
check_info = 'max_cpu:{cpu}%'.format(cpu=max_cpu)
try:
s = ServerProxy(self.supervisor_url)
info = s.supervisor.getProcessInfo(program)
pid = info.get('pid')
if pid == 0:
self.log(program, 'CPU: check error, program not starting')
return {'status': 'failure',
'msg': '[cpu_check] program not starting, message: %s' % (info.get('description')),'info': check_info}
now_cpu = get_proc_cpu(pid)
check_info = '{info} now_cpu:{now}%'.format(info=check_info, now=now_cpu)
if now_cpu >= int(max_cpu):
return {'status': 'failure', 'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu),'info': check_info}
except Exception as e:
self.log(program, 'CPU: check error, %s', e)
return {'status': 'failure', 'msg': '[cpu_check] %s' % e,'info': check_info}
return {'status': 'success','info': check_info}
def action(self, program, action_type, error):
pid, err = self.get_pid(program, pid_get, pid_file)
if pid == 0:
self.log(program, 'CPU: check error, program not starting')
return {'status': 'failure',
'msg': '[cpu_check] program not starting, message: %s' % err,
'info': check_info}
now_cpu = get_proc_cpu(pid)
check_info = '{info} now_cpu:{now}% pid:{pid}'.format(info=check_info, now=now_cpu, pid=pid)
if now_cpu >= int(max_cpu):
return {'status': 'failure',
'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu),
'info': check_info}
return {'status': 'success', 'info': check_info}
def action(self, program, **args):
"""
执行动作
:param program:
@ -391,15 +446,24 @@ class HealthCheck(object): @@ -391,15 +446,24 @@ class HealthCheck(object):
:param error:
:return:
"""
action_type = args.get('action_type')
check_result = args.get('check_result')
action_exec_cmd = args.get('action_exec_cmd')
self.log(program, 'Action: %s', action_type)
action_list = action_type.split(',')
if 'restart' in action_list:
restart_result = self.action_supervistor_restart(program)
error += '\r\n Restart:%s' % restart_result
check_result += '\r\n Restart:%s' % restart_result
elif 'exec' in action_list:
exec_result = self.action_exec(program, action_exec_cmd)
check_result += '\r\n Exec:%s' % exec_result
if 'email' in action_list and self.mail_config:
self.action_email(program, action_type, error)
self.action_email(program, action_type, check_result)
if 'wechat' in action_list and self.wechat_config:
self.action_wechat(program, action_type, error)
self.action_wechat(program, action_type, check_result)
def action_supervistor_restart(self, program):
"""
@ -442,6 +506,26 @@ class HealthCheck(object): @@ -442,6 +506,26 @@ class HealthCheck(object):
return result
def action_exec(self, program, cmd):
"""
执行系统命令
:param program:
:param exec:
:return:
"""
self.log(program, 'Action: exec')
result = 'success'
exitcode, stdout, stderr = shell(cmd)
if exitcode == 0:
self.log(program, "Action: exec result success")
else:
result = 'Failed to exec %s, exiting: %s' % (program, exitcode)
self.log(program, "Action: exec result %s", result)
return result
def action_email(self, program, action_type, msg):
"""
发送email
@ -616,7 +700,7 @@ if __name__ == '__main__': @@ -616,7 +700,7 @@ if __name__ == '__main__':
if not os.path.exists(config_file):
example_config = """
config: # 脚本配置名称,请勿更改
supervisord_url: http://localhost:9001/RPC2 # supervisor的rpc接口地址
supervisordUrl: http://localhost:9001/RPC2 # supervisor的rpc接口地址
# mail: # stmp配置
# host: 'smtp.test.com'
# port': '465'
@ -631,25 +715,34 @@ config: # 脚本配置名称,请勿更 @@ -631,25 +715,34 @@ config: # 脚本配置名称,请勿更
# toparty:
# totag:
# 内存方式监控
cat1: # supervisor中配置的program名称
type: mem # 检查类型: http,tcp,mem,cpu 默认: http
max_rss: 1024 # 单位MB, 默认: 1024
maxRss: 1024 # 内存阈值, 超过则为检测失败. 单位MB, 默认: 1024
cumulative: True # 是否统计子进程的内存, 默认: False
pidGet: supervistor # 获取pid的方式: supervistor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervistor
pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用
periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5
initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,email # 触发的动作: restart,email,wechat 默认: restart
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
# cpu方式监控
cat2: # supervisor中配置的program名称
type: cpu # 检查类型: http,tcp,mem,cpu 默认: http
max_cpu: 80 # cpu使用百分比,单位% 默认: 90%
maxCpu: 80 # CPU阈值, 超过则为检测失败. 单位% 默认: 90%
pidGet: supervistor # 获取pid的方式: supervistor,name,file, 选择name时,按program名称搜索pid,选择file时,需指定pidFile 默认: supervistor
pidFile: /var/run/t.pid # 指定pid文件的路径, 只在pidGet为file的时候有用
periodSeconds: 10 # 检查的频率(以秒为单位), 默认: 5
initialDelaySeconds: 10 # 首次检查等待的时间(以秒为单位), 默认: 1
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,wechat # 触发的动作: restart,email,wechat 默认: restart
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
# HTTP方式监控
cat3:
type: HTTP
mode: POST # http动作:POST,GET 默认: GET
@ -665,7 +758,10 @@ cat3: @@ -665,7 +758,10 @@ cat3:
timeoutSeconds: 5 # 检查超时的秒数, 默认: 3
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,email # 触发的动作: restart,email,wechat 默认: restart
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
# TCP方式监控
cat4:
type: TCP
host: 127.0.0.1 # 主机地址, 默认: localhost
@ -675,7 +771,8 @@ cat4: @@ -675,7 +771,8 @@ cat4:
timeoutSeconds: 5 # 检查超时的秒数, 默认: 3
failureThreshold: 3 # 检查成功后,最少连续检查失败多少次才被认定为失败, 默认: 3
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1
action: restart,email # 触发的动作: restart,email,wechat 默认: restart
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart
execCmd: command # action exec 的执行命令
"""
with open(config_file, 'w') as f:
f.write(example_config)

Loading…
Cancel
Save