|
|
@ -1,7 +1,7 @@ |
|
|
|
#!/usr/bin/python |
|
|
|
#!/usr/bin/python |
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
|
|
|
|
|
|
|
|
# @Time : 2019-11-22 |
|
|
|
# @Time : 2019-11-25 |
|
|
|
# @Author : lework |
|
|
|
# @Author : lework |
|
|
|
# @Desc : 针对supervisor的应用进行健康检查 |
|
|
|
# @Desc : 针对supervisor的应用进行健康检查 |
|
|
|
|
|
|
|
|
|
|
@ -13,6 +13,7 @@ import json |
|
|
|
import yaml |
|
|
|
import yaml |
|
|
|
import base64 |
|
|
|
import base64 |
|
|
|
import socket |
|
|
|
import socket |
|
|
|
|
|
|
|
import signal |
|
|
|
import smtplib |
|
|
|
import smtplib |
|
|
|
import datetime |
|
|
|
import datetime |
|
|
|
import platform |
|
|
|
import platform |
|
|
@ -21,51 +22,34 @@ import subprocess |
|
|
|
from email.header import Header |
|
|
|
from email.header import Header |
|
|
|
from email.mime.text import MIMEText |
|
|
|
from email.mime.text import MIMEText |
|
|
|
from collections import namedtuple |
|
|
|
from collections import namedtuple |
|
|
|
|
|
|
|
from supervisor.xmlrpc import SupervisorTransport |
|
|
|
|
|
|
|
|
|
|
|
PY2 = sys.version_info[0] == 2 |
|
|
|
|
|
|
|
PY3 = sys.version_info[0] == 3 |
|
|
|
PY3 = sys.version_info[0] == 3 |
|
|
|
|
|
|
|
|
|
|
|
if PY3: |
|
|
|
if PY3: |
|
|
|
import http.client as httplib |
|
|
|
import http.client as httplib |
|
|
|
from xmlrpc.client import Transport, ServerProxy, Fault |
|
|
|
from xmlrpc.client import Transport, ServerProxy, Fault |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def iterkeys(d, **kw): |
|
|
|
def iterkeys(d, **kw): |
|
|
|
return iter(d.keys(**kw)) |
|
|
|
return iter(d.keys(**kw)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def iteritems(d, **kw): |
|
|
|
def iteritems(d, **kw): |
|
|
|
return iter(d.items(**kw)) |
|
|
|
return iter(d.items(**kw)) |
|
|
|
else: |
|
|
|
else: |
|
|
|
import httplib |
|
|
|
import httplib |
|
|
|
from xmlrpclib import Transport, ServerProxy, Fault |
|
|
|
from xmlrpclib import Transport, ServerProxy, Fault |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def iterkeys(d, **kw): |
|
|
|
def iterkeys(d, **kw): |
|
|
|
return d.iterkeys(**kw) |
|
|
|
return d.iterkeys(**kw) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def iteritems(d, **kw): |
|
|
|
def iteritems(d, **kw): |
|
|
|
return d.iteritems(**kw) |
|
|
|
return d.iteritems(**kw) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UnixStreamHTTPConnection(httplib.HTTPConnection): |
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
使用socket连接 |
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
def connect(self): |
|
|
|
|
|
|
|
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) |
|
|
|
|
|
|
|
self.sock.connect(self.host) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UnixStreamTransport(Transport, object): |
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
自定义xmlrpclb的Transport,以便使用socket文件连接通信 |
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
def __init__(self, socket_path): |
|
|
|
|
|
|
|
self.socket_path = socket_path |
|
|
|
|
|
|
|
super(UnixStreamTransport, self).__init__() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_connection(self, host): |
|
|
|
|
|
|
|
return UnixStreamHTTPConnection(self.socket_path) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def shell(cmd): |
|
|
|
def shell(cmd): |
|
|
|
""" |
|
|
|
""" |
|
|
|
执行系统命令 |
|
|
|
执行系统命令 |
|
|
@ -171,12 +155,14 @@ class HealthCheck(object): |
|
|
|
|
|
|
|
|
|
|
|
self.mail_config = None |
|
|
|
self.mail_config = None |
|
|
|
self.wechat_config = None |
|
|
|
self.wechat_config = None |
|
|
|
self.supervisor_url = '/var/run/supervisor.sock' |
|
|
|
self.supervisord_url = 'unix:///var/run/supervisor.sock' |
|
|
|
|
|
|
|
|
|
|
|
if 'config' in config: |
|
|
|
if 'config' in config: |
|
|
|
self.mail_config = config['config'].get('mail') |
|
|
|
self.mail_config = config['config'].get('mail') |
|
|
|
self.wechat_config = config['config'].get('wechat') |
|
|
|
self.wechat_config = config['config'].get('wechat') |
|
|
|
self.supervisor_url = config['config'].get('supervistorUrl', self.supervisor_url) |
|
|
|
self.supervisord_url = config['config'].get('supervisordUrl', self.supervisord_url) |
|
|
|
|
|
|
|
self.supervisord_user = config['config'].get('supervisordUser', None) |
|
|
|
|
|
|
|
self.supervisord_pass = config['config'].get('supervisordPass', None) |
|
|
|
config.pop('config') |
|
|
|
config.pop('config') |
|
|
|
|
|
|
|
|
|
|
|
self.program_config = config |
|
|
|
self.program_config = config |
|
|
@ -185,20 +171,20 @@ class HealthCheck(object): |
|
|
|
self.failureThreshold = 3 |
|
|
|
self.failureThreshold = 3 |
|
|
|
self.successThreshold = 1 |
|
|
|
self.successThreshold = 1 |
|
|
|
self.initialDelaySeconds = 1 |
|
|
|
self.initialDelaySeconds = 1 |
|
|
|
|
|
|
|
self.sendResolved = False |
|
|
|
|
|
|
|
|
|
|
|
self.max_rss = 1024 |
|
|
|
self.max_rss = 1024 |
|
|
|
self.cumulative = False |
|
|
|
self.cumulative = False |
|
|
|
self.max_cpu = 90 |
|
|
|
self.max_cpu = 90 |
|
|
|
|
|
|
|
|
|
|
|
def get_supervisor_conn(self): |
|
|
|
def get_supervisord_conn(self): |
|
|
|
""" |
|
|
|
""" |
|
|
|
获取supervisor的连接 |
|
|
|
获取supervisor的连接 |
|
|
|
:return: |
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
if 'RPC2' == self.supervisor_url[-4:]: |
|
|
|
transport = SupervisorTransport(self.supervisord_user, self.supervisord_pass, self.supervisord_url) |
|
|
|
s = ServerProxy(self.supervisor_url) |
|
|
|
s = ServerProxy('http://127.0.0.1', transport=transport) |
|
|
|
else: |
|
|
|
|
|
|
|
s = ServerProxy('http://', transport=UnixStreamTransport(self.supervisor_url)) |
|
|
|
|
|
|
|
return s |
|
|
|
return s |
|
|
|
|
|
|
|
|
|
|
|
def get_pid(self, program, kind, pid_file): |
|
|
|
def get_pid(self, program, kind, pid_file): |
|
|
@ -206,7 +192,7 @@ class HealthCheck(object): |
|
|
|
获取进程pid |
|
|
|
获取进程pid |
|
|
|
:param program: |
|
|
|
:param program: |
|
|
|
:param kind: |
|
|
|
:param kind: |
|
|
|
:param args: |
|
|
|
:param pid_file: |
|
|
|
:return: |
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
pid = 0 |
|
|
|
pid = 0 |
|
|
@ -214,7 +200,7 @@ class HealthCheck(object): |
|
|
|
|
|
|
|
|
|
|
|
if kind == 'supervisor': |
|
|
|
if kind == 'supervisor': |
|
|
|
try: |
|
|
|
try: |
|
|
|
s = self.get_supervisor_conn() |
|
|
|
s = self.get_supervisord_conn() |
|
|
|
info = s.supervisor.getProcessInfo(program) |
|
|
|
info = s.supervisor.getProcessInfo(program) |
|
|
|
pid = info.get('pid') |
|
|
|
pid = info.get('pid') |
|
|
|
err = info.get('description') |
|
|
|
err = info.get('description') |
|
|
@ -249,7 +235,10 @@ class HealthCheck(object): |
|
|
|
def log(self, program, msg, *args): |
|
|
|
def log(self, program, msg, *args): |
|
|
|
""" |
|
|
|
""" |
|
|
|
写信息到 STDERR. |
|
|
|
写信息到 STDERR. |
|
|
|
:param str msg: string message. |
|
|
|
:param program: |
|
|
|
|
|
|
|
:param msg: |
|
|
|
|
|
|
|
:param args: |
|
|
|
|
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
curr_dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
|
curr_dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
@ -271,6 +260,7 @@ class HealthCheck(object): |
|
|
|
failureThreshold = config.get('failureThreshold', self.failureThreshold) |
|
|
|
failureThreshold = config.get('failureThreshold', self.failureThreshold) |
|
|
|
successThreshold = config.get('successThreshold', self.successThreshold) |
|
|
|
successThreshold = config.get('successThreshold', self.successThreshold) |
|
|
|
initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds) |
|
|
|
initialDelaySeconds = config.get('initialDelaySeconds', self.initialDelaySeconds) |
|
|
|
|
|
|
|
sendResolved = config.get('sendResolved', self.sendResolved) |
|
|
|
action_type = config.get('action', 'restart') |
|
|
|
action_type = config.get('action', 'restart') |
|
|
|
action_exec_cmd = config.get('execCmd') |
|
|
|
action_exec_cmd = config.get('execCmd') |
|
|
|
|
|
|
|
|
|
|
@ -298,7 +288,7 @@ class HealthCheck(object): |
|
|
|
# self.log(program, '%s check state: %s', check_type, json.dumps(check_state[program])) |
|
|
|
# self.log(program, '%s check state: %s', check_type, json.dumps(check_state[program])) |
|
|
|
if check_state[program]['periodSeconds'] % periodSeconds == 0: |
|
|
|
if check_state[program]['periodSeconds'] % periodSeconds == 0: |
|
|
|
check_result = check_method(config) |
|
|
|
check_result = check_method(config) |
|
|
|
check_status = check_result.get('status', 'unknow') |
|
|
|
check_status = check_result.get('status', None) |
|
|
|
check_info = check_result.get('info', '') |
|
|
|
check_info = check_result.get('info', '') |
|
|
|
self.log(program, '%s check: info(%s) state(%s)', check_type.upper(), check_info, check_status) |
|
|
|
self.log(program, '%s check: info(%s) state(%s)', check_type.upper(), check_info, check_status) |
|
|
|
|
|
|
|
|
|
|
@ -309,6 +299,18 @@ class HealthCheck(object): |
|
|
|
|
|
|
|
|
|
|
|
# 先判断成功次数 |
|
|
|
# 先判断成功次数 |
|
|
|
if check_state[program]['success'] >= successThreshold: |
|
|
|
if check_state[program]['success'] >= successThreshold: |
|
|
|
|
|
|
|
if sendResolved and check_state[program]['failure'] > 0: |
|
|
|
|
|
|
|
# 只保留通知action |
|
|
|
|
|
|
|
notice_action = ['email', 'wechat'] |
|
|
|
|
|
|
|
send_action = ','.join(list(set(action_type.split(',')) & set(notice_action))) |
|
|
|
|
|
|
|
self.log(program, 'Use %s send resolved.', send_action) |
|
|
|
|
|
|
|
action_param = { |
|
|
|
|
|
|
|
'check_status': check_status, |
|
|
|
|
|
|
|
'action_type': send_action, |
|
|
|
|
|
|
|
'msg': check_result.get('msg', '') |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
self.action(program, **action_param) |
|
|
|
|
|
|
|
|
|
|
|
# 成功后,将项目状态初始化 |
|
|
|
# 成功后,将项目状态初始化 |
|
|
|
check_state[program]['failure'] = 0 |
|
|
|
check_state[program]['failure'] = 0 |
|
|
|
check_state[program]['success'] = 0 |
|
|
|
check_state[program]['success'] = 0 |
|
|
@ -316,13 +318,14 @@ class HealthCheck(object): |
|
|
|
|
|
|
|
|
|
|
|
# 再判断失败次数 |
|
|
|
# 再判断失败次数 |
|
|
|
if check_state[program]['failure'] >= failureThreshold: |
|
|
|
if check_state[program]['failure'] >= failureThreshold: |
|
|
|
# 失败后, 只触发一次action,或者检测错误数是2倍periodSeconds的平方数时触发(避免重启失败导致服务一直不可用) |
|
|
|
# 失败后, 只触发一次action,或者检测错误数可以整除2倍periodSeconds与initialDelaySeconds时触发(避免重启失败导致服务一直不可用) |
|
|
|
if not check_state[program]['action'] or ( |
|
|
|
if not check_state[program]['action'] or ( |
|
|
|
check_state[program]['failure'] != 0 and check_state[program]['failure'] % ( |
|
|
|
check_state[program]['failure'] != 0 and check_state[program]['failure'] % ( |
|
|
|
periodSeconds * 2) == 0): |
|
|
|
(periodSeconds + initialDelaySeconds) * 2) == 0): |
|
|
|
action_param = { |
|
|
|
action_param = { |
|
|
|
'action_type': action_type, |
|
|
|
'action_type': action_type, |
|
|
|
'check_result': check_result.get('msg', ''), |
|
|
|
'check_status': check_status, |
|
|
|
|
|
|
|
'msg': check_result.get('msg', ''), |
|
|
|
'action_exec_cmd': action_exec_cmd |
|
|
|
'action_exec_cmd': action_exec_cmd |
|
|
|
} |
|
|
|
} |
|
|
|
self.action(program, **action_param) |
|
|
|
self.action(program, **action_param) |
|
|
@ -392,7 +395,7 @@ class HealthCheck(object): |
|
|
|
if res.status != httplib.OK: |
|
|
|
if res.status != httplib.OK: |
|
|
|
return {'status': 'failure', 'msg': '[http_check] return code %s' % res.status, 'info': check_info} |
|
|
|
return {'status': 'failure', 'msg': '[http_check] return code %s' % res.status, 'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
return {'status': 'success', 'info': check_info} |
|
|
|
return {'status': 'success', 'msg': '[http_check] return code %s' % res.status, 'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
def tcp_check(self, config): |
|
|
|
def tcp_check(self, config): |
|
|
|
""" |
|
|
|
""" |
|
|
@ -413,7 +416,7 @@ class HealthCheck(object): |
|
|
|
except Exception as e: |
|
|
|
except Exception as e: |
|
|
|
self.log(program, 'TCP: conn error, %s', e) |
|
|
|
self.log(program, 'TCP: conn error, %s', e) |
|
|
|
return {'status': 'failure', 'msg': '[tcp_check] %s' % e, 'info': check_info} |
|
|
|
return {'status': 'failure', 'msg': '[tcp_check] %s' % e, 'info': check_info} |
|
|
|
return {'status': 'success', 'info': check_info} |
|
|
|
return {'status': 'success', 'msg': '[tcp_check] connection succeeded', 'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
def mem_check(self, config): |
|
|
|
def mem_check(self, config): |
|
|
|
""" |
|
|
|
""" |
|
|
@ -440,7 +443,8 @@ class HealthCheck(object): |
|
|
|
return {'status': 'failure', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss), |
|
|
|
return {'status': 'failure', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss), |
|
|
|
'info': check_info} |
|
|
|
'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
return {'status': 'success', 'info': check_info} |
|
|
|
return {'status': 'success', 'msg': '[mem_check] max_rss(%sMB) now_rss(%sMB)' % (max_rss, now_rss), |
|
|
|
|
|
|
|
'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
def cpu_check(self, config): |
|
|
|
def cpu_check(self, config): |
|
|
|
""" |
|
|
|
""" |
|
|
@ -467,34 +471,36 @@ class HealthCheck(object): |
|
|
|
'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu), |
|
|
|
'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu), |
|
|
|
'info': check_info} |
|
|
|
'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
return {'status': 'success', 'info': check_info} |
|
|
|
return {'status': 'success', |
|
|
|
|
|
|
|
'msg': '[cpu_check] max_cpu({max_cpu}%) now_cpu({now}%)'.format(max_cpu=max_cpu, now=now_cpu), |
|
|
|
|
|
|
|
'info': check_info} |
|
|
|
|
|
|
|
|
|
|
|
def action(self, program, **args): |
|
|
|
def action(self, program, **args): |
|
|
|
""" |
|
|
|
""" |
|
|
|
执行动作 |
|
|
|
执行动作 |
|
|
|
:param program: |
|
|
|
:param program: |
|
|
|
:param action_type: |
|
|
|
:param args: |
|
|
|
:param error: |
|
|
|
:return: None |
|
|
|
:return: |
|
|
|
|
|
|
|
""" |
|
|
|
""" |
|
|
|
action_type = args.get('action_type') |
|
|
|
action_type = args.get('action_type') |
|
|
|
check_result = args.get('check_result') |
|
|
|
msg = args.get('msg') |
|
|
|
action_exec_cmd = args.get('action_exec_cmd') |
|
|
|
action_exec_cmd = args.get('action_exec_cmd') |
|
|
|
|
|
|
|
check_status = args.get('check_status') |
|
|
|
|
|
|
|
|
|
|
|
self.log(program, 'Action: %s', action_type) |
|
|
|
self.log(program, 'Action: %s', action_type) |
|
|
|
action_list = action_type.split(',') |
|
|
|
action_list = action_type.split(',') |
|
|
|
|
|
|
|
|
|
|
|
if 'restart' in action_list: |
|
|
|
if 'restart' in action_list: |
|
|
|
restart_result = self.action_supervistor_restart(program) |
|
|
|
restart_result = self.action_supervistor_restart(program) |
|
|
|
check_result += '\r\n Restart:%s' % restart_result |
|
|
|
msg += '\r\n Restart:%s' % restart_result |
|
|
|
elif 'exec' in action_list: |
|
|
|
elif 'exec' in action_list: |
|
|
|
exec_result = self.action_exec(program, action_exec_cmd) |
|
|
|
exec_result = self.action_exec(program, action_exec_cmd) |
|
|
|
check_result += '\r\n Exec:%s' % exec_result |
|
|
|
msg += '\r\n Exec:%s' % exec_result |
|
|
|
|
|
|
|
|
|
|
|
if 'email' in action_list and self.mail_config: |
|
|
|
if 'email' in action_list and self.mail_config: |
|
|
|
self.action_email(program, action_type, check_result) |
|
|
|
self.action_email(program, action_type, msg, check_status) |
|
|
|
if 'wechat' in action_list and self.wechat_config: |
|
|
|
if 'wechat' in action_list and self.wechat_config: |
|
|
|
self.action_wechat(program, action_type, check_result) |
|
|
|
self.action_wechat(program, action_type, msg, check_status) |
|
|
|
|
|
|
|
|
|
|
|
def action_supervistor_restart(self, program): |
|
|
|
def action_supervistor_restart(self, program): |
|
|
|
""" |
|
|
|
""" |
|
|
@ -505,7 +511,7 @@ class HealthCheck(object): |
|
|
|
self.log(program, 'Action: restart') |
|
|
|
self.log(program, 'Action: restart') |
|
|
|
result = 'success' |
|
|
|
result = 'success' |
|
|
|
try: |
|
|
|
try: |
|
|
|
s = self.get_supervisor_conn() |
|
|
|
s = self.get_supervisord_conn() |
|
|
|
info = s.supervisor.getProcessInfo(program) |
|
|
|
info = s.supervisor.getProcessInfo(program) |
|
|
|
except Exception as e: |
|
|
|
except Exception as e: |
|
|
|
result = 'Get %s ProcessInfo Error: %s' % (program, e) |
|
|
|
result = 'Get %s ProcessInfo Error: %s' % (program, e) |
|
|
@ -541,7 +547,7 @@ class HealthCheck(object): |
|
|
|
""" |
|
|
|
""" |
|
|
|
执行系统命令 |
|
|
|
执行系统命令 |
|
|
|
:param program: |
|
|
|
:param program: |
|
|
|
:param exec: |
|
|
|
:param cmd: |
|
|
|
:return: |
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
self.log(program, 'Action: exec') |
|
|
|
self.log(program, 'Action: exec') |
|
|
@ -557,39 +563,45 @@ class HealthCheck(object): |
|
|
|
|
|
|
|
|
|
|
|
return result |
|
|
|
return result |
|
|
|
|
|
|
|
|
|
|
|
def action_email(self, program, action_type, msg): |
|
|
|
def action_email(self, program, action_type, msg, check_status): |
|
|
|
""" |
|
|
|
""" |
|
|
|
发送email |
|
|
|
发送email |
|
|
|
:param subject: str |
|
|
|
:param program: |
|
|
|
:param content: str |
|
|
|
:param action_type: |
|
|
|
:return: bool |
|
|
|
:param msg: |
|
|
|
|
|
|
|
:param check_status: |
|
|
|
|
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
self.log(program, 'Action: email') |
|
|
|
self.log(program, 'Action: email') |
|
|
|
|
|
|
|
|
|
|
|
ip = "" |
|
|
|
ip = "" |
|
|
|
|
|
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) |
|
|
|
try: |
|
|
|
try: |
|
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) |
|
|
|
|
|
|
|
s.connect(('8.8.8.8', 80)) |
|
|
|
s.connect(('8.8.8.8', 80)) |
|
|
|
ip = s.getsockname()[0] |
|
|
|
ip = s.getsockname()[0] |
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
self.log(program, 'Action: email get ip error %s' % e) |
|
|
|
finally: |
|
|
|
finally: |
|
|
|
s.close() |
|
|
|
s.close() |
|
|
|
|
|
|
|
|
|
|
|
hostname = platform.node().split('.')[0] |
|
|
|
hostname = platform.node().split('.')[0] |
|
|
|
system_platform = platform.platform() |
|
|
|
system_platform = platform.platform() |
|
|
|
|
|
|
|
|
|
|
|
subject = "[Supervisor] %s health check Faild" % program |
|
|
|
if check_status == 'success': |
|
|
|
|
|
|
|
subject = "[Supervisor] %s Health check successful" % program |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
subject = "[Supervisor] %s Health check failed" % program |
|
|
|
curr_dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
|
curr_dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
|
content = """ |
|
|
|
content = """ |
|
|
|
DateTime: {curr_dt} |
|
|
|
DateTime: {curr_dt} |
|
|
|
Program: {program} |
|
|
|
Program: {program} |
|
|
|
IP: {ip} |
|
|
|
IP: {ip} |
|
|
|
Hostname: {hostname} |
|
|
|
Hostname: {hostname} |
|
|
|
Platfrom: {system_platform} |
|
|
|
Platfrom: {system_platform} |
|
|
|
Action: {action} |
|
|
|
Action: {action} |
|
|
|
Msg: {msg} |
|
|
|
Msg: {msg} |
|
|
|
""".format(curr_dt=curr_dt, program=program, ip=ip, hostname=hostname, system_platform=system_platform, |
|
|
|
""".format(curr_dt=curr_dt, program=program, ip=ip, hostname=hostname, system_platform=system_platform, |
|
|
|
action=action_type, |
|
|
|
action=action_type, msg=msg) |
|
|
|
msg=msg) |
|
|
|
|
|
|
|
mail_port = self.mail_config.get('port', '') |
|
|
|
mail_port = self.mail_config.get('port', '') |
|
|
|
mail_host = self.mail_config.get('host', '') |
|
|
|
mail_host = self.mail_config.get('host', '') |
|
|
|
mail_user = self.mail_config.get('user', '') |
|
|
|
mail_user = self.mail_config.get('user', '') |
|
|
@ -612,12 +624,13 @@ class HealthCheck(object): |
|
|
|
self.log(program, 'Action: email send success.') |
|
|
|
self.log(program, 'Action: email send success.') |
|
|
|
return True |
|
|
|
return True |
|
|
|
|
|
|
|
|
|
|
|
def action_wechat(self, program, action_type, msg): |
|
|
|
def action_wechat(self, program, action_type, msg, check_status): |
|
|
|
""" |
|
|
|
""" |
|
|
|
微信通知 |
|
|
|
微信通知 |
|
|
|
:param program: |
|
|
|
:param program: |
|
|
|
:param action_type: |
|
|
|
:param action_type: |
|
|
|
:param msg: |
|
|
|
:param msg: |
|
|
|
|
|
|
|
:param check_status: |
|
|
|
:return: |
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
self.log(program, 'Action: wechat') |
|
|
|
self.log(program, 'Action: wechat') |
|
|
@ -651,10 +664,12 @@ class HealthCheck(object): |
|
|
|
send_url = '/cgi-bin/message/send?access_token={token}'.format(token=token) |
|
|
|
send_url = '/cgi-bin/message/send?access_token={token}'.format(token=token) |
|
|
|
|
|
|
|
|
|
|
|
ip = "" |
|
|
|
ip = "" |
|
|
|
|
|
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) |
|
|
|
try: |
|
|
|
try: |
|
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) |
|
|
|
|
|
|
|
s.connect(('8.8.8.8', 80)) |
|
|
|
s.connect(('8.8.8.8', 80)) |
|
|
|
ip = s.getsockname()[0] |
|
|
|
ip = s.getsockname()[0] |
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
self.log(program, 'Action: wechat get ip error %s' % e) |
|
|
|
finally: |
|
|
|
finally: |
|
|
|
s.close() |
|
|
|
s.close() |
|
|
|
|
|
|
|
|
|
|
@ -662,17 +677,22 @@ class HealthCheck(object): |
|
|
|
system_platform = platform.platform() |
|
|
|
system_platform = platform.platform() |
|
|
|
|
|
|
|
|
|
|
|
curr_dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
|
curr_dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
|
title = "<font color=\"warning\">[Supervisor] %s health check Faild</font>" % program |
|
|
|
|
|
|
|
|
|
|
|
if check_status == 'success': |
|
|
|
content = title \ |
|
|
|
title = "<font color=\"info\">[Supervisor] %s Health check successful</font>" % program |
|
|
|
+ "\n> **详情信息**" \ |
|
|
|
else: |
|
|
|
+ "\n> DataTime: " + curr_dt \ |
|
|
|
title = "<font color=\"warning\">[Supervisor] %s Health check failed</font>" % program |
|
|
|
+ "\n> Program: <font color=\"warning\">%s</font>" % program \ |
|
|
|
|
|
|
|
+ "\n> IP: " + ip \ |
|
|
|
content = "{title}\n \ |
|
|
|
+ "\n> Hostname: " + hostname \ |
|
|
|
> **详情信息**\n \ |
|
|
|
+ "\n> Platfrom: " + system_platform \ |
|
|
|
> DataTime: {curr_dt}\n \ |
|
|
|
+ "\n> Action: " + action_type \ |
|
|
|
> Program: <font color=\"warning\">{program}</font>\n \ |
|
|
|
+ "\n> Msg: " + str(msg) |
|
|
|
> IP: {ip}\n \ |
|
|
|
|
|
|
|
> Hostname: {hostname}\n \ |
|
|
|
|
|
|
|
> Platfrom: {platfrom}\n \ |
|
|
|
|
|
|
|
> Action: {action}\n \ |
|
|
|
|
|
|
|
> Msg: {msg}".format(title=title, curr_dt=curr_dt, program=program, ip=ip, hostname=hostname, |
|
|
|
|
|
|
|
platfrom=system_platform, action=action_type, msg=msg) |
|
|
|
|
|
|
|
|
|
|
|
data = { |
|
|
|
data = { |
|
|
|
"msgtype": 'markdown', |
|
|
|
"msgtype": 'markdown', |
|
|
@ -712,9 +732,6 @@ class HealthCheck(object): |
|
|
|
:return: |
|
|
|
:return: |
|
|
|
""" |
|
|
|
""" |
|
|
|
self.log('healthCheck:', 'start') |
|
|
|
self.log('healthCheck:', 'start') |
|
|
|
s = self.get_supervisor_conn() |
|
|
|
|
|
|
|
info = s.supervisor.getProcessInfo('api-paibloks-show') |
|
|
|
|
|
|
|
print(info) |
|
|
|
|
|
|
|
threads = [] |
|
|
|
threads = [] |
|
|
|
|
|
|
|
|
|
|
|
for key, value in iteritems(self.program_config): |
|
|
|
for key, value in iteritems(self.program_config): |
|
|
@ -723,18 +740,32 @@ class HealthCheck(object): |
|
|
|
t = threading.Thread(target=self.check, args=(item,)) |
|
|
|
t = threading.Thread(target=self.check, args=(item,)) |
|
|
|
threads.append(t) |
|
|
|
threads.append(t) |
|
|
|
for t in threads: |
|
|
|
for t in threads: |
|
|
|
|
|
|
|
t.setDaemon(True) |
|
|
|
t.start() |
|
|
|
t.start() |
|
|
|
for t in threads: |
|
|
|
|
|
|
|
t.join() |
|
|
|
while True: |
|
|
|
|
|
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__': |
|
|
|
if __name__ == '__main__': |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 信号处理 |
|
|
|
|
|
|
|
def sig_handler(signum, frame): |
|
|
|
|
|
|
|
print("Exit check!") |
|
|
|
|
|
|
|
sys.exit(0) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
signal.signal(signal.SIGINT, sig_handler) |
|
|
|
|
|
|
|
signal.signal(signal.SIGTERM, sig_handler) |
|
|
|
|
|
|
|
signal.signal(signal.SIGQUIT, sig_handler) |
|
|
|
|
|
|
|
|
|
|
|
# 获取当前目录下的配置文件,没有的话就生成个模板 |
|
|
|
# 获取当前目录下的配置文件,没有的话就生成个模板 |
|
|
|
config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.yaml') |
|
|
|
config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.yaml') |
|
|
|
if not os.path.exists(config_file): |
|
|
|
if not os.path.exists(config_file): |
|
|
|
example_config = """ |
|
|
|
example_config = """ |
|
|
|
config: # 脚本配置名称,请勿更改 |
|
|
|
config: # 脚本配置名称,请勿更改 |
|
|
|
# supervisordUrl: http://localhost:9001/RPC2 # supervisor的接口地址, 默认使用本地socker文件/var/run/supervisor.sock |
|
|
|
# supervisordUrl: http://localhost:9001/RPC2 # supervisor的接口地址, 默认使用本地socket文件unix:///var/run/supervisor.sock |
|
|
|
|
|
|
|
# supervisordUser: user # supervisor中设置的username, 没有设置可不填 |
|
|
|
|
|
|
|
# supervisordPass: pass # supervisor中设置的password, 没有设置可不填 |
|
|
|
# mail: # stmp配置 |
|
|
|
# mail: # stmp配置 |
|
|
|
# host: 'smtp.test.com' |
|
|
|
# host: 'smtp.test.com' |
|
|
|
# port': '465' |
|
|
|
# port': '465' |
|
|
@ -762,6 +793,7 @@ cat1: # supervisor中配置的program名称 |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
|
|
|
|
sendResolved: True # 是否发送恢复通知,仅用作于email,wechat. 默认: False |
|
|
|
|
|
|
|
|
|
|
|
# cpu方式监控 |
|
|
|
# cpu方式监控 |
|
|
|
cat2: # supervisor中配置的program名称 |
|
|
|
cat2: # supervisor中配置的program名称 |
|
|
@ -775,6 +807,7 @@ cat2: # supervisor中配置的program名称 |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
|
|
|
|
sendResolved: True # 是否发送恢复通知,仅用作于email,wechat. 默认: False |
|
|
|
|
|
|
|
|
|
|
|
# HTTP方式监控 |
|
|
|
# HTTP方式监控 |
|
|
|
cat3: |
|
|
|
cat3: |
|
|
@ -794,6 +827,7 @@ cat3: |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
|
|
|
|
sendResolved: True # 是否发送恢复通知,仅用作于email,wechat. 默认: False |
|
|
|
|
|
|
|
|
|
|
|
# TCP方式监控 |
|
|
|
# TCP方式监控 |
|
|
|
cat4: |
|
|
|
cat4: |
|
|
@ -807,6 +841,7 @@ cat4: |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
successThreshold: 2 # 失败后检查成功的最小连续成功次数, 默认:1 |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
action: restart,email # 触发的动作: restart,exec,email,wechat (restart和exec互斥,同时设置时restart生效) 默认: restart |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
execCmd: command # action exec 的执行命令 |
|
|
|
|
|
|
|
sendResolved: True # 是否发送恢复通知,仅用作于email,wechat. 默认: False |
|
|
|
""" |
|
|
|
""" |
|
|
|
with open(config_file, 'w') as f: |
|
|
|
with open(config_file, 'w') as f: |
|
|
|
f.write(example_config) |
|
|
|
f.write(example_config) |
|
|
@ -816,11 +851,7 @@ cat4: |
|
|
|
sys.exit(0) |
|
|
|
sys.exit(0) |
|
|
|
|
|
|
|
|
|
|
|
with open(config_file) as f: |
|
|
|
with open(config_file) as f: |
|
|
|
config = yaml.load(f) |
|
|
|
config = yaml.safe_load(f) |
|
|
|
|
|
|
|
|
|
|
|
check = HealthCheck(config) |
|
|
|
check = HealthCheck(config) |
|
|
|
print(dir(check)) |
|
|
|
|
|
|
|
# s = check.get_supervisor_conn() |
|
|
|
|
|
|
|
# info = s.supervisor.getProcessInfo('api-paibloks-show') |
|
|
|
|
|
|
|
# print(info) |
|
|
|
|
|
|
|
check.start() |
|
|
|
check.start() |
|
|
|