# mirror of https://github.com/lework/script
import boto3
import json

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch')
sns_arn = "sns——arn"  # placeholder: the SNS topic ARN that receives alarm notifications
region = "cn-north-1"

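# Illustrative notes (assumptions, not part of the original script):
#   - sns_arn is expected to hold a full SNS topic ARN, e.g. something like
#     "arn:aws-cn:sns:cn-north-1:123456789012:ops-alarms".
#   - The boto3 clients use the Lambda execution region; "region" is only embedded in the
#     dashboard bodies below. To pin a client explicitly one could instead write:
#         cloudwatch = boto3.client('cloudwatch', region_name=region)
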
def add_elb_alarm(instance_arn, instance_name=None):
    print("[Add elb alarm.]")

    elb_client = boto3.client('elbv2')

    instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn)
    arn = instance_arn.split(':loadbalancer/')[1]
    print(instance)
    targetgroup_list = []
    for listener in instance.get('Listeners'):
        for action in listener['DefaultActions']:
            print(action['TargetGroupArn'])

            targetgroup_arn = action['TargetGroupArn']
            targetgroup = targetgroup_arn.split(':')[-1]
            targetgroup_list.append(targetgroup)

    print("[ActiveFlowCount: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average connection count >= 6000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_ELB_%s_ActiveFlowCount' % instance_name,
        AlarmDescription='Alarm when ELB ActiveFlowCount exceeds 6000',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='ActiveFlowCount',
        Namespace='AWS/NetworkELB',  # ActiveFlowCount/NewFlowCount/ProcessedBytes are NLB metrics, same namespace as the dashboards below
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'LoadBalancer',
                'Value': '%s' % arn
            }
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=6000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NewFlowCount: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average new connection count >= 1000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_ELB_%s_NewFlowCount' % instance_name,
        AlarmDescription='Alarm when ELB NewFlowCount exceeds 1000',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NewFlowCount',
        Namespace='AWS/NetworkELB',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'LoadBalancer',
                'Value': '%s' % arn
            }
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=1000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[ProcessedBytes: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_ELB_%s_ProcessedBytes' % instance_name,
        AlarmDescription='Alarm when ELB ProcessedBytes exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='ProcessedBytes',
        Namespace='AWS/NetworkELB',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'LoadBalancer',
                'Value': '%s' % arn  # arn already carries the 'net/...' prefix taken from the load balancer ARN
            }
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    for targetgroup in set(targetgroup_list):
        targetgroup_name = targetgroup.split('/')[1]

        print("[HealthyHostCount: collected every 1 minute, 1-minute period, alarm when 1 of 1 datapoint in 1 minute breaches the threshold, maximum healthy host count <= 0]")
        response = cloudwatch.put_metric_alarm(
            AlarmName='AWS_ELB_%s_%s_HealthyHostCount' % (instance_name, targetgroup_name),
            AlarmDescription='Alarm when ELB HealthyHostCount is less than or equal to 0',
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName='HealthyHostCount',
            Namespace='AWS/NetworkELB',
            Statistic='Maximum',
            Dimensions=[
                {
                    'Name': 'TargetGroup',
                    'Value': '%s' % targetgroup
                },
                {
                    'Name': 'LoadBalancer',
                    'Value': '%s' % arn
                }
            ],
            Period=60,
            EvaluationPeriods=1,
            DatapointsToAlarm=1,
            Threshold=0,
            ComparisonOperator='LessThanOrEqualToThreshold',
            TreatMissingData='notBreaching'
        )
        print(response)

        print("[UnHealthyHostCount: collected every 1 minute, 1-minute period, alarm when 1 of 1 datapoint in 1 minute breaches the threshold, unhealthy host count >= 1]")
        response = cloudwatch.put_metric_alarm(
            AlarmName='AWS_ELB_%s_%s_UnHealthyHostCount' % (instance_name, targetgroup_name),
            AlarmDescription='Alarm when ELB UnHealthyHostCount exceeds 1',
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName='UnHealthyHostCount',
            Namespace='AWS/NetworkELB',
            Statistic='Average',
            Dimensions=[
                {
                    'Name': 'TargetGroup',
                    'Value': '%s' % targetgroup
                },
                {
                    'Name': 'LoadBalancer',
                    'Value': '%s' % arn
                }
            ],
            Period=60,
            EvaluationPeriods=1,
            DatapointsToAlarm=1,
            Threshold=1,
            ComparisonOperator='GreaterThanOrEqualToThreshold',
            TreatMissingData='notBreaching'
        )
        print(response)

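# Illustrative call (hypothetical NLB ARN and name, not from the original script):
#   add_elb_alarm(
#       'arn:aws-cn:elasticloadbalancing:cn-north-1:123456789012:'
#       'loadbalancer/net/my-nlb/50dc6c495c0c9188',
#       'my-nlb')
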
def add_elb_http_alarm(instance_group, instance_arn, port=80, instance_name=None):
    # Expects the ALB identifier without the 'app/' prefix and the target group without the
    # 'targetgroup/' prefix; this helper is not wired into lambda_handler below.
    print("[Add elb http %s alarm.]" % instance_group)
    print("[HTTPCode_Target_5XX_Count: collected every 1 minute, 1-minute period, alarm when 1 of 1 datapoint in 1 minute breaches the threshold, more than 10 5xx responses]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_ELB_%s_%s_HTTPCode_Target_5XX_Count' % (instance_name, port),
        AlarmDescription='Alarm when ELB HTTPCode_Target_5XX_Count exceeds 10',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='HTTPCode_Target_5XX_Count',
        Namespace='AWS/ApplicationELB',
        Statistic='Sum',
        Dimensions=[
            {
                'Name': 'TargetGroup',
                'Value': 'targetgroup/%s' % instance_group
            },
            {
                'Name': 'LoadBalancer',
                'Value': 'app/%s' % instance_arn
            }
        ],
        Period=60,
        EvaluationPeriods=1,
        DatapointsToAlarm=1,
        Threshold=10,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[HTTPCode_Target_4XX_Count: collected every 1 minute, 1-minute period, alarm when 3 datapoints in 5 minutes breach the threshold, more than 10 4xx responses]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_ELB_%s_%s_HTTPCode_Target_4XX_Count' % (instance_name, port),
        AlarmDescription='Alarm when ELB HTTPCode_Target_4XX_Count exceeds 10',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='HTTPCode_Target_4XX_Count',
        Namespace='AWS/ApplicationELB',
        Statistic='Sum',
        Dimensions=[
            {
                'Name': 'TargetGroup',
                'Value': 'targetgroup/%s' % instance_group
            },
            {
                'Name': 'LoadBalancer',
                'Value': 'app/%s' % instance_arn
            }
        ],
        Period=60,
        EvaluationPeriods=5,
        DatapointsToAlarm=3,
        Threshold=10,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

def add_ec2_alarm(instance_id, instance_name=None):
    print("Add ec2 %s alarm." % instance_id)
    print("[CPUUtilization: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 80%]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_EC2_%s_CPUUtilization' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when server CPU exceeds 80%',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='CPUUtilization',
        Namespace='AWS/EC2',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'InstanceId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        Unit='Percent',
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=80.0,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NetworkIn: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_EC2_%s_NetworkIn' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when server NetworkIn exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NetworkIn',
        Namespace='AWS/EC2',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'InstanceId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NetworkOut: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_EC2_%s_NetworkOut' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when server NetworkOut exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NetworkOut',
        Namespace='AWS/EC2',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'InstanceId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[StatusCheckFailed_System: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 1]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_EC2_%s_StatusCheckFailed_System' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when server system status check fails',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='StatusCheckFailed_System',
        Namespace='AWS/EC2',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'InstanceId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=1.0,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

def add_ec2_ebs_alarm(instance_id, instance_name=None):
    print("Add ec2 ebs %s alarm." % instance_id)

    ec2d = boto3.resource('ec2')
    instance = ec2d.Instance(instance_id)
    vol_id = instance.volumes.all()  # all EBS volumes attached to the instance
    print(vol_id)
    for v in vol_id:
        print("[Found EBS volume %s on instance %s]" % (v.id, instance_id))
        print("[VolumeIdleTime: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 30]")
        response = cloudwatch.put_metric_alarm(
            AlarmName='AWS_EC2_%s_EBS_%s_VolumeIdleTime' % (instance_name if instance_name else instance_id, v.id),
            AlarmDescription='Alarm when EBS VolumeIdleTime exceeds 30',
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName='VolumeIdleTime',
            Namespace='AWS/EBS',
            Statistic='Average',
            Dimensions=[
                {
                    'Name': 'VolumeId',
                    'Value': '%s' % v.id
                },
            ],
            Period=60,
            EvaluationPeriods=3,
            DatapointsToAlarm=3,
            Threshold=30.0,
            ComparisonOperator='GreaterThanOrEqualToThreshold',
            TreatMissingData='notBreaching'
        )
        print(response)

def add_redis_alarm(instance_id, instance_name=None):
    print("[Add redis %s alarm.]" % instance_id)
    print("[CPUUtilization: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 80%]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_CPUUtilization' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis CPU exceeds 80%',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='CPUUtilization',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        Unit='Percent',
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=80.0,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[EngineCPUUtilization: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 80%]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_EngineCPUUtilization' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis Engine CPU exceeds 80%',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='EngineCPUUtilization',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        Unit='Percent',
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=80.0,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[CurrConnections: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 500]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_CurrConnections' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis connections exceed 500',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='CurrConnections',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=500,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[FreeableMemory: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average <= 1G]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_FreeableMemory' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis FreeableMemory is less than 1G',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='FreeableMemory',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=1000000000,
        ComparisonOperator='LessThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NetworkBytesIn: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_NetworkBytesIn' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis NetworkBytesIn exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NetworkBytesIn',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NetworkBytesOut: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_NetworkBytesOut' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis NetworkBytesOut exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NetworkBytesOut',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[CacheMisses: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_REDIS_%s_CacheMisses' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when redis CacheMisses exceeds 5000',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='CacheMisses',
        Namespace='AWS/ElastiCache',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'CacheClusterId',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

def add_mysql_alarm(instance_id, instance_name=None):
    print("[Add mysql %s alarm.]" % instance_id)
    print("[CPUUtilization: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 80%]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_MYSQL_%s_CPUUtilization' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when mysql CPUUtilization exceeds 80%',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='CPUUtilization',
        Namespace='AWS/RDS',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        Unit='Percent',
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=80.0,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[DatabaseConnections: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 500]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_MYSQL_%s_DatabaseConnections' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when mysql DatabaseConnections exceeds 500',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='DatabaseConnections',
        Namespace='AWS/RDS',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=500,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[FreeableMemory: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average <= 1g]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_MYSQL_%s_FreeableMemory' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when mysql FreeableMemory less than 1g',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='FreeableMemory',
        Namespace='AWS/RDS',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=1000000000,
        ComparisonOperator='LessThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[FreeStorageSpace: collected every 5 minutes, 5-minute period, alarm when 3 datapoints in 15 minutes breach the threshold, average <= 10g]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_MYSQL_%s_FreeStorageSpace' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when mysql FreeStorageSpace less than 10g',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='FreeStorageSpace',
        Namespace='AWS/RDS',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': '%s' % instance_id
            },
        ],
        Period=300,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=10000000000,
        ComparisonOperator='LessThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NetworkTransmitThroughput: collected every 1 minute, 1-minute period, alarm when 3 of 3 datapoints in 3 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_MYSQL_%s_NetworkTransmitThroughput' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when mysql NetworkTransmitThroughput exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NetworkTransmitThroughput',
        Namespace='AWS/RDS',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': '%s' % instance_id
            },
        ],
        Period=60,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

    print("[NetworkReceiveThroughput: collected every 5 minutes, 5-minute period, alarm when 3 datapoints in 15 minutes breach the threshold, average >= 5,000,000]")
    response = cloudwatch.put_metric_alarm(
        AlarmName='AWS_MYSQL_%s_NetworkReceiveThroughput' % (instance_name if instance_name else instance_id),
        AlarmDescription='Alarm when mysql NetworkReceiveThroughput exceeds 5m',
        ActionsEnabled=True,
        OKActions=[sns_arn],
        AlarmActions=[sns_arn],
        MetricName='NetworkReceiveThroughput',
        Namespace='AWS/RDS',
        Statistic='Average',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': '%s' % instance_id
            },
        ],
        Period=300,
        EvaluationPeriods=3,
        DatapointsToAlarm=3,
        Threshold=5000000,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        TreatMissingData='notBreaching'
    )
    print(response)

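# The alarm blocks above share most of their parameters; a thin wrapper such as this sketch
# (illustrative only, the functions in this script do not use it) can express each alarm in a
# few lines:
#
# def put_alarm(name, metric, namespace, dimensions, threshold, comparison,
#               statistic='Average', period=60, datapoints=3, unit=None):
#     kwargs = dict(
#         AlarmName=name, ActionsEnabled=True, OKActions=[sns_arn], AlarmActions=[sns_arn],
#         MetricName=metric, Namespace=namespace, Statistic=statistic, Dimensions=dimensions,
#         Period=period, EvaluationPeriods=datapoints, DatapointsToAlarm=datapoints,
#         Threshold=threshold, ComparisonOperator=comparison, TreatMissingData='notBreaching')
#     if unit:
#         kwargs['Unit'] = unit
#     return cloudwatch.put_metric_alarm(**kwargs)
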
def add_ec2_dashboard(instance_id, instance_name=None):
    print("[add ec2 %s dashboard]" % (instance_name if instance_name else instance_id))

    body = {"widgets": [
        {
            "type": "metric",
            "x": 6,
            "y": 0,
            "width": 6,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": False,
                "metrics": [
                    ["AWS/EC2", "NetworkIn", "InstanceId", instance_id],
                    [".", "NetworkOut", ".", "."]
                ],
                "region": region,
                "title": "Network"
            }
        },
        {
            "type": "metric",
            "x": 0,
            "y": 0,
            "width": 6,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": False,
                "metrics": [
                    ["AWS/EC2", "CPUUtilization", "InstanceId", instance_id],
                ],
                "region": region,
                "title": "CPUUtilization"
            }
        },
        {
            "type": "metric",
            "x": 12,
            "y": 0,
            "width": 6,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": False,
                "metrics": [
                    ["AWS/EC2", "EBSWriteOps", "InstanceId", instance_id],
                    [".", "EBSReadOps", ".", "."]
                ],
                "region": region,
                "title": "EBSOps"
            }
        },
        {
            "type": "metric",
            "x": 18,
            "y": 0,
            "width": 6,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": False,
                "metrics": [
                    ["AWS/EC2", "EBSIOBalance%", "InstanceId", instance_id],
                    [".", "EBSByteBalance%", ".", "."]
                ],
                "region": region,
                "title": "EBSBalance"
            }
        }
    ]}
    response = cloudwatch.put_dashboard(
        DashboardName='AWS_EC2_%s' % (instance_name if instance_name else instance_id),
        DashboardBody=json.dumps(body)
    )
    print(response)

def add_mysql_dashboard(instance_id, instance_name=None):
    print("[add mysql %s dashboard]" % (instance_name if instance_name else instance_id))

    body = {
        "widgets": [
            {
                "type": "metric",
                "x": 0,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": [
                        ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", instance_id]
                    ],
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": [
                        ["AWS/RDS", "DatabaseConnections", "DBInstanceIdentifier", instance_id]
                    ],
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 0,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": [
                        ["AWS/RDS", "ReadLatency", "DBInstanceIdentifier", instance_id],
                        [".", "WriteLatency", ".", "."]
                    ],
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 18,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": [
                        ["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", instance_id],
                        [".", "ReadIOPS", ".", "."]
                    ],
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 6,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": [
                        ["AWS/RDS", "NetworkTransmitThroughput", "DBInstanceIdentifier", instance_id],
                        [".", "NetworkReceiveThroughput", ".", "."]
                    ],
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": [
                        ["AWS/RDS", "FreeStorageSpace", "DBInstanceIdentifier", instance_id],
                        [".", "BinLogDiskUsage", ".", "."]
                    ],
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 6,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": [
                        ["AWS/RDS", "FreeableMemory", "DBInstanceIdentifier", instance_id]
                    ],
                    "region": region
                }
            }
        ]
    }
    response = cloudwatch.put_dashboard(
        DashboardName='AWS_MYSQL_%s' % (instance_name if instance_name else instance_id),
        DashboardBody=json.dumps(body)
    )
    print(response)

def add_redis_dashboard(clusters, group_id):
    print("[add redis %s dashboard]" % group_id)

    EngineCPUUtilization_metrics = []
    CurrConnections_metrics = []
    FreeableMemory_metrics = []
    NetworkBytes_metrics = []
    CacheHits_metrics = []
    CacheMisses_metrics = []
    CPUUtilization_metrics = []
    IsMaster_metrics = []
    NewConnections_metrics = []
    StringBasedCmds_metrics = []
    BytesUsedForCache_metrics = []
    ReplicationBytes_metrics = []

    for c in clusters:
        EngineCPUUtilization_metrics.append(["AWS/ElastiCache", "EngineCPUUtilization", "CacheClusterId", "%s" % c])
        CurrConnections_metrics.append(["AWS/ElastiCache", "CurrConnections", "CacheClusterId", "%s" % c])
        FreeableMemory_metrics.append(["AWS/ElastiCache", "FreeableMemory", "CacheClusterId", "%s" % c])
        NetworkBytes_metrics.append(["AWS/ElastiCache", "NetworkBytesIn", "CacheClusterId", "%s" % c])
        NetworkBytes_metrics.append(["AWS/ElastiCache", "NetworkBytesOut", "CacheClusterId", "%s" % c])
        CacheHits_metrics.append(["AWS/ElastiCache", "CacheHits", "CacheClusterId", "%s" % c])
        CacheMisses_metrics.append(["AWS/ElastiCache", "CacheMisses", "CacheClusterId", "%s" % c])
        CPUUtilization_metrics.append(["AWS/ElastiCache", "CPUUtilization", "CacheClusterId", "%s" % c])
        IsMaster_metrics.append(["AWS/ElastiCache", "IsMaster", "CacheClusterId", "%s" % c])
        NewConnections_metrics.append(["AWS/ElastiCache", "NewConnections", "CacheClusterId", "%s" % c])
        StringBasedCmds_metrics.append(["AWS/ElastiCache", "StringBasedCmds", "CacheClusterId", "%s" % c])
        BytesUsedForCache_metrics.append(["AWS/ElastiCache", "BytesUsedForCache", "CacheClusterId", "%s" % c])
        ReplicationBytes_metrics.append(["AWS/ElastiCache", "ReplicationBytes", "CacheClusterId", "%s" % c])

    body = {
        "widgets": [
            {
                "type": "metric",
                "x": 6,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "metrics": EngineCPUUtilization_metrics,
                    "view": "timeSeries",
                    "stacked": True,
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "metrics": CurrConnections_metrics,
                    "view": "timeSeries",
                    "stacked": True,
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": FreeableMemory_metrics,
                    "region": region
                }
            },
            {
                "type": "metric",
                "x": 0,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": True,
                    "metrics": NetworkBytes_metrics,
                    "region": region,
                    "title": "NetworkBytes"
                }
            },
            {
                "type": "metric",
                "x": 0,
                "y": 12,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": CacheHits_metrics,
                    "region": region,
                    "title": "CacheHits"
                }
            },
            {
                "type": "metric",
                "x": 6,
                "y": 12,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": CacheMisses_metrics,
                    "region": region,
                    "title": "CacheMisses"
                }
            },
            {
                "type": "metric",
                "x": 0,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": CPUUtilization_metrics,
                    "region": region,
                    "title": "CPUUtilization"
                }
            },
            {
                "type": "metric",
                "x": 18,
                "y": 0,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": IsMaster_metrics,
                    "region": region,
                    "title": "IsMaster",
                    "period": 300
                }
            },
            {
                "type": "metric",
                "x": 6,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": NewConnections_metrics,
                    "region": region,
                    "title": "NewConnections"
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 12,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": StringBasedCmds_metrics,
                    "region": region,
                    "title": "StringBasedCmds"
                }
            },
            {
                "type": "metric",
                "x": 18,
                "y": 6,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": BytesUsedForCache_metrics,
                    "region": region,
                    "title": "BytesUsedForCache"
                }
            },
            {
                "type": "metric",
                "x": 18,
                "y": 12,
                "width": 6,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": ReplicationBytes_metrics,
                    "region": region,
                    "title": "ReplicationBytes"
                }
            }
        ]
    }

    response = cloudwatch.put_dashboard(
        DashboardName='AWS_REDIS_%s' % group_id,
        DashboardBody=json.dumps(body)
    )
    print(response)

def add_elb_dashboard(instance_arn, instance_name):
    print("[add elb %s dashboard]" % instance_name)
    elb_client = boto3.client('elbv2')

    instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn)
    arn = instance_arn.split(':loadbalancer/')[1]

    targetgroup_list = []
    for listener in instance.get('Listeners'):
        for action in listener['DefaultActions']:
            print(action['TargetGroupArn'])

            targetgroup_arn = action['TargetGroupArn']
            targetgroup = targetgroup_arn.split(':')[-1]
            targetgroup_list.append(targetgroup)

    HealthyHost_metrics = []

    for targetgroup in set(targetgroup_list):
        HealthyHost_metrics.append(["AWS/NetworkELB", "UnHealthyHostCount", "TargetGroup",
                                    targetgroup, "LoadBalancer", arn])
        HealthyHost_metrics.append(["AWS/NetworkELB", "HealthyHostCount", "TargetGroup",
                                    targetgroup, "LoadBalancer", arn])

    body = {
        "widgets": [
            {
                "type": "metric",
                "x": 0,
                "y": 0,
                "width": 12,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": [
                        ["AWS/NetworkELB", "ProcessedBytes", "LoadBalancer", "%s" % arn]
                    ],
                    "region": region,
                    "title": "ProcessedBytes"
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 0,
                "width": 12,
                "height": 6,
                "properties": {
                    "metrics": [
                        ["AWS/NetworkELB", "ActiveFlowCount", "LoadBalancer", "%s" % arn],
                        [".", "NewFlowCount", ".", "."],
                    ],
                    "view": "timeSeries",
                    "stacked": False,
                    "region": region,
                    "title": "FlowCount"
                }
            },
            {
                "type": "metric",
                "x": 0,
                "y": 6,
                "width": 12,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": [
                        ["AWS/NetworkELB", "TCP_Client_Reset_Count", "LoadBalancer", "%s" % arn],
                        [".", "TCP_ELB_Reset_Count", ".", "."],
                        [".", "TCP_Target_Reset_Count", ".", "."]
                    ],
                    "region": region,
                    "title": "TCP_Reset"
                }
            },
            {
                "type": "metric",
                "x": 12,
                "y": 6,
                "width": 12,
                "height": 6,
                "properties": {
                    "view": "timeSeries",
                    "stacked": False,
                    "metrics": HealthyHost_metrics,
                    "region": region,
                    "title": "HealthyHost"
                }
            }
        ]
    }

    response = cloudwatch.put_dashboard(
        DashboardName='AWS_ELB_%s' % instance_name,
        DashboardBody=json.dumps(body)
    )
    print(response)

def lambda_handler(event, context):
    # TODO implement

    print(event)
    detail = event.get('detail', {})
    event_source = detail.get('eventSource')
    event_name = detail.get('eventName')
    event_response = detail.get('responseElements', {})

    if event_source == 'ec2.amazonaws.com':
        if event_name == 'RunInstances':
            instances = event_response.get('instancesSet', {})
            for item in instances.get('items', []):
                print(item)
                instance_id = item.get('instanceId')
                if instance_id:
                    add_ec2_alarm(instance_id)
                    add_ec2_ebs_alarm(instance_id)
                    add_ec2_dashboard(instance_id)

    elif event_source == 'rds.amazonaws.com':
        if event_name == 'CreateDBInstance':
            dBInstanceArn = event_response.get('dBInstanceArn')
            dBInstanceIdentifier = event_response.get('dBInstanceIdentifier')
            engine = event_response.get('engine')
            if engine == 'mysql':
                print(dBInstanceArn, dBInstanceIdentifier)
                add_mysql_alarm(dBInstanceIdentifier)
                add_mysql_dashboard(dBInstanceIdentifier)
    elif event_source == 'elasticache.amazonaws.com':
        if event_name == 'CreateReplicationGroup':
            group_id = event_response.get('replicationGroupId')
            clusters = event_response.get('memberClusters', [])
            for c in clusters:
                add_redis_alarm(c)

            add_redis_dashboard(clusters, group_id)
    elif event_source == 'elasticloadbalancing.amazonaws.com':
        if event_name == 'CreateLoadBalancer':
            loadBalancers = event_response.get('loadBalancers', [])
            for loadbalancer in loadBalancers:
                instance_name = loadbalancer.get('loadBalancerName')
                instance_type = loadbalancer.get('type')
                instance_arn = loadbalancer.get('loadBalancerArn')

                if instance_type == 'network':
                    add_elb_alarm(instance_arn, instance_name)
                    add_elb_dashboard(instance_arn, instance_name)

    return {
        'statusCode': 200,
        'body': json.dumps('add success.')
    }
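
# --- Illustrative usage (not part of the original script) ---------------------------------
# This Lambda is meant to be triggered by CloudTrail API-call events. An EventBridge /
# CloudWatch Events rule pattern along these lines (a sketch matching the event names handled
# above) would route the relevant calls to it:
#
# {
#   "detail-type": ["AWS API Call via CloudTrail"],
#   "detail": {
#     "eventSource": ["ec2.amazonaws.com", "rds.amazonaws.com",
#                     "elasticache.amazonaws.com", "elasticloadbalancing.amazonaws.com"],
#     "eventName": ["RunInstances", "CreateDBInstance",
#                   "CreateReplicationGroup", "CreateLoadBalancer"]
#   }
# }
#
# Minimal local smoke test (hypothetical instance id; boto3 credentials and region must be
# configured, and the calls below will hit real AWS APIs):
if __name__ == '__main__':
    sample_event = {
        'detail': {
            'eventSource': 'ec2.amazonaws.com',
            'eventName': 'RunInstances',
            'responseElements': {
                'instancesSet': {'items': [{'instanceId': 'i-0123456789abcdef0'}]}
            }
        }
    }
    print(lambda_handler(sample_event, None))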