# mirror of https://github.com/lework/script
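# This script is meant to run as an AWS Lambda function: it listens for
# CloudTrail-originated events (RunInstances, CreateDBInstance,
# CreateReplicationGroup, CreateLoadBalancer) and automatically creates
# CloudWatch alarms and dashboards for the new EC2, EBS, RDS MySQL,
# ElastiCache Redis and network ELB resources.
# Assumed prerequisites (not shown in this file): sns_arn must be replaced
# with a real SNS topic ARN, and the Lambda execution role needs permissions
# covering the boto3 calls below, e.g. cloudwatch:PutMetricAlarm,
# cloudwatch:PutDashboard, elasticloadbalancing:DescribeListeners and
# ec2:DescribeVolumes.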
import boto3
import json

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch')

# SNS topic ARN that receives ALARM/OK notifications (placeholder value) and target region
sns_arn = "sns——arn"
region = "cn-north-1"

def add_elb_alarm(instance_arn, instance_name=None): |
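    """Create alarms for a network load balancer: ActiveFlowCount, NewFlowCount
    and ProcessedBytes on the load balancer itself, plus HealthyHostCount and
    UnHealthyHostCount for every target group referenced by its listeners."""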
|
print("[Add elb alarm.]") |
|
|
|
elb_client = boto3.client('elbv2') |
|
|
|
instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn) |
|
arn = instance_arn.split(':loadbalancer/')[1] |
|
print(instance) |
|
targetgroup_list = [] |
|
for listener in instance.get('Listeners'): |
|
for action in listener['DefaultActions']: |
|
print(action['TargetGroupArn']) |
|
|
|
targetgroup_arn = action['TargetGroupArn'] |
|
targetgroup = targetgroup_arn.split(':')[-1] |
|
targetgroup_list.append(targetgroup) |
|
|
|
print("[ActiveFlowCount, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 当链接数平均大于或等于 6000 就告警]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_ActiveFlowCount' % instance_name, |
|
AlarmDescription='Alarm when ELB ActiveFlowCount exceeds 6000', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='ActiveFlowCount', |
|
Namespace='AWS/NetworkELB',  # flow/host-count metrics are published by network load balancers under AWS/NetworkELB, not AWS/ApplicationELB
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': '%s' % arn |
|
} |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=6000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NewFlowCount, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 当链接数平均大于或等于 1000 就告警]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_NewFlowCount' % instance_name, |
|
AlarmDescription='Alarm when ELB NewFlowCount exceeds 1000',
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NewFlowCount', |
|
Namespace='AWS/NetworkELB',
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': '%s' % arn |
|
} |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=1000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[ProcessedBytes, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 当链接数平均大于或等于 5m 就告警]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_ProcessedBytes' % instance_name, |
|
AlarmDescription='Alarm when ELB ProcessedBytes exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='ProcessedBytes', |
|
Namespace='AWS/NetworkELB',
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': '%s' % arn
|
} |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
for targetgroup in set(targetgroup_list): |
|
targetgroup_name = targetgroup.split('/')[1] |
|
|
|
print("[HealthyHostCount, 1分钟采集1次, 周期为1分钟, 1分钟有1个数据点超过阈值就告警, 健康主机的最大值小于等于 0 就告警]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_%s_HealthyHostCount' % (instance_name, targetgroup_name), |
|
AlarmDescription='Alarm when ELB HealthyHostCount less than 0', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='HealthyHostCount', |
|
Namespace='AWS/NetworkELB',
|
Statistic='Maximum', |
|
Dimensions=[ |
|
{ |
|
'Name': 'TargetGroup', |
|
'Value': '%s' % targetgroup |
|
}, |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': '%s' % arn |
|
} |
|
], |
|
Period=60, |
|
Unit='Count',
|
EvaluationPeriods=1, |
|
DatapointsToAlarm=1, |
|
Threshold=0, |
|
ComparisonOperator='LessThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[UnHealthyHostCount, 1分钟采集1次, 周期为1分钟, 1分钟有1个数据点超过阈值就告警, 当不健康主机数量大于或等于 1 个就告警]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_%s_UnHealthyHostCount' % (instance_name, targetgroup_name), |
|
AlarmDescription='Alarm when ELB UnHealthyHostCount is greater than or equal to 1',
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='UnHealthyHostCount', |
|
Namespace='AWS/NetworkELB',
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'TargetGroup', |
|
'Value': '%s' % targetgroup |
|
}, |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': '%s' % arn |
|
} |
|
], |
|
Period=60, |
|
EvaluationPeriods=1, |
|
DatapointsToAlarm=1, |
|
Threshold=1, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
def add_elb_http_alarm(instance_group, instance_arn, port=80, instance_name=None): |
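    """Create HTTP 5XX/4XX count alarms for an ALB target group.
    Note: lambda_handler below never calls this helper; judging by the
    'targetgroup/%s' and 'app/%s' formatting, instance_group and instance_arn
    are expected here without those prefixes."""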
|
print("[Add elb http %s alarm.]" % instance_group) |
|
print("[HTTPCode_Target_5XX_Count, 1分钟采集1次, 周期为1分钟, 1分钟有1个数据点超过阈值就告警, 当5xx超过 10 个为超过阈值]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_%s_HTTPCode_Target_5XX_Count' % (instance_name, port), |
|
AlarmDescription='Alarm when ELB HTTPCode_Target_5XX_Count exceeds 10', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='HTTPCode_Target_5XX_Count', |
|
Namespace='AWS/ApplicationELB', |
|
Statistic='Sum', |
|
Dimensions=[ |
|
{ |
|
'Name': 'TargetGroup', |
|
'Value': 'targetgroup/%s' % instance_group |
|
}, |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': 'app/%s' % instance_arn |
|
} |
|
], |
|
Period=60, |
|
EvaluationPeriods=1, |
|
DatapointsToAlarm=1, |
|
Threshold=10, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[HTTPCode_Target_4XX_Count, 1分钟采集1次, 周期为1分钟, 5分钟有3个数据点超过阈值就告警, 当4xx超过 10% 为超过阈值]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_ELB_%s_%s_HTTPCode_Target_4XX_Count' % (instance_name, port), |
|
AlarmDescription='Alarm when ELB HTTPCode_Target_4XX_Count exceeds 10', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='HTTPCode_Target_4XX_Count', |
|
Namespace='AWS/ApplicationELB', |
|
Statistic='Sum', |
|
Dimensions=[ |
|
{ |
|
'Name': 'TargetGroup', |
|
'Value': 'targetgroup/%s' % instance_group |
|
}, |
|
{ |
|
'Name': 'LoadBalancer', |
|
'Value': 'app/%s' % instance_arn |
|
} |
|
], |
|
Period=60, |
|
EvaluationPeriods=5, |
|
DatapointsToAlarm=3, |
|
Threshold=10, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
|
|
def add_ec2_alarm(instance_id, instance_name=None): |
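    """Create EC2 instance alarms: CPUUtilization >= 80%, NetworkIn/NetworkOut
    average >= 5 MB, and StatusCheckFailed_System >= 1."""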
|
print("Add ec2 %s alarm." % instance_id) |
|
print("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_EC2_%s_CPUUtilization' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when server CPU exceeds 80%', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='CPUUtilization', |
|
Namespace='AWS/EC2', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'InstanceId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Percent', |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=80.0, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NetworkIn, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_EC2_%s_NetworkIn' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when server NetworkIn exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NetworkIn', |
|
Namespace='AWS/EC2', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'InstanceId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NetworkOut, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_EC2_%s_NetworkOut' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when server NetworkOut exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NetworkOut', |
|
Namespace='AWS/EC2', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'InstanceId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[StatusCheckFailed_System, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_EC2_%s_StatusCheckFailed_System' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when server system status check fails',
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='StatusCheckFailed_System', |
|
Namespace='AWS/EC2', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'InstanceId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=1.0, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
|
|
def add_ec2_ebs_alarm(instance_id, instance_name=None): |
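    """Create a VolumeIdleTime alarm (average >= 30 seconds) for every EBS
    volume attached to the given instance."""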
|
print("Add ec2 ebs %s alarm." % instance_id) |
|
|
|
ec2d = boto3.resource('ec2') |
|
instance = ec2d.Instance(instance_id) |
|
vol_id = instance.volumes.all() |
|
print(vol_id) |
|
for v in vol_id: |
|
print("[Found EBS volume %s on instance %s]" % (v.id, instance_id)) |
|
print("[VolumeIdleTime, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_EC2_%s_EBS_%s_VolumeIdleTime' % (instance_name if instance_name else instance_id, v.id), |
|
AlarmDescription='Alarm when EBS VolumeIdleTime exceeds 30 seconds',
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='VolumeIdleTime', |
|
Namespace='AWS/EBS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'VolumeId', |
|
'Value': '%s' % v.id |
|
}, |
|
], |
|
Period=60, |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=30.0, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
|
|
def add_redis_alarm(instance_id, instance_name=None): |
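    """Create ElastiCache (Redis) alarms for one cache cluster: CPU and engine
    CPU >= 80%, connections >= 500, freeable memory <= 1 GB, network bytes
    in/out >= 5 MB and cache misses >= 5000."""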
|
print("[Add redis %s alarm.]" % instance_id) |
|
print("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_CPUUtilization' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis CPU exceeds 80%', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='CPUUtilization', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Percent', |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=80.0, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[EngineCPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_EngineCPUUtilization' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis Engine CPU exceeds 80%', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='EngineCPUUtilization', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Percent', |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=80.0, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[CurrConnections, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 500]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_CurrConnections' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis connections exceeds 500', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='CurrConnections', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Count',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=500, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[FreeableMemory, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值小于于或等于 1G]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_FreeableMemory' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis FreeableMemory Less than 1G', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='FreeableMemory', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Bytes',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=1000000000, |
|
ComparisonOperator='LessThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NetworkBytesIn, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_NetworkBytesIn' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis NetworkBytesIn exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NetworkBytesIn', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Bytes',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NetworkBytesOut, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_NetworkBytesOut' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis NetworkBytesOut exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NetworkBytesOut', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Bytes',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[CacheMisses, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_REDIS_%s_CacheMisses' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when redis CacheMisses exceeds 5000', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='CacheMisses', |
|
Namespace='AWS/ElastiCache', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'CacheClusterId', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Count',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
|
|
def add_mysql_alarm(instance_id, instance_name=None): |
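    """Create RDS MySQL alarms: CPU >= 80%, connections >= 500, freeable
    memory <= 1 GB, free storage space <= 10 GB and network transmit/receive
    throughput >= 5 MB/s."""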
|
print("[Add mysql %s alarm.]" % instance_id) |
|
print("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_MYSQL_%s_CPUUtilization' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when mysql CPUUtilization exceeds 80%', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='CPUUtilization', |
|
Namespace='AWS/RDS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'DBInstanceIdentifier', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Percent', |
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=80.0, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[DatabaseConnections, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 500]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_MYSQL_%s_DatabaseConnections' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when mysql DatabaseConnections exceeds 500', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='DatabaseConnections', |
|
Namespace='AWS/RDS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'DBInstanceIdentifier', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Count',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=500, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[FreeableMemory, 1分钟采集1次, 周期为3分钟, 3分钟有3个数据点超过阈值就告警, 平均值小于或等于 1g]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_MYSQL_%s_FreeableMemory' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when mysql FreeableMemory less than 1g', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='FreeableMemory', |
|
Namespace='AWS/RDS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'DBInstanceIdentifier', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Bytes',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=1000000000, |
|
ComparisonOperator='LessThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[FreeStorageSpace, 5分钟采集1次, 周期为5分钟, 15分钟有3个数据点超过阈值就告警, 平均值小于于或等于 10g]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_MYSQL_%s_FreeStorageSpace' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when mysql FreeStorageSpace less than 10g', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='FreeStorageSpace',
|
Namespace='AWS/RDS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'DBInstanceIdentifier', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=300, |
|
Unit='Bytes',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=10000000000, |
|
ComparisonOperator='LessThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NetworkTransmitThroughput, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于于或等于5m就告警]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_MYSQL_%s_NetworkTransmitThroughput' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when mysql NetworkTransmitThroughput exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NetworkTransmitThroughput', |
|
Namespace='AWS/RDS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'DBInstanceIdentifier', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=60, |
|
Unit='Bytes/Second',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
print("[NetworkReceiveThroughput, 5分钟采集1次, 周期为5分钟, 15分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") |
|
response = cloudwatch.put_metric_alarm( |
|
AlarmName='AWS_MYSQL_%s_NetworkReceiveThroughput' % (instance_name if instance_name else instance_id), |
|
AlarmDescription='Alarm when mysql NetworkReceiveThroughput exceeds 5m', |
|
ActionsEnabled=True, |
|
OKActions=[sns_arn], |
|
AlarmActions=[sns_arn], |
|
MetricName='NetworkReceiveThroughput', |
|
Namespace='AWS/RDS', |
|
Statistic='Average', |
|
Dimensions=[ |
|
{ |
|
'Name': 'DBInstanceIdentifier', |
|
'Value': '%s' % instance_id |
|
}, |
|
], |
|
Period=300, |
|
Unit='Bytes/Second',
|
EvaluationPeriods=3, |
|
DatapointsToAlarm=3, |
|
Threshold=5000000, |
|
ComparisonOperator='GreaterThanOrEqualToThreshold', |
|
TreatMissingData='notBreaching' |
|
) |
|
print(response) |
|
|
|
def add_ec2_dashboard(instance_id, instance_name=None): |
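    """Create a per-instance EC2 dashboard with CPU, network, EBS ops and
    EBS balance widgets."""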
|
print("[add ec2 %s dashboard]" % (instance_name if instance_name else instance_id)) |
|
|
|
body = {"widgets": [ |
|
{ |
|
"type": "metric", |
|
"x": 6, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/EC2", "NetworkIn", "InstanceId", instance_id], |
|
[".", "NetworkOut", ".", "."] |
|
], |
|
"region": region, |
|
"title": "Network" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/EC2", "CPUUtilization", "InstanceId", instance_id], |
|
], |
|
"region": region, |
|
"title": "CPUUtilization" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/EC2", "EBSWriteOps", "InstanceId", instance_id], |
|
[".", "EBSReadOps", ".", "."] |
|
], |
|
"region": region, |
|
"title": "EBSOps" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 18, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/EC2", "EBSIOBalance%", "InstanceId", instance_id], |
|
[".", "EBSByteBalance%", ".", "."] |
|
], |
|
"region": region, |
|
"title": "EBSBalance" |
|
} |
|
} |
|
] |
|
} |
|
response = cloudwatch.put_dashboard( |
|
DashboardName='AWS_EC2_%s' % (instance_name if instance_name else instance_id), |
|
DashboardBody=json.dumps(body) |
|
) |
|
print(response) |
|
|
|
|
|
def add_mysql_dashboard(instance_id, instance_name=None): |
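    """Create a per-instance RDS dashboard: CPU, freeable memory, connections,
    read/write latency, IOPS, network throughput and storage widgets."""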
|
print("[add mysql %s dashboard]" % (instance_name if instance_name else instance_id)) |
|
|
|
body = { |
|
"widgets": [ |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", instance_id] |
|
], |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/RDS", "DatabaseConnections", "DBInstanceIdentifier", instance_id] |
|
], |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": [ |
|
["AWS/RDS", "ReadLatency", "DBInstanceIdentifier", instance_id], |
|
[".", "WriteLatency", ".", "."] |
|
], |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 18, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": [ |
|
["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", instance_id], |
|
[".", "ReadIOPS", ".", "."] |
|
], |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 6, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": [ |
|
["AWS/RDS", "NetworkTransmitThroughput", "DBInstanceIdentifier", instance_id], |
|
[".", "NetworkReceiveThroughput", ".", "."] |
|
], |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": [ |
|
["AWS/RDS", "FreeStorageSpace", "DBInstanceIdentifier", instance_id], |
|
[".", "BinLogDiskUsage", ".", "."] |
|
], |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 6, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": [ |
|
["AWS/RDS", "FreeableMemory", "DBInstanceIdentifier", instance_id] |
|
], |
|
"region": region |
|
} |
|
} |
|
] |
|
} |
|
response = cloudwatch.put_dashboard( |
|
DashboardName='AWS_MYSQL_%s' % (instance_name if instance_name else instance_id), |
|
DashboardBody=json.dumps(body) |
|
) |
|
print(response) |
|
|
|
def add_redis_dashboard(clusters, group_id): |
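    """Create one dashboard for a Redis replication group, with one line per
    member cluster in each widget (CPU, connections, memory, network,
    hits/misses, commands, replication)."""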
|
print("[add redis %s dashboard]" % group_id) |
|
|
|
EngineCPUUtilization_metrcis = [] |
|
CurrConnections_metrcis = [] |
|
FreeableMemory_metrcis = [] |
|
NetworkBytes_metrcis = [] |
|
CacheHits_metrcis = [] |
|
CacheMisses_metrcis = [] |
|
CPUUtilization_metrcis = [] |
|
IsMaster_metrcis = [] |
|
NewConnections_metrcis = [] |
|
StringBasedCmds_metrcis = [] |
|
BytesUsedForCache_metrcis = [] |
|
ReplicationBytes_metrcis = [] |
|
|
|
for c in clusters: |
|
EngineCPUUtilization_metrcis.append(["AWS/ElastiCache", "EngineCPUUtilization", "CacheClusterId", "%s" % c]) |
|
CurrConnections_metrcis.append(["AWS/ElastiCache", "CurrConnections", "CacheClusterId", "%s" % c]) |
|
FreeableMemory_metrcis.append(["AWS/ElastiCache", "FreeableMemory", "CacheClusterId", "%s" % c]) |
|
NetworkBytes_metrcis.append(["AWS/ElastiCache", "NetworkBytesIn", "CacheClusterId", "%s" % c]) |
|
NetworkBytes_metrcis.append(["AWS/ElastiCache", "NetworkBytesOut", "CacheClusterId", "%s" % c]) |
|
CacheHits_metrcis.append(["AWS/ElastiCache", "CacheHits", "CacheClusterId", "%s" % c]) |
|
CacheMisses_metrcis.append(["AWS/ElastiCache", "CacheMisses", "CacheClusterId", "%s" % c]) |
|
CPUUtilization_metrcis.append(["AWS/ElastiCache", "CPUUtilization", "CacheClusterId", "%s" % c]) |
|
IsMaster_metrcis.append(["AWS/ElastiCache", "IsMaster", "CacheClusterId", "%s" % c]) |
|
NewConnections_metrcis.append(["AWS/ElastiCache", "NewConnections", "CacheClusterId", "%s" % c]) |
|
StringBasedCmds_metrcis.append(["AWS/ElastiCache", "StringBasedCmds", "CacheClusterId", "%s" % c]) |
|
BytesUsedForCache_metrcis.append(["AWS/ElastiCache", "BytesUsedForCache", "CacheClusterId", "%s" % c]) |
|
ReplicationBytes_metrcis.append(["AWS/ElastiCache", "ReplicationBytes", "CacheClusterId", "%s" % c])
|
|
|
body = { |
|
"widgets": [ |
|
{ |
|
"type": "metric", |
|
"x": 6, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"metrics": EngineCPUUtilization_metrcis, |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"metrics": CurrConnections_metrcis, |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": FreeableMemory_metrcis, |
|
"region": region |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": True, |
|
"metrics": NetworkBytes_metrcis, |
|
"region": region, |
|
"title": "NetworkBytes" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 12, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": CacheHits_metrcis, |
|
"region": region, |
|
"title": " CacheHits" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 6, |
|
"y": 12, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": CacheMisses_metrcis, |
|
"region": region, |
|
"title": " CacheMisses" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": CPUUtilization_metrcis, |
|
"region": region, |
|
"title": "CPUUtilization" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 18, |
|
"y": 0, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": IsMaster_metrcis, |
|
"region": region, |
|
"title": "IsMaster", |
|
"period": 300 |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 6, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": NewConnections_metrcis, |
|
"region": region, |
|
"title": "NewConnections" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 12, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": StringBasedCmds_metrcis, |
|
"region": region, |
|
"title": "StringBasedCmds" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 18, |
|
"y": 6, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": BytesUsedForCache_metrcis, |
|
"region": region, |
|
"title": "BytesUsedForCache" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 18, |
|
"y": 12, |
|
"width": 6, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": ReplicationBytes_metrcis, |
|
"region": region, |
|
"title": "ReplicationBytes" |
|
} |
|
} |
|
] |
|
} |
|
|
|
response = cloudwatch.put_dashboard( |
|
DashboardName='AWS_REDIS_%s' % group_id, |
|
DashboardBody=json.dumps(body) |
|
) |
|
print(response) |
|
|
|
def add_elb_dashboard(instance_arn, instance_name): |
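    """Create a dashboard for a network load balancer: processed bytes, flow
    counts, TCP resets and per-target-group healthy/unhealthy host counts."""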
|
print("[add elb %s dashboard]" % instance_name) |
|
elb_client = boto3.client('elbv2') |
|
|
|
instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn) |
|
arn = instance_arn.split(':loadbalancer/')[1] |
|
|
|
targetgroup_list = [] |
|
for listener in instance.get('Listeners'): |
|
for action in listener['DefaultActions']: |
|
print(action['TargetGroupArn']) |
|
|
|
targetgroup_arn = action['TargetGroupArn'] |
|
targetgroup = targetgroup_arn.split(':')[-1] |
|
targetgroup_list.append(targetgroup) |
|
|
|
HealthyHost_metrcis = [] |
|
|
|
for targetgroup in set(targetgroup_list): |
|
HealthyHost_metrcis.append(["AWS/NetworkELB", "UnHealthyHostCount", "TargetGroup", |
|
targetgroup, "LoadBalancer", arn]) |
|
HealthyHost_metrcis.append(["AWS/NetworkELB", "HealthyHostCount", "TargetGroup", |
|
targetgroup, "LoadBalancer", arn]) |
|
|
|
body = { |
|
"widgets": [ |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 0, |
|
"width": 12, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/NetworkELB", "ProcessedBytes", "LoadBalancer", "%s" % arn] |
|
], |
|
"region": region, |
|
"title": "ProcessedBytes" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 0, |
|
"width": 12, |
|
"height": 6, |
|
"properties": { |
|
"metrics": [ |
|
["AWS/NetworkELB", "ActiveFlowCount", "LoadBalancer", "%s" % arn], |
|
[".", "NewFlowCount", ".", "."], |
|
], |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"region": region, |
|
"title": "FlowCount" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 0, |
|
"y": 6, |
|
"width": 12, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": [ |
|
["AWS/NetworkELB", "TCP_Client_Reset_Count", "LoadBalancer", "%s" % arn], |
|
[".", "TCP_ELB_Reset_Count", ".", "."], |
|
[".", "TCP_Target_Reset_Count", ".", "."] |
|
], |
|
"region": region, |
|
"title": "TCP_Reset" |
|
} |
|
}, |
|
{ |
|
"type": "metric", |
|
"x": 12, |
|
"y": 6, |
|
"width": 12, |
|
"height": 6, |
|
"properties": { |
|
"view": "timeSeries", |
|
"stacked": False, |
|
"metrics": HealthyHost_metrcis, |
|
"region": region, |
|
"title": "HealthyHost" |
|
} |
|
} |
|
] |
|
} |
|
|
|
response = cloudwatch.put_dashboard( |
|
DashboardName='AWS_ELB_%s' % instance_name, |
|
DashboardBody=json.dumps(body) |
|
) |
|
print(response) |
|
|
|
def lambda_handler(event, context): |
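    """Lambda entry point. Expects a CloudTrail event delivered through
    CloudWatch Events/EventBridge and dispatches on eventSource/eventName;
    only network-type load balancers get ELB alarms and dashboards."""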
|
# TODO implement |
|
|
|
print(event) |
|
detail = event.get('detail', {}) |
|
event_source = detail.get('eventSource') |
|
event_name = detail.get('eventName') |
|
event_response = detail.get('responseElements', {}) |
|
|
|
if event_source == 'ec2.amazonaws.com': |
|
if event_name == 'RunInstances': |
|
instances = event_response.get('instancesSet', {}) |
|
for item in instances.get('items', []): |
|
print(item) |
|
instance_id = item.get('instanceId') |
|
if instance_id: |
|
add_ec2_alarm(instance_id) |
|
add_ec2_ebs_alarm(instance_id) |
|
add_ec2_dashboard(instance_id) |
|
|
|
elif event_source == 'rds.amazonaws.com': |
|
if event_name == 'CreateDBInstance': |
|
dBInstanceArn = event_response.get('dBInstanceArn') |
|
dBInstanceIdentifier = event_response.get('dBInstanceIdentifier') |
|
engine = event_response.get('engine') |
|
if engine == 'mysql': |
|
print(dBInstanceArn, dBInstanceIdentifier) |
|
add_mysql_alarm(dBInstanceIdentifier) |
|
add_mysql_dashboard(dBInstanceIdentifier) |
|
elif event_source == 'elasticache.amazonaws.com': |
|
if event_name == 'CreateReplicationGroup': |
|
group_id = event_response.get('replicationGroupId') |
|
clusters = event_response.get('memberClusters') |
|
for c in clusters: |
|
add_redis_alarm(c) |
|
|
|
add_redis_dashboard(clusters, group_id) |
|
elif event_source == 'elasticloadbalancing.amazonaws.com': |
|
if event_name == 'CreateLoadBalancer': |
|
loadBalancers = event_response.get('loadBalancers') |
|
for loadbalancer in loadBalancers: |
|
instance_name = loadbalancer.get('loadBalancerName') |
|
instance_type = loadbalancer.get('type') |
|
instance_arn = loadbalancer.get('loadBalancerArn') |
|
|
|
if instance_type == 'network': |
|
add_elb_alarm(instance_arn, instance_name) |
|
add_elb_dashboard(instance_arn, instance_name) |
|
|
|
return { |
|
'statusCode': 200, |
|
'body': json.dumps('add success.') |
|
}
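# A minimal local sketch of how the handler is exercised. The event below only
# mirrors the fields lambda_handler() actually reads in the EC2 RunInstances
# branch; the instance id is a made-up example, and running this will issue
# real CloudWatch/EC2 API calls with your default AWS credentials.
if __name__ == '__main__':
    sample_event = {
        'detail': {
            'eventSource': 'ec2.amazonaws.com',
            'eventName': 'RunInstances',
            'responseElements': {
                'instancesSet': {
                    'items': [
                        {'instanceId': 'i-0123456789abcdef0'}  # hypothetical instance id
                    ]
                }
            }
        }
    }
    print(lambda_handler(sample_event, None))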