You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1269 lines
44 KiB

import boto3
import json
# Create CloudWatch client
# Module-level client used by every alarm and dashboard function in this file.
cloudwatch = boto3.client('cloudwatch')
# SNS topic ARN passed as both OKActions and AlarmActions of every alarm.
# NOTE(review): "sns——arn" looks like a placeholder — replace with a real ARN.
sns_arn = "sns——arn"
# Region written into the "region" property of every dashboard widget.
region = "cn-north-1"
def add_elb_alarm(instance_arn, instance_name=None):
    """Create CloudWatch alarms for a Network Load Balancer and its target groups.

    Three load-balancer level alarms are created (ActiveFlowCount,
    NewFlowCount, ProcessedBytes), plus HealthyHostCount and
    UnHealthyHostCount alarms for every target group referenced by a
    listener default action. Every alarm notifies ``sns_arn`` on both the
    ALARM and OK transitions and treats missing data as not breaching.

    Args:
        instance_arn: Full load balancer ARN. The part after
            ":loadbalancer/" is used as the ``LoadBalancer`` dimension value.
        instance_name: Label used inside the alarm names. Falls back to the
            name segment of the ARN when omitted, so names never contain
            the literal "None".
    """
    print("[Add elb alarm.]")
    elb_client = boto3.client('elbv2')
    instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn)
    # CloudWatch dimension value, e.g. "net/my-lb/0123456789abcdef".
    arn = instance_arn.split(':loadbalancer/')[1]
    print(instance)

    label = instance_name
    if not label:
        arn_parts = arn.split('/')
        label = arn_parts[1] if len(arn_parts) > 1 else arn

    # Collect the "targetgroup/<name>/<id>" suffix of every target group.
    # Actions without a TargetGroupArn (e.g. redirect or fixed-response)
    # are skipped instead of raising KeyError.
    targetgroups = set()
    for listener in instance.get('Listeners', []):
        for action in listener.get('DefaultActions', []):
            targetgroup_arn = action.get('TargetGroupArn')
            if not targetgroup_arn:
                continue
            print(targetgroup_arn)
            targetgroups.add(targetgroup_arn.split(':')[-1])

    def put_alarm(message, name_suffix, description, metric, statistic,
                  dimensions, evaluation_periods, threshold, comparison):
        # Log the rule, create or update the alarm, log the API response.
        print(message)
        response = cloudwatch.put_metric_alarm(
            AlarmName='AWS_ELB_%s' % name_suffix,
            AlarmDescription=description,
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName=metric,
            # The flow metrics below are Network Load Balancer metrics and
            # the caller only passes "network" load balancers, so the
            # namespace must be AWS/NetworkELB (not AWS/ApplicationELB).
            Namespace='AWS/NetworkELB',
            Statistic=statistic,
            Dimensions=dimensions,
            Period=60,
            EvaluationPeriods=evaluation_periods,
            DatapointsToAlarm=evaluation_periods,
            Threshold=threshold,
            ComparisonOperator=comparison,
            TreatMissingData='notBreaching'
        )
        print(response)

    # (log message, metric, description, threshold)
    lb_alarms = [
        ("[ActiveFlowCount, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 当链接数平均大于或等于 6000 就告警]",
         'ActiveFlowCount', 'Alarm when ELB ActiveFlowCount exceeds 6000', 6000),
        ("[NewFlowCount, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 当链接数平均大于或等于 1000 就告警]",
         'NewFlowCount', 'Alarm when ELB NewFlowCount exceeds 1000', 1000),
        ("[ProcessedBytes, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 当链接数平均大于或等于 5m 就告警]",
         'ProcessedBytes', 'Alarm when ELB ProcessedBytes exceeds 5m', 5000000),
    ]
    for message, metric, description, threshold in lb_alarms:
        put_alarm(
            message,
            '%s_%s' % (label, metric),
            description,
            metric,
            'Average',
            # `arn` already carries its "net/" prefix; nothing is prepended.
            [{'Name': 'LoadBalancer', 'Value': arn}],
            3,
            threshold,
            'GreaterThanOrEqualToThreshold',
        )

    # Sorted so alarms are created in a deterministic order.
    for targetgroup in sorted(targetgroups):
        targetgroup_name = targetgroup.split('/')[1]
        dimensions = [
            {'Name': 'TargetGroup', 'Value': targetgroup},
            {'Name': 'LoadBalancer', 'Value': arn},
        ]
        # No Unit filter: a unit the metric is not published with leaves
        # the alarm stuck in INSUFFICIENT_DATA.
        put_alarm(
            "[HealthyHostCount, 1分钟采集1次, 周期为1分钟, 1分钟有1个数据点超过阈值就告警, 健康主机的最大值小于等于 0 就告警]",
            '%s_%s_HealthyHostCount' % (label, targetgroup_name),
            'Alarm when ELB HealthyHostCount drops to 0',
            'HealthyHostCount',
            'Maximum',
            dimensions,
            1,
            0,
            'LessThanOrEqualToThreshold',
        )
        put_alarm(
            "[UnHealthyHostCount, 1分钟采集1次, 周期为1分钟, 1分钟有1个数据点超过阈值就告警, 当不健康主机数量大于或等于 1 个就告警]",
            '%s_%s_UnHealthyHostCount' % (label, targetgroup_name),
            'Alarm when ELB UnHealthyHostCount reaches 1 or more',
            'UnHealthyHostCount',
            'Average',
            dimensions,
            1,
            1,
            'GreaterThanOrEqualToThreshold',
        )
def add_elb_http_alarm(instance_group, instance_arn, port=80, instance_name=None):
    """Create the target 5XX and 4XX count alarms for one target group.

    Both alarms use the AWS/ApplicationELB namespace, the ``Sum`` statistic
    over 60-second periods and a threshold of 10, and notify ``sns_arn`` on
    ALARM and OK.

    Args:
        instance_group: Target group suffix; "targetgroup/" is prepended to
            form the ``TargetGroup`` dimension value.
        instance_arn: Load balancer suffix; "app/" is prepended to form the
            ``LoadBalancer`` dimension value.
        port: Value embedded in the alarm names.
        instance_name: Label embedded in the alarm names.
    """
    print("[Add elb http %s alarm.]" % instance_group)

    name_prefix = 'AWS_ELB_%s_%s_' % (instance_name, port)
    # (log message, metric, evaluation periods, datapoints to alarm)
    rules = (
        ("[HTTPCode_Target_5XX_Count, 1分钟采集1次, 周期为1分钟, 1分钟有1个数据点超过阈值就告警, 当5xx超过 10 个为超过阈值]",
         'HTTPCode_Target_5XX_Count', 1, 1),
        ("[HTTPCode_Target_4XX_Count, 1分钟采集1次, 周期为1分钟, 5分钟有3个数据点超过阈值就告警, 当4xx超过 10% 为超过阈值]",
         'HTTPCode_Target_4XX_Count', 5, 3),
    )
    for banner, metric, window, breaching in rules:
        print(banner)
        result = cloudwatch.put_metric_alarm(
            AlarmName=name_prefix + metric,
            AlarmDescription='Alarm when ELB %s exceeds 10' % metric,
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName=metric,
            Namespace='AWS/ApplicationELB',
            Statistic='Sum',
            Dimensions=[
                {'Name': 'TargetGroup', 'Value': 'targetgroup/%s' % instance_group},
                {'Name': 'LoadBalancer', 'Value': 'app/%s' % instance_arn},
            ],
            Period=60,
            EvaluationPeriods=window,
            DatapointsToAlarm=breaching,
            Threshold=10,
            ComparisonOperator='GreaterThanOrEqualToThreshold',
            TreatMissingData='notBreaching'
        )
        print(result)
def add_ec2_alarm(instance_id, instance_name=None):
    """Create the standard CloudWatch alarms for one EC2 instance.

    Alarms created (all in AWS/EC2, ``Average`` over 60-second periods,
    3 of 3 datapoints, ">=" comparison): CPUUtilization >= 80,
    NetworkIn >= 5000000, NetworkOut >= 5000000 and
    StatusCheckFailed_System >= 1. Every alarm notifies ``sns_arn`` on ALARM
    and OK and treats missing data as not breaching.

    Args:
        instance_id: Value of the ``InstanceId`` dimension.
        instance_name: Label used in the alarm names; ``instance_id`` is
            used when it is empty or None.
    """
    print("Add ec2 %s alarm." % instance_id)
    label = instance_name or instance_id

    # (log message, metric, description, threshold, unit filter or None)
    rules = [
        ("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]",
         'CPUUtilization', 'Alarm when server CPU exceeds 80%', 80.0, 'Percent'),
        ("[NetworkIn, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]",
         'NetworkIn', 'Alarm when server NetworkIn exceeds 5m', 5000000, None),
        ("[NetworkOut, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]",
         'NetworkOut', 'Alarm when server NetworkOut exceeds 5m', 5000000, None),
        # Message and description previously described the NetworkOut alarm
        # (copy-paste); they now describe the status-check alarm itself.
        ("[StatusCheckFailed_System, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 1]",
         'StatusCheckFailed_System', 'Alarm when server System Status Check Failed', 1.0, None),
    ]
    for message, metric, description, threshold, unit in rules:
        print(message)
        params = dict(
            AlarmName='AWS_EC2_%s_%s' % (label, metric),
            AlarmDescription=description,
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName=metric,
            Namespace='AWS/EC2',
            Statistic='Average',
            Dimensions=[{'Name': 'InstanceId', 'Value': '%s' % instance_id}],
            Period=60,
            EvaluationPeriods=3,
            DatapointsToAlarm=3,
            Threshold=threshold,
            ComparisonOperator='GreaterThanOrEqualToThreshold',
            TreatMissingData='notBreaching',
        )
        if unit is not None:
            params['Unit'] = unit
        response = cloudwatch.put_metric_alarm(**params)
        print(response)
def add_ec2_ebs_alarm(instance_id, instance_name=None):
    """Create a VolumeIdleTime alarm for every EBS volume of an instance.

    The volumes are listed through the EC2 resource API; one AWS/EBS alarm
    per volume is created (``Average`` over 60-second periods, 3 of 3
    datapoints, >= 30) that notifies ``sns_arn`` on ALARM and OK.

    Args:
        instance_id: EC2 instance whose attached volumes are enumerated.
        instance_name: Label used in the alarm names; ``instance_id`` is
            used when it is empty or None.
    """
    print("Add ec2 ebs %s alarm." % instance_id)
    label = instance_name or instance_id
    ec2d = boto3.resource('ec2')
    instance = ec2d.Instance(instance_id)
    vol_id = instance.volumes.all()
    print(vol_id)
    for v in vol_id:
        print("[Found EBS volume %s on instance %s]" % (v.id, instance_id))
        # Message and description previously said "CPU exceeds 80%"
        # (copy-paste); they now match the configured rule.
        # NOTE(review): this fires when the volume is idle for >= 30 units
        # per period — confirm that high (not low) idle time is what should
        # alarm.
        print("[VolumeIdleTime, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 30]")
        response = cloudwatch.put_metric_alarm(
            AlarmName='AWS_EC2_%s_EBS_%s_VolumeIdleTime' % (label, v.id),
            AlarmDescription='Alarm when EBS VolumeIdleTime exceeds 30',
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName='VolumeIdleTime',
            Namespace='AWS/EBS',
            Statistic='Average',
            Dimensions=[{'Name': 'VolumeId', 'Value': '%s' % v.id}],
            Period=60,
            EvaluationPeriods=3,
            DatapointsToAlarm=3,
            Threshold=30.0,
            ComparisonOperator='GreaterThanOrEqualToThreshold',
            TreatMissingData='notBreaching'
        )
        print(response)
def add_redis_alarm(instance_id, instance_name=None):
    """Create the standard CloudWatch alarms for one ElastiCache cluster.

    Seven AWS/ElastiCache alarms are created on the ``CacheClusterId``
    dimension (``Average`` over 60-second periods, 3 of 3 datapoints):
    CPUUtilization, EngineCPUUtilization, CurrConnections, FreeableMemory,
    NetworkBytesIn, NetworkBytesOut and CacheMisses. Every alarm notifies
    ``sns_arn`` on ALARM and OK and treats missing data as not breaching.

    Args:
        instance_id: Value of the ``CacheClusterId`` dimension.
        instance_name: Label used in the alarm names; ``instance_id`` is
            used when it is empty or None.
    """
    print("[Add redis %s alarm.]" % instance_id)
    label = instance_name or instance_id
    at_least = 'GreaterThanOrEqualToThreshold'
    at_most = 'LessThanOrEqualToThreshold'

    # (log message, metric, description, threshold, comparison, unit filter)
    # Only the two CPU metrics keep Unit='Percent'. The other metrics are
    # counts or bytes, and a unit filter that does not match the published
    # unit leaves the alarm stuck in INSUFFICIENT_DATA.
    rules = [
        ("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]",
         'CPUUtilization', 'Alarm when redis CPU exceeds 80%', 80.0, at_least, 'Percent'),
        ("[EngineCPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]",
         'EngineCPUUtilization', 'Alarm when redis Engine CPU exceeds 80%', 80.0, at_least, 'Percent'),
        ("[CurrConnections, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 500]",
         'CurrConnections', 'Alarm when redis connections exceeds 500', 500, at_least, None),
        ("[FreeableMemory, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值小于于或等于 1G]",
         'FreeableMemory', 'Alarm when redis FreeableMemory Less than 1G', 1000000000, at_most, None),
        ("[NetworkBytesIn, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]",
         'NetworkBytesIn', 'Alarm when redis NetworkBytesIn exceeds 5m', 5000000, at_least, None),
        ("[NetworkBytesOut, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]",
         'NetworkBytesOut', 'Alarm when redis NetworkBytesOut exceeds 5m', 5000000, at_least, None),
        # Log message previously claimed a "5m" threshold; the rule is 5000.
        ("[CacheMisses, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5000]",
         'CacheMisses', 'Alarm when redis CacheMisses exceeds 5000', 5000, at_least, None),
    ]
    for message, metric, description, threshold, comparison, unit in rules:
        print(message)
        params = dict(
            AlarmName='AWS_REDIS_%s_%s' % (label, metric),
            AlarmDescription=description,
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName=metric,
            Namespace='AWS/ElastiCache',
            Statistic='Average',
            Dimensions=[{'Name': 'CacheClusterId', 'Value': '%s' % instance_id}],
            Period=60,
            EvaluationPeriods=3,
            DatapointsToAlarm=3,
            Threshold=threshold,
            ComparisonOperator=comparison,
            TreatMissingData='notBreaching',
        )
        if unit is not None:
            params['Unit'] = unit
        response = cloudwatch.put_metric_alarm(**params)
        print(response)
def add_mysql_alarm(instance_id, instance_name=None):
    """Create the standard CloudWatch alarms for one RDS MySQL instance.

    Six AWS/RDS alarms are created on the ``DBInstanceIdentifier``
    dimension (``Average`` statistic, 3 of 3 datapoints): CPUUtilization,
    DatabaseConnections, FreeableMemory, FreeStorageSpace,
    NetworkTransmitThroughput and NetworkReceiveThroughput. Every alarm
    notifies ``sns_arn`` on ALARM and OK and treats missing data as not
    breaching.

    Args:
        instance_id: Value of the ``DBInstanceIdentifier`` dimension.
        instance_name: Label used in the alarm names; ``instance_id`` is
            used when it is empty or None.
    """
    print("[Add mysql %s alarm.]" % instance_id)
    label = instance_name or instance_id
    at_least = 'GreaterThanOrEqualToThreshold'
    at_most = 'LessThanOrEqualToThreshold'

    # (log message, metric, description, threshold, comparison, period, unit)
    # Only CPUUtilization keeps Unit='Percent'. The other metrics are counts
    # or bytes, and a unit filter that does not match the published unit
    # leaves the alarm stuck in INSUFFICIENT_DATA.
    rules = [
        # Log message previously claimed a "5m" threshold; the rule is 80%.
        ("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]",
         'CPUUtilization', 'Alarm when mysql CPUUtilization exceeds 80%',
         80.0, at_least, 60, 'Percent'),
        ("[DatabaseConnections, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 500]",
         'DatabaseConnections', 'Alarm when mysql DatabaseConnections exceeds 500',
         500, at_least, 60, None),
        # Log message previously claimed a 3-minute period; Period is 60 s.
        ("[FreeableMemory, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值小于或等于 1g]",
         'FreeableMemory', 'Alarm when mysql FreeableMemory less than 1g',
         1000000000, at_most, 60, None),
        # This alarm previously watched FreeableMemory (copy-paste), so free
        # disk space was never monitored.
        ("[FreeStorageSpace, 5分钟采集1次, 周期为5分钟, 15分钟有3个数据点超过阈值就告警, 平均值小于于或等于 10g]",
         'FreeStorageSpace', 'Alarm when mysql FreeStorageSpace less than 10g',
         10000000000, at_most, 300, None),
        ("[NetworkTransmitThroughput, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于于或等于5m就告警]",
         'NetworkTransmitThroughput', 'Alarm when mysql NetworkTransmitThroughput exceeds 5m',
         5000000, at_least, 60, None),
        ("[NetworkReceiveThroughput, 5分钟采集1次, 周期为5分钟, 15分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]",
         'NetworkReceiveThroughput', 'Alarm when mysql NetworkReceiveThroughput exceeds 5m',
         5000000, at_least, 300, None),
    ]
    for message, metric, description, threshold, comparison, period, unit in rules:
        print(message)
        params = dict(
            AlarmName='AWS_MYSQL_%s_%s' % (label, metric),
            AlarmDescription=description,
            ActionsEnabled=True,
            OKActions=[sns_arn],
            AlarmActions=[sns_arn],
            MetricName=metric,
            Namespace='AWS/RDS',
            Statistic='Average',
            Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': '%s' % instance_id}],
            Period=period,
            EvaluationPeriods=3,
            DatapointsToAlarm=3,
            Threshold=threshold,
            ComparisonOperator=comparison,
            TreatMissingData='notBreaching',
        )
        if unit is not None:
            params['Unit'] = unit
        response = cloudwatch.put_metric_alarm(**params)
        print(response)
def add_ec2_dashboard(instance_id, instance_name=None):
    """Publish the "AWS_EC2_<label>" dashboard for one EC2 instance.

    The dashboard is a single row of four 6x6 time-series widgets: Network,
    CPUUtilization, EBSOps and EBSBalance, all on the ``InstanceId``
    dimension.

    Args:
        instance_id: Value of the ``InstanceId`` dimension.
        instance_name: Label used in the dashboard name; ``instance_id`` is
            used when it is empty or None.
    """
    label = instance_name or instance_id
    print("[add ec2 %s dashboard]" % label)

    # (x position, widget title, metric names) in the emitted widget order.
    layout = (
        (6, "Network", ("NetworkIn", "NetworkOut")),
        (0, "CPUUtilization", ("CPUUtilization",)),
        (12, "EBSOps", ("EBSWriteOps", "EBSReadOps")),
        (18, "EBSBalance", ("EBSIOBalance%", "EBSByteBalance%")),
    )

    widgets = []
    for column, title, names in layout:
        # First row is fully spelled out; later rows use "." to repeat the
        # namespace / dimension of the row above.
        rows = [["AWS/EC2", names[0], "InstanceId", instance_id]]
        for extra in names[1:]:
            rows.append([".", extra, ".", "."])
        widgets.append({
            "type": "metric",
            "x": column,
            "y": 0,
            "width": 6,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": False,
                "metrics": rows,
                "region": region,
                "title": title,
            },
        })

    payload = json.dumps({"widgets": widgets})
    response = cloudwatch.put_dashboard(
        DashboardName='AWS_EC2_%s' % label,
        DashboardBody=payload
    )
    print(response)
def add_mysql_dashboard(instance_id, instance_name=None):
    """Publish the "AWS_MYSQL_<label>" dashboard for one RDS instance.

    Seven untitled 6x6 time-series widgets are laid out on a two-row grid,
    all on the ``DBInstanceIdentifier`` dimension.

    Args:
        instance_id: Value of the ``DBInstanceIdentifier`` dimension.
        instance_name: Label used in the dashboard name; ``instance_id`` is
            used when it is empty or None.
    """
    label = instance_name or instance_id
    print("[add mysql %s dashboard]" % label)

    # (x, y, stacked, metric names) in the emitted widget order.
    layout = (
        (0, 0, False, ("CPUUtilization",)),
        (12, 0, False, ("DatabaseConnections",)),
        (0, 6, True, ("ReadLatency", "WriteLatency")),
        (18, 0, True, ("WriteIOPS", "ReadIOPS")),
        (6, 6, True, ("NetworkTransmitThroughput", "NetworkReceiveThroughput")),
        (12, 6, True, ("FreeStorageSpace", "BinLogDiskUsage")),
        (6, 0, True, ("FreeableMemory",)),
    )

    widgets = []
    for column, row, stacked, names in layout:
        # First metric is fully spelled out; later ones use "." to repeat
        # the namespace / dimension of the row above.
        series = [["AWS/RDS", names[0], "DBInstanceIdentifier", instance_id]]
        series.extend([".", extra, ".", "."] for extra in names[1:])
        widgets.append({
            "type": "metric",
            "x": column,
            "y": row,
            "width": 6,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": stacked,
                "metrics": series,
                "region": region,
            },
        })

    payload = json.dumps({"widgets": widgets})
    response = cloudwatch.put_dashboard(
        DashboardName='AWS_MYSQL_%s' % label,
        DashboardBody=payload
    )
    print(response)
def add_redis_dashboard(clusters, group_id):
    """Publish the "AWS_REDIS_<group_id>" dashboard for a replication group.

    Twelve 6x6 time-series widgets are created; each plots one metric (or
    the NetworkBytesIn/NetworkBytesOut pair) for every cluster in
    ``clusters`` on the ``CacheClusterId`` dimension.

    Args:
        clusters: Iterable of cache cluster ids to plot.
        group_id: Replication group id used in the dashboard name.
    """
    print("[add redis %s dashboard]" % group_id)
    # Materialised once because every widget iterates the ids again.
    cluster_ids = ["%s" % c for c in clusters]

    # (metric names, x, y, stacked, title or None, extra properties)
    # The last widget previously plotted BytesUsedForCache under the
    # "ReplicationBytes" title (copy-paste); it now plots ReplicationBytes.
    layout = [
        (("EngineCPUUtilization",), 6, 0, True, None, {}),
        (("CurrConnections",), 12, 6, True, None, {}),
        (("FreeableMemory",), 12, 0, True, None, {}),
        (("NetworkBytesIn", "NetworkBytesOut"), 0, 6, True, "NetworkBytes", {}),
        (("CacheHits",), 0, 12, False, " CacheHits", {}),
        (("CacheMisses",), 6, 12, False, " CacheMisses", {}),
        (("CPUUtilization",), 0, 0, False, "CPUUtilization", {}),
        (("IsMaster",), 18, 0, False, "IsMaster", {"period": 300}),
        (("NewConnections",), 6, 6, False, "NewConnections", {}),
        (("StringBasedCmds",), 12, 12, False, "StringBasedCmds", {}),
        (("BytesUsedForCache",), 18, 6, False, "BytesUsedForCache", {}),
        (("ReplicationBytes",), 18, 12, False, "ReplicationBytes", {}),
    ]

    widgets = []
    for names, x, y, stacked, title, extra in layout:
        properties = {
            "view": "timeSeries",
            "stacked": stacked,
            # One series per (cluster, metric), grouped by cluster.
            "metrics": [
                ["AWS/ElastiCache", name, "CacheClusterId", cluster_id]
                for cluster_id in cluster_ids
                for name in names
            ],
            "region": region,
        }
        if title is not None:
            properties["title"] = title
        properties.update(extra)
        widgets.append({
            "type": "metric",
            "x": x,
            "y": y,
            "width": 6,
            "height": 6,
            "properties": properties,
        })

    response = cloudwatch.put_dashboard(
        DashboardName='AWS_REDIS_%s' % group_id,
        DashboardBody=json.dumps({"widgets": widgets})
    )
    print(response)
def add_elb_dashboard(instance_arn, instance_name):
    """Publish the "AWS_ELB_<instance_name>" dashboard for a load balancer.

    Four 12x6 time-series widgets in the AWS/NetworkELB namespace are
    created: ProcessedBytes, FlowCount, TCP_Reset and HealthyHost. The
    HealthyHost widget plots UnHealthyHostCount and HealthyHostCount for
    every target group referenced by a listener default action.

    Args:
        instance_arn: Full load balancer ARN. The part after
            ":loadbalancer/" is used as the ``LoadBalancer`` dimension value.
        instance_name: Label used in the dashboard name.
    """
    print("[add elb %s dashboard]" % instance_name)
    elb_client = boto3.client('elbv2')
    instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn)
    arn = instance_arn.split(':loadbalancer/')[1]

    # Collect the "targetgroup/<name>/<id>" suffix of every target group.
    # Actions without a TargetGroupArn are skipped instead of raising
    # KeyError.
    targetgroups = set()
    for listener in instance.get('Listeners', []):
        for action in listener.get('DefaultActions', []):
            targetgroup_arn = action.get('TargetGroupArn')
            if not targetgroup_arn:
                continue
            print(targetgroup_arn)
            targetgroups.add(targetgroup_arn.split(':')[-1])

    # Sorted so the series order is stable between invocations.
    healthy_host_metrics = []
    for targetgroup in sorted(targetgroups):
        for metric in ("UnHealthyHostCount", "HealthyHostCount"):
            healthy_host_metrics.append(
                ["AWS/NetworkELB", metric, "TargetGroup", targetgroup,
                 "LoadBalancer", arn])

    def lb_metrics(*names):
        # First row is fully spelled out; later rows use "." to repeat the
        # namespace / dimension of the row above.
        rows = [["AWS/NetworkELB", names[0], "LoadBalancer", "%s" % arn]]
        rows.extend([".", name, ".", "."] for name in names[1:])
        return rows

    # (x, y, title, metrics)
    layout = [
        (0, 0, "ProcessedBytes", lb_metrics("ProcessedBytes")),
        (12, 0, "FlowCount", lb_metrics("ActiveFlowCount", "NewFlowCount")),
        (0, 6, "TCP_Reset", lb_metrics("TCP_Client_Reset_Count",
                                       "TCP_ELB_Reset_Count",
                                       "TCP_Target_Reset_Count")),
        (12, 6, "HealthyHost", healthy_host_metrics),
    ]
    widgets = [
        {
            "type": "metric",
            "x": x,
            "y": y,
            "width": 12,
            "height": 6,
            "properties": {
                "view": "timeSeries",
                "stacked": False,
                "metrics": metrics,
                "region": region,
                "title": title,
            },
        }
        for x, y, title, metrics in layout
    ]

    response = cloudwatch.put_dashboard(
        DashboardName='AWS_ELB_%s' % instance_name,
        DashboardBody=json.dumps({"widgets": widgets})
    )
    print(response)
def lambda_handler(event, context):
    """Lambda entry point: add alarms and dashboards for newly created resources.

    Dispatches on ``detail.eventSource`` / ``detail.eventName`` of the
    incoming event (presumably a CloudTrail API-call event — confirm
    against the trigger configuration):

    * ec2 ``RunInstances``: EC2 alarms, EBS alarms and a dashboard per
      instance.
    * rds ``CreateDBInstance`` with engine "mysql": MySQL alarms and
      dashboard.
    * elasticache ``CreateReplicationGroup``: Redis alarms per member
      cluster and one dashboard for the group.
    * elasticloadbalancing ``CreateLoadBalancer`` of type "network": ELB
      alarms and dashboard.

    Keys that are missing or explicitly null (for example
    ``responseElements`` on a failed API call) are treated as empty, so such
    events are ignored instead of raising.

    Args:
        event: Event dict; only its ``detail`` entry is read.
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``body``.
    """
    print(event)
    # `or {}` also covers keys that are present with a null value, which
    # dict.get's default argument does not.
    detail = event.get('detail') or {}
    event_source = detail.get('eventSource')
    event_name = detail.get('eventName')
    event_response = detail.get('responseElements') or {}

    if event_source == 'ec2.amazonaws.com':
        if event_name == 'RunInstances':
            instances = event_response.get('instancesSet') or {}
            for item in instances.get('items') or []:
                print(item)
                instance_id = item.get('instanceId')
                if instance_id:
                    add_ec2_alarm(instance_id)
                    add_ec2_ebs_alarm(instance_id)
                    add_ec2_dashboard(instance_id)
    elif event_source == 'rds.amazonaws.com':
        if event_name == 'CreateDBInstance':
            dBInstanceArn = event_response.get('dBInstanceArn')
            dBInstanceIdentifier = event_response.get('dBInstanceIdentifier')
            engine = event_response.get('engine')
            if engine == 'mysql' and dBInstanceIdentifier:
                print(dBInstanceArn, dBInstanceIdentifier)
                add_mysql_alarm(dBInstanceIdentifier)
                add_mysql_dashboard(dBInstanceIdentifier)
    elif event_source == 'elasticache.amazonaws.com':
        if event_name == 'CreateReplicationGroup':
            group_id = event_response.get('replicationGroupId')
            clusters = event_response.get('memberClusters') or []
            # Nothing to monitor (and nothing to plot) without members.
            if clusters:
                for c in clusters:
                    add_redis_alarm(c)
                add_redis_dashboard(clusters, group_id)
    elif event_source == 'elasticloadbalancing.amazonaws.com':
        if event_name == 'CreateLoadBalancer':
            for loadbalancer in event_response.get('loadBalancers') or []:
                instance_name = loadbalancer.get('loadBalancerName')
                instance_type = loadbalancer.get('type')
                instance_arn = loadbalancer.get('loadBalancerArn')
                # The ELB helpers use Network Load Balancer metrics only.
                if instance_type == 'network' and instance_arn:
                    add_elb_alarm(instance_arn, instance_name)
                    add_elb_dashboard(instance_arn, instance_name)

    return {
        'statusCode': 200,
        'body': json.dumps('add success.')
    }