diff --git a/cloud/aws/lambda/addAlarm.py b/cloud/aws/lambda/addAlarm.py
new file mode 100644
index 0000000..d9b1d54
--- /dev/null
+++ b/cloud/aws/lambda/addAlarm.py
@@ -0,0 +1,1269 @@
+import boto3
+import json
+
+# Create CloudWatch client
+cloudwatch = boto3.client('cloudwatch')
+sns_arn = "sns——arn"
+region = "cn-north-1"
+
+
+def add_elb_alarm(instance_arn, instance_name=None):
+    print("[Add elb alarm.]")
+
+    elb_client = boto3.client('elbv2')
+
+    instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn)
+    arn = instance_arn.split(':loadbalancer/')[1]
+    print(instance)
+    targetgroup_list = []
+    for listener in instance.get('Listeners'):
+        for action in listener['DefaultActions']:
+            print(action['TargetGroupArn'])
+
+            targetgroup_arn = action['TargetGroupArn']
+            targetgroup = targetgroup_arn.split(':')[-1]
+            targetgroup_list.append(targetgroup)
+
+    print("[ActiveFlowCount, 1-minute period, 3 of 3 datapoints in 3 minutes, alarm when average connections >= 6000]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_ELB_%s_ActiveFlowCount' % instance_name,
+        AlarmDescription='Alarm when ELB ActiveFlowCount exceeds 6000',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='ActiveFlowCount',
+        Namespace='AWS/NetworkELB',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'LoadBalancer',
+                'Value': '%s' % arn
+            }
+        ],
+        Period=60,
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=6000,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[NewFlowCount, 1-minute period, 3 of 3 datapoints in 3 minutes, alarm when average connections >= 1000]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_ELB_%s_NewFlowCount' % instance_name,
+        AlarmDescription='Alarm when ELB NewFlowCount exceeds 1000',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='NewFlowCount',
+        Namespace='AWS/NetworkELB',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'LoadBalancer',
+                'Value': '%s' % arn
+            }
+        ],
+        Period=60,
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=1000,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[ProcessedBytes, 1-minute period, 3 of 3 datapoints in 3 minutes, alarm when average bytes >= 5000000]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_ELB_%s_ProcessedBytes' % instance_name,
+        AlarmDescription='Alarm when ELB ProcessedBytes exceeds 5000000 bytes',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='ProcessedBytes',
+        Namespace='AWS/NetworkELB',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'LoadBalancer',
+                'Value': '%s' % arn
+            }
+        ],
+        Period=60,
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=5000000,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    for targetgroup in set(targetgroup_list):
+        targetgroup_name = targetgroup.split('/')[1]
+
+        print("[HealthyHostCount, 1-minute period, 1 of 1 datapoints in 1 minute, alarm when maximum healthy hosts <= 0]")
+        response = cloudwatch.put_metric_alarm(
+            AlarmName='AWS_ELB_%s_%s_HealthyHostCount' % (instance_name, targetgroup_name),
+            AlarmDescription='Alarm when ELB HealthyHostCount is less than or equal to 0',
+            ActionsEnabled=True,
+            OKActions=[sns_arn],
+            AlarmActions=[sns_arn],
+            MetricName='HealthyHostCount',
+            Namespace='AWS/NetworkELB',
+            Statistic='Maximum',
+            Dimensions=[
+                {
+                    'Name': 'TargetGroup',
+                    'Value': '%s' % targetgroup
+                },
+                {
+                    'Name': 'LoadBalancer',
+                    'Value': '%s' % arn
+                }
+            ],
+            Period=60,
+            Unit='Count',
+            EvaluationPeriods=1,
+            DatapointsToAlarm=1,
+            Threshold=0,
+            ComparisonOperator='LessThanOrEqualToThreshold',
+            TreatMissingData='notBreaching'
+        )
+        print(response)
+
+        print("[UnHealthyHostCount, 1-minute period, 1 of 1 datapoints in 1 minute, alarm when unhealthy hosts >= 1]")
+        response = cloudwatch.put_metric_alarm(
+            AlarmName='AWS_ELB_%s_%s_UnHealthyHostCount' % (instance_name, targetgroup_name),
+            AlarmDescription='Alarm when ELB UnHealthyHostCount exceeds 1',
+            ActionsEnabled=True,
+            OKActions=[sns_arn],
+            AlarmActions=[sns_arn],
+            MetricName='UnHealthyHostCount',
+            Namespace='AWS/NetworkELB',
+            Statistic='Average',
+            Dimensions=[
+                {
+                    'Name': 'TargetGroup',
+                    'Value': '%s' % targetgroup
+                },
+                {
+                    'Name': 'LoadBalancer',
+                    'Value': '%s' % arn
+                }
+            ],
+            Period=60,
+            EvaluationPeriods=1,
+            DatapointsToAlarm=1,
+            Threshold=1,
+            ComparisonOperator='GreaterThanOrEqualToThreshold',
+            TreatMissingData='notBreaching'
+        )
+        print(response)
+
+
+def add_elb_http_alarm(instance_group, instance_arn, port=80, instance_name=None):
+    print("[Add elb http %s alarm.]" % instance_group)
+    print("[HTTPCode_Target_5XX_Count, 1-minute period, 1 of 1 datapoints in 1 minute, alarm when 5xx count >= 10]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_ELB_%s_%s_HTTPCode_Target_5XX_Count' % (instance_name, port),
+        AlarmDescription='Alarm when ELB HTTPCode_Target_5XX_Count exceeds 10',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='HTTPCode_Target_5XX_Count',
+        Namespace='AWS/ApplicationELB',
+        Statistic='Sum',
+        Dimensions=[
+            {
+                'Name': 'TargetGroup',
+                'Value': 'targetgroup/%s' % instance_group
+            },
+            {
+                'Name': 'LoadBalancer',
+                'Value': 'app/%s' % instance_arn
+            }
+        ],
+        Period=60,
+        EvaluationPeriods=1,
+        DatapointsToAlarm=1,
+        Threshold=10,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[HTTPCode_Target_4XX_Count, 1-minute period, 3 of 5 datapoints in 5 minutes, alarm when 4xx count >= 10]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_ELB_%s_%s_HTTPCode_Target_4XX_Count' % (instance_name, port),
+        AlarmDescription='Alarm when ELB HTTPCode_Target_4XX_Count exceeds 10',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='HTTPCode_Target_4XX_Count',
+        Namespace='AWS/ApplicationELB',
+        Statistic='Sum',
+        Dimensions=[
+            {
+                'Name': 'TargetGroup',
+                'Value': 'targetgroup/%s' % instance_group
+            },
+            {
+                'Name': 'LoadBalancer',
+                'Value': 'app/%s' % instance_arn
+            }
+        ],
+        Period=60,
+        EvaluationPeriods=5,
+        DatapointsToAlarm=3,
+        Threshold=10,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+
+def add_ec2_alarm(instance_id, instance_name=None):
+    print("Add ec2 %s alarm."
% instance_id) + print("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_EC2_%s_CPUUtilization' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when server CPU exceeds 80%', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='CPUUtilization', + Namespace='AWS/EC2', + Statistic='Average', + Dimensions=[ + { + 'Name': 'InstanceId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=80.0, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[NetworkIn, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_EC2_%s_NetworkIn' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when server NetworkIn exceeds 5m', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='NetworkIn', + Namespace='AWS/EC2', + Statistic='Average', + Dimensions=[ + { + 'Name': 'InstanceId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=5000000, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[NetworkOut, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_EC2_%s_NetworkOut' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when server NetworkOut exceeds 5m', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='NetworkOut', + Namespace='AWS/EC2', + Statistic='Average', + Dimensions=[ + { + 'Name': 'InstanceId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=5000000, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[StatusCheckFailed_System, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_EC2_%s_StatusCheckFailed_System' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when server NetworkOut Status Check Failed', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='StatusCheckFailed_System', + Namespace='AWS/EC2', + Statistic='Average', + Dimensions=[ + { + 'Name': 'InstanceId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=1.0, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + +def add_ec2_ebs_alarm(instance_id, instance_name=None): + print("Add ec2 ebs %s alarm." 
% instance_id) + + ec2d = boto3.resource('ec2') + instance = ec2d.Instance(instance_id) + vol_id = instance.volumes.all() + print(vol_id) + for v in vol_id: + print("[Found EBS volume %s on instance %s]" % (v.id, instance_id)) + print("[VolumeIdleTime, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_EC2_%s_EBS_%s_VolumeIdleTime' % (instance_name if instance_name else instance_id, v.id), + AlarmDescription='Alarm when server CPU exceeds 80%', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='VolumeIdleTime', + Namespace='AWS/EBS', + Statistic='Average', + Dimensions=[ + { + 'Name': 'VolumeId', + 'Value': '%s' % v.id + }, + ], + Period=60, + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=30.0, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + +def add_redis_alarm(instance_id, instance_name=None): + print("[Add redis %s alarm.]" % instance_id) + print("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_CPUUtilization' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis CPU exceeds 80%', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='CPUUtilization', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=80.0, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[EngineCPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 80%]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_EngineCPUUtilization' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis Engine CPU exceeds 80%', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='EngineCPUUtilization', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=80.0, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[CurrConnections, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 500]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_CurrConnections' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis connections exceeds 500', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='CurrConnections', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=500, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[FreeableMemory, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值小于于或等于 1G]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_FreeableMemory' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis FreeableMemory Less than 1G', + ActionsEnabled=True, + OKActions=[sns_arn], + 
AlarmActions=[sns_arn], + MetricName='FreeableMemory', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=1000000000, + ComparisonOperator='LessThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[NetworkBytesIn, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_NetworkBytesIn' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis NetworkBytesIn exceeds 5m', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='NetworkBytesIn', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=5000000, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[NetworkBytesOut, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_NetworkBytesOut' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis NetworkBytesOut exceeds 5m', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='NetworkBytesOut', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=5000000, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[CacheMisses, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_REDIS_%s_CacheMisses' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when redis CacheMisses exceeds 5000', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='CacheMisses', + Namespace='AWS/ElastiCache', + Statistic='Average', + Dimensions=[ + { + 'Name': 'CacheClusterId', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=5000, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + +def add_mysql_alarm(instance_id, instance_name=None): + print("[Add mysql %s alarm.]" % instance_id) + print("[CPUUtilization, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 5m]") + response = cloudwatch.put_metric_alarm( + AlarmName='AWS_MYSQL_%s_CPUUtilization' % (instance_name if instance_name else instance_id), + AlarmDescription='Alarm when mysql CPUUtilization exceeds 80%', + ActionsEnabled=True, + OKActions=[sns_arn], + AlarmActions=[sns_arn], + MetricName='CPUUtilization', + Namespace='AWS/RDS', + Statistic='Average', + Dimensions=[ + { + 'Name': 'DBInstanceIdentifier', + 'Value': '%s' % instance_id + }, + ], + Period=60, + Unit='Percent', + EvaluationPeriods=3, + DatapointsToAlarm=3, + Threshold=80.0, + ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + + print("[DatabaseConnections, 1分钟采集1次, 周期为1分钟, 3分钟有3个数据点超过阈值就告警, 平均值大于或等于 500]") + response = cloudwatch.put_metric_alarm( + 
        AlarmName='AWS_MYSQL_%s_DatabaseConnections' % (instance_name if instance_name else instance_id),
+        AlarmDescription='Alarm when mysql DatabaseConnections exceeds 500',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='DatabaseConnections',
+        Namespace='AWS/RDS',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'DBInstanceIdentifier',
+                'Value': '%s' % instance_id
+            },
+        ],
+        Period=60,
+        Unit='Count',
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=500,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[FreeableMemory, 1-minute period, 3 of 3 datapoints in 3 minutes, alarm when average <= 1 GB]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_MYSQL_%s_FreeableMemory' % (instance_name if instance_name else instance_id),
+        AlarmDescription='Alarm when mysql FreeableMemory is less than 1 GB',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='FreeableMemory',
+        Namespace='AWS/RDS',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'DBInstanceIdentifier',
+                'Value': '%s' % instance_id
+            },
+        ],
+        Period=60,
+        Unit='Bytes',
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=1000000000,
+        ComparisonOperator='LessThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[FreeStorageSpace, 5-minute period, 3 of 3 datapoints in 15 minutes, alarm when average <= 10 GB]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_MYSQL_%s_FreeStorageSpace' % (instance_name if instance_name else instance_id),
+        AlarmDescription='Alarm when mysql FreeStorageSpace is less than 10 GB',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='FreeStorageSpace',
+        Namespace='AWS/RDS',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'DBInstanceIdentifier',
+                'Value': '%s' % instance_id
+            },
+        ],
+        Period=300,
+        Unit='Bytes',
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=10000000000,
+        ComparisonOperator='LessThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[NetworkTransmitThroughput, 1-minute period, 3 of 3 datapoints in 3 minutes, alarm when average >= 5000000 bytes/second]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_MYSQL_%s_NetworkTransmitThroughput' % (instance_name if instance_name else instance_id),
+        AlarmDescription='Alarm when mysql NetworkTransmitThroughput exceeds 5000000 bytes/second',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='NetworkTransmitThroughput',
+        Namespace='AWS/RDS',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'DBInstanceIdentifier',
+                'Value': '%s' % instance_id
+            },
+        ],
+        Period=60,
+        Unit='Bytes/Second',
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=5000000,
+        ComparisonOperator='GreaterThanOrEqualToThreshold',
+        TreatMissingData='notBreaching'
+    )
+    print(response)
+
+    print("[NetworkReceiveThroughput, 5-minute period, 3 of 3 datapoints in 15 minutes, alarm when average >= 5000000 bytes/second]")
+    response = cloudwatch.put_metric_alarm(
+        AlarmName='AWS_MYSQL_%s_NetworkReceiveThroughput' % (instance_name if instance_name else instance_id),
+        AlarmDescription='Alarm when mysql NetworkReceiveThroughput exceeds 5000000 bytes/second',
+        ActionsEnabled=True,
+        OKActions=[sns_arn],
+        AlarmActions=[sns_arn],
+        MetricName='NetworkReceiveThroughput',
+        Namespace='AWS/RDS',
+        Statistic='Average',
+        Dimensions=[
+            {
+                'Name': 'DBInstanceIdentifier',
+                'Value': '%s' % instance_id
+            },
+        ],
+        Period=300,
+        Unit='Bytes/Second',
+        EvaluationPeriods=3,
+        DatapointsToAlarm=3,
+        Threshold=5000000,
ComparisonOperator='GreaterThanOrEqualToThreshold', + TreatMissingData='notBreaching' + ) + print(response) + +def add_ec2_dashboard(instance_id, instance_name=None): + print("[add ec2 %s dashboard]" % (instance_name if instance_name else instance_id)) + + body = {"widgets": [ + { + "type": "metric", + "x": 6, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/EC2", "NetworkIn", "InstanceId", instance_id], + [".", "NetworkOut", ".", "."] + ], + "region": region, + "title": "Network" + } + }, + { + "type": "metric", + "x": 0, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/EC2", "CPUUtilization", "InstanceId", instance_id], + ], + "region": region, + "title": "CPUUtilization" + } + }, + { + "type": "metric", + "x": 12, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/EC2", "EBSWriteOps", "InstanceId", instance_id], + [".", "EBSReadOps", ".", "."] + ], + "region": region, + "title": "EBSOps" + } + }, + { + "type": "metric", + "x": 18, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/EC2", "EBSIOBalance%", "InstanceId", instance_id], + [".", "EBSByteBalance%", ".", "."] + ], + "region": region, + "title": "EBSBalance" + } + } + ] + } + response = cloudwatch.put_dashboard( + DashboardName='AWS_EC2_%s' % (instance_name if instance_name else instance_id), + DashboardBody=json.dumps(body) + ) + print(response) + + +def add_mysql_dashboard(instance_id, instance_name=None): + print("[add mysql %s dashboard]" % (instance_name if instance_name else instance_id)) + + body = { + "widgets": [ + { + "type": "metric", + "x": 0, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", instance_id] + ], + "region": region + } + }, + { + "type": "metric", + "x": 12, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/RDS", "DatabaseConnections", "DBInstanceIdentifier", instance_id] + ], + "region": region + } + }, + { + "type": "metric", + "x": 0, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": True, + "metrics": [ + ["AWS/RDS", "ReadLatency", "DBInstanceIdentifier", instance_id], + [".", "WriteLatency", ".", "."] + ], + "region": region + } + }, + { + "type": "metric", + "x": 18, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": True, + "metrics": [ + ["AWS/RDS", "WriteIOPS", "DBInstanceIdentifier", instance_id], + [".", "ReadIOPS", ".", "."] + ], + "region": region + } + }, + { + "type": "metric", + "x": 6, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": True, + "metrics": [ + ["AWS/RDS", "NetworkTransmitThroughput", "DBInstanceIdentifier", instance_id], + [".", "NetworkReceiveThroughput", ".", "."] + ], + "region": region + } + }, + { + "type": "metric", + "x": 12, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": True, + "metrics": [ + ["AWS/RDS", "FreeStorageSpace", "DBInstanceIdentifier", instance_id], + [".", "BinLogDiskUsage", ".", "."] + ], + "region": region + } + }, + { + "type": "metric", + "x": 6, + "y": 0, + "width": 6, + "height": 6, + 
"properties": { + "view": "timeSeries", + "stacked": True, + "metrics": [ + ["AWS/RDS", "FreeableMemory", "DBInstanceIdentifier", instance_id] + ], + "region": region + } + } + ] + } + response = cloudwatch.put_dashboard( + DashboardName='AWS_MYSQL_%s' % (instance_name if instance_name else instance_id), + DashboardBody=json.dumps(body) + ) + print(response) + +def add_redis_dashboard(clusters, group_id): + print("[add redis %s dashboard]" % group_id) + + EngineCPUUtilization_metrcis = [] + CurrConnections_metrcis = [] + FreeableMemory_metrcis = [] + NetworkBytes_metrcis = [] + CacheHits_metrcis = [] + CacheMisses_metrcis = [] + CPUUtilization_metrcis = [] + IsMaster_metrcis = [] + NewConnections_metrcis = [] + StringBasedCmds_metrcis = [] + BytesUsedForCache_metrcis = [] + ReplicationBytes_metrcis = [] + + for c in clusters: + EngineCPUUtilization_metrcis.append(["AWS/ElastiCache", "EngineCPUUtilization", "CacheClusterId", "%s" % c]) + CurrConnections_metrcis.append(["AWS/ElastiCache", "CurrConnections", "CacheClusterId", "%s" % c]) + FreeableMemory_metrcis.append(["AWS/ElastiCache", "FreeableMemory", "CacheClusterId", "%s" % c]) + NetworkBytes_metrcis.append(["AWS/ElastiCache", "NetworkBytesIn", "CacheClusterId", "%s" % c]) + NetworkBytes_metrcis.append(["AWS/ElastiCache", "NetworkBytesOut", "CacheClusterId", "%s" % c]) + CacheHits_metrcis.append(["AWS/ElastiCache", "CacheHits", "CacheClusterId", "%s" % c]) + CacheMisses_metrcis.append(["AWS/ElastiCache", "CacheMisses", "CacheClusterId", "%s" % c]) + CPUUtilization_metrcis.append(["AWS/ElastiCache", "CPUUtilization", "CacheClusterId", "%s" % c]) + IsMaster_metrcis.append(["AWS/ElastiCache", "IsMaster", "CacheClusterId", "%s" % c]) + NewConnections_metrcis.append(["AWS/ElastiCache", "NewConnections", "CacheClusterId", "%s" % c]) + StringBasedCmds_metrcis.append(["AWS/ElastiCache", "StringBasedCmds", "CacheClusterId", "%s" % c]) + BytesUsedForCache_metrcis.append(["AWS/ElastiCache", "BytesUsedForCache", "CacheClusterId", "%s" % c]) + ReplicationBytes_metrcis.append(["AWS/ElastiCache", "BytesUsedForCache", "CacheClusterId", "%s" % c]) + + body = { + "widgets": [ + { + "type": "metric", + "x": 6, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "metrics": EngineCPUUtilization_metrcis, + "view": "timeSeries", + "stacked": True, + "region": region + } + }, + { + "type": "metric", + "x": 12, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "metrics": CurrConnections_metrcis, + "view": "timeSeries", + "stacked": True, + "region": region + } + }, + { + "type": "metric", + "x": 12, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": True, + "metrics": FreeableMemory_metrcis, + "region": region + } + }, + { + "type": "metric", + "x": 0, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": True, + "metrics": NetworkBytes_metrcis, + "region": region, + "title": "NetworkBytes" + } + }, + { + "type": "metric", + "x": 0, + "y": 12, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": CacheHits_metrcis, + "region": region, + "title": " CacheHits" + } + }, + { + "type": "metric", + "x": 6, + "y": 12, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": CacheMisses_metrcis, + "region": region, + "title": " CacheMisses" + } + }, + { + "type": "metric", + "x": 0, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": 
False, + "metrics": CPUUtilization_metrcis, + "region": region, + "title": "CPUUtilization" + } + }, + { + "type": "metric", + "x": 18, + "y": 0, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": IsMaster_metrcis, + "region": region, + "title": "IsMaster", + "period": 300 + } + }, + { + "type": "metric", + "x": 6, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": NewConnections_metrcis, + "region": region, + "title": "NewConnections" + } + }, + { + "type": "metric", + "x": 12, + "y": 12, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": StringBasedCmds_metrcis, + "region": region, + "title": "StringBasedCmds" + } + }, + { + "type": "metric", + "x": 18, + "y": 6, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": BytesUsedForCache_metrcis, + "region": region, + "title": "BytesUsedForCache" + } + }, + { + "type": "metric", + "x": 18, + "y": 12, + "width": 6, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": ReplicationBytes_metrcis, + "region": region, + "title": "ReplicationBytes" + } + } + ] + } + + response = cloudwatch.put_dashboard( + DashboardName='AWS_REDIS_%s' % group_id, + DashboardBody=json.dumps(body) + ) + print(response) + +def add_elb_dashboard(instance_arn, instance_name): + print("[add elb %s dashboard]" % instance_name) + elb_client = boto3.client('elbv2') + + instance = elb_client.describe_listeners(LoadBalancerArn=instance_arn) + arn = instance_arn.split(':loadbalancer/')[1] + + targetgroup_list = [] + for listener in instance.get('Listeners'): + for action in listener['DefaultActions']: + print(action['TargetGroupArn']) + + targetgroup_arn = action['TargetGroupArn'] + targetgroup = targetgroup_arn.split(':')[-1] + targetgroup_list.append(targetgroup) + + HealthyHost_metrcis = [] + + for targetgroup in set(targetgroup_list): + HealthyHost_metrcis.append(["AWS/NetworkELB", "UnHealthyHostCount", "TargetGroup", + targetgroup, "LoadBalancer", arn]) + HealthyHost_metrcis.append(["AWS/NetworkELB", "HealthyHostCount", "TargetGroup", + targetgroup, "LoadBalancer", arn]) + + body = { + "widgets": [ + { + "type": "metric", + "x": 0, + "y": 0, + "width": 12, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/NetworkELB", "ProcessedBytes", "LoadBalancer", "%s" % arn] + ], + "region": region, + "title": "ProcessedBytes" + } + }, + { + "type": "metric", + "x": 12, + "y": 0, + "width": 12, + "height": 6, + "properties": { + "metrics": [ + ["AWS/NetworkELB", "ActiveFlowCount", "LoadBalancer", "%s" % arn], + [".", "NewFlowCount", ".", "."], + ], + "view": "timeSeries", + "stacked": False, + "region": region, + "title": "FlowCount" + } + }, + { + "type": "metric", + "x": 0, + "y": 6, + "width": 12, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": [ + ["AWS/NetworkELB", "TCP_Client_Reset_Count", "LoadBalancer", "%s" % arn], + [".", "TCP_ELB_Reset_Count", ".", "."], + [".", "TCP_Target_Reset_Count", ".", "."] + ], + "region": region, + "title": "TCP_Reset" + } + }, + { + "type": "metric", + "x": 12, + "y": 6, + "width": 12, + "height": 6, + "properties": { + "view": "timeSeries", + "stacked": False, + "metrics": HealthyHost_metrcis, + "region": region, + "title": "HealthyHost" + } + } + ] + } + + response = cloudwatch.put_dashboard( + 
DashboardName='AWS_ELB_%s' % instance_name, + DashboardBody=json.dumps(body) + ) + print(response) + +def lambda_handler(event, context): + # TODO implement + + print(event) + detail = event.get('detail', {}) + event_source = detail.get('eventSource') + event_name = detail.get('eventName') + event_response = detail.get('responseElements', {}) + + if event_source == 'ec2.amazonaws.com': + if event_name == 'RunInstances': + instances = event_response.get('instancesSet', {}) + for item in instances.get('items', []): + print(item) + instance_id = item.get('instanceId') + if instance_id: + add_ec2_alarm(instance_id) + add_ec2_ebs_alarm(instance_id) + add_ec2_dashboard(instance_id) + + elif event_source == 'rds.amazonaws.com': + if event_name == 'CreateDBInstance': + dBInstanceArn = event_response.get('dBInstanceArn') + dBInstanceIdentifier = event_response.get('dBInstanceIdentifier') + engine = event_response.get('engine') + if engine == 'mysql': + print(dBInstanceArn, dBInstanceIdentifier) + add_mysql_alarm(dBInstanceIdentifier) + add_mysql_dashboard(dBInstanceIdentifier) + elif event_source == 'elasticache.amazonaws.com': + if event_name == 'CreateReplicationGroup': + group_id = event_response.get('replicationGroupId') + clusters = event_response.get('memberClusters') + for c in clusters: + add_redis_alarm(c) + + add_redis_dashboard(clusters, group_id) + elif event_source == 'elasticloadbalancing.amazonaws.com': + if event_name == 'CreateLoadBalancer': + loadBalancers = event_response.get('loadBalancers') + for loadbalancer in loadBalancers: + instance_name = loadbalancer.get('loadBalancerName') + instance_type = loadbalancer.get('type') + instance_arn = loadbalancer.get('loadBalancerArn') + + if instance_type == 'network': + add_elb_alarm(instance_arn, instance_name) + add_elb_dashboard(instance_arn, instance_name) + + return { + 'statusCode': 200, + 'body': json.dumps('add success.') + } diff --git a/cloud/aws/lambda/delAlarm.py b/cloud/aws/lambda/delAlarm.py new file mode 100644 index 0000000..dd6e492 --- /dev/null +++ b/cloud/aws/lambda/delAlarm.py @@ -0,0 +1,83 @@ +import boto3 +import json + +cloudwatch = boto3.client('cloudwatch') + + +def del_alarm(type, name_prefix): + print("[delete %s alarm.]" % type) + + rep = cloudwatch.describe_alarms(AlarmNamePrefix=name_prefix, ) + + alarm_names = [] + for m in rep.get('MetricAlarms'): + alarm_names.append(m.get('AlarmName')) + + response = cloudwatch.delete_alarms( + AlarmNames=alarm_names + ) + print('alarm_names', alarm_names) + print('response', response) + + +def del_dashboards(type, name_prefix): + + print("[delete %s dashboards.]" % type) + + rep = cloudwatch.list_dashboards(DashboardNamePrefix=name_prefix) + + dashboard_names = [] + for m in rep.get('DashboardEntries'): + dashboard_names.append(m.get('DashboardName')) + + response = cloudwatch.delete_dashboards( + DashboardNames=dashboard_names + ) + print('dashboard_names', dashboard_names) + print('response', response) + + +def lambda_handler(event, context): + # TODO implement + + print(event) + detail = event.get('detail', {}) + event_source = detail.get('eventSource') + event_name = detail.get('eventName') + event_response = detail.get('responseElements', {}) + + if event_source == 'ec2.amazonaws.com': + if event_name == 'TerminateInstances': + instances = event_response.get('instancesSet', {}) + for item in instances.get('items', []): + instance_id = item.get('instanceId') + if instance_id: + del_alarm('ec2', 'AWS_EC2_%s' % instance_id) + del_dashboards('ec2', 'AWS_EC2_%s' % 
instance_id)
+
+    elif event_source == 'rds.amazonaws.com':
+        if event_name == 'DeleteDBInstance':
+            dBInstanceIdentifier = event_response.get('dBInstanceIdentifier')
+            engine = event_response.get('engine')
+            if engine == 'mysql':
+                del_alarm('mysql', 'AWS_MYSQL_%s' % dBInstanceIdentifier)
+                del_dashboards('mysql', 'AWS_MYSQL_%s' % dBInstanceIdentifier)
+
+    elif event_source == 'elasticache.amazonaws.com':
+        if event_name == 'DeleteReplicationGroup':
+            group_id = event_response.get('replicationGroupId')
+            del_alarm('redis', 'AWS_REDIS_%s' % group_id)
+            del_dashboards('redis', 'AWS_REDIS_%s' % group_id)
+
+    elif event_source == 'elasticloadbalancing.amazonaws.com':
+        if event_name == 'DeleteLoadBalancer':
+            requestParameters = detail.get('requestParameters', {})
+            loadBalancerArn = requestParameters.get('loadBalancerArn')
+            instance_name = loadBalancerArn.split(':loadbalancer/')[1].split('/')[1]
+            del_alarm('elb', 'AWS_ELB_%s' % instance_name)
+            del_dashboards('elb', 'AWS_ELB_%s' % instance_name)
+
+    return {
+        'statusCode': 200,
+        'body': json.dumps('delete success.')
+    }
\ No newline at end of file
diff --git a/cloud/aws/lambda/wechat.py b/cloud/aws/lambda/wechat.py
new file mode 100644
index 0000000..77faa60
--- /dev/null
+++ b/cloud/aws/lambda/wechat.py
@@ -0,0 +1,77 @@
+import json
+from botocore.vendored import requests
+
+def lambda_handler(event, context):
+    # TODO implement
+    url = "https://qyapi.weixin.qq.com"
+
+    corpid = ""
+    secret = ""
+    agentid = ""
+    touser = ''
+    toparty = ''
+    totag = ''
+
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+    access_token_url = '{url}/cgi-bin/gettoken?corpid={id}&corpsecret={crt}'.format(url=url, id=corpid, crt=secret)
+    access_token_response = requests.get(url=access_token_url, headers=headers)
+    token = json.loads(access_token_response.text)['access_token']
+
+    send_url = '{url}/cgi-bin/message/send?access_token={token}'.format(url=url, token=token)
+    message = event['Records'][0]['Sns']
+    Timestamp = message['Timestamp']
+    Subject = message['Subject']
+    sns_message = json.loads(message['Message'])
+    region = message['TopicArn'].split(':')[-3]
+    state_exclude = ['INSUFFICIENT_DATA']
+
+    if sns_message['OldStateValue'] in state_exclude:
+        return
+
+    if "ALARM" in Subject:
+        title = '[aws] Alarm triggered!!'
+    elif "OK" in Subject:
+        title = '[aws] Alarm recovered'
+    else:
+        title = '[aws]'
+
+    content = title \
+        + "\n> **Details**" \
+        + "\n> Time: " + Timestamp \
+        + "\n> Subject: " + Subject \
+        + "\n> State: {old} => {new}".format(old=sns_message['OldStateValue'], new=sns_message['NewStateValue']) \
+        + "\n> " \
+        + "\n> Region: " + sns_message['Region'] \
+        + "\n> Namespace: " + sns_message['Trigger']['Namespace'] \
+        + "\n> MetricName: " + sns_message['Trigger']['MetricName'] \
+        + "\n> " \
+        + "\n> AlarmName: " + sns_message['AlarmName'] \
+        + "\n> AlarmDescription: " + sns_message['AlarmDescription'] \
+        + "\n> " \
+        + "\n> See details: [Alarm](https://{region}.console.amazonaws.cn/cloudwatch/home?region={region}#s=Alarms&alarm={alarm})".format(region=region, alarm=sns_message['AlarmName'])
+
+    msg = {
+        "msgtype": 'markdown',
+        "agentid": agentid,
+        "markdown": {'content': content},
+        "safe": 0
+    }
+
+    if touser:
+        msg['touser'] = touser
+    if toparty:
+        msg['toparty'] = toparty
+    if totag:
+        msg['totag'] = totag
+
+    response = requests.post(url=send_url, data=json.dumps(msg), headers=headers)
+
+    errcode = json.loads(response.text)['errcode']
+    if errcode == 0:
+        print('Successfully')
+    else:
+        print(response.json())
+        print('Failed')