Skip to content

Instantly share code, notes, and snippets.

@csereno
Created April 19, 2021 18:25
Show Gist options
  • Save csereno/4837e90c71b62c8c706e8a4619ef24b7 to your computer and use it in GitHub Desktop.
Save csereno/4837e90c71b62c8c706e8a4619ef24b7 to your computer and use it in GitHub Desktop.

Revisions

  1. csereno created this gist Apr 19, 2021.
    279 changes: 279 additions & 0 deletions Recover-EC2-CW-Alarm.yaml
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,279 @@
    # CW Alarm Template
    #
    # Author: Chris Sereno
    # Description: This template creates CloudWatch (CW) alarms for an existing EC2 instance; the status-check alarms can also reboot or recover the instance.
    # Instance recovery has prerequisites. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-recover.html
    # This is a personal test template and is NOT a vetted or approved template. Use at your own risk.
    #
    ---
    # Template format version; kept quoted so YAML does not parse it as a date.
    AWSTemplateFormatVersion: "2010-09-09"
    # Human-readable summary of the stack.
    Description: "CloudWatch Alarms to recover EC2 Template"

    # Stack inputs. Each alarm has a Yes/No toggle plus its tuning values
    # (period, evaluation periods, threshold). 'Yes'/'No' stay quoted so YAML
    # does not turn them into booleans.
    Parameters:
      InstanceId:
        Type: AWS::EC2::Instance::Id
        Description: Instance ID of your EC2
        ConstraintDescription: Instance Id of an existing EC2

      # Status check alarm
      StatusCheckFailed:
        Type: String
        Default: 'No'
        Description: 'This check verifies that your instance is reachable'
        AllowedValues:
          - 'Yes'
          - 'No'
      SCAlarmName:
        Description: Name of the Alarm
        Type: String
        Default: "EC2 Status Check Failed"
      SCAlarmDescription:
        Description: Description of the Alarm
        Type: String
        Default: "An EC2 Status Check Failed"
      SCAlarmPeriod:
        Description: The length of a period (in seconds)
        Type: Number
        Default: 60
      SCAlarmEvalPeriods:
        Description: The number of specified periods to evaluate
        Type: Number
        Default: 2
      SCAlarmThreshold:
        Description: The number of failures that have to occur
        Type: Number
        Default: 1

      # System status check alarm
      StatusCheckSystemFailed:
        Type: String
        Default: 'No'
        # The system status check covers the AWS infrastructure hosting the
        # instance, not the guest operating system.
        Description: "This check verifies that the AWS systems your instance runs on are healthy."
        AllowedValues:
          - 'Yes'
          - 'No'
      SCSAlarmName:
        Description: Name of the alarm
        Type: String
        Default: "EC2 System Status Check Failed"
      SCSAlarmDescription:
        Description: "Description of the Alarm"
        Type: String
        Default: "An EC2 System Status Check Failed"
      SCSAlarmPeriod:
        Description: The length of a period (in seconds)
        Type: Number
        Default: 60
      SCSAlarmEvalPeriods:
        Description: The number of specified periods to evaluate
        Type: Number
        Default: 2
      SCSAlarmThreshold:
        Description: The number of failures that have to occur
        Type: Number
        Default: 1

      # High CPU alarm
      HighCPU:
        Type: String
        Default: 'No'
        Description: "This will alarm if the CPU is above the given threshold."
        AllowedValues:
          - 'Yes'
          - 'No'
      HighCPUAlarmPeriod:
        Description: The length of a period (in seconds)
        Type: Number
        Default: 300
      HighCPUAlarmEvalPeriods:
        Description: The number of specified periods to evaluate
        Type: Number
        Default: 2
      HighCPUAlarmThreshold:
        # Compared against the CPUUtilization metric, so this is a percentage.
        Description: The CPU utilization (percent) at or above which the alarm triggers
        Type: Number
        Default: 90

      # High memory alarm
      HighMemory:
        Type: String
        Default: 'No'
        Description: "This will alarm if the memory usage is above the given threshold."
        AllowedValues:
          - 'Yes'
          - 'No'
      HighMemAlarmPeriod:
        Description: The length of a period (in seconds)
        Type: Number
        Default: 300
      HighMemAlarmEvalPeriods:
        Description: The number of specified periods to evaluate
        Type: Number
        Default: 2
      HighMemAlarmThreshold:
        # Compared against the mem_used_percent metric, so this is a percentage.
        Description: The memory usage (percent) at or above which the alarm triggers
        Type: Number
        Default: 90

      # Alarm Actions
      AlarmAction:
        Description: This allows you take an action such as send an email or text alert. (Must be in ARN format)
        Type: String

    # Console presentation only: groups the parameters into labelled sections
    # and gives the four Yes/No toggles friendlier labels.
    Metadata:
      AWS::CloudFormation::Interface:
        ParameterGroups:
          - Label:
              default: "Instance ID"
            Parameters:
              - InstanceId
          - Label:
              default: "Status Check Failed Alarm"
            Parameters:
              - StatusCheckFailed
              - SCAlarmName
              - SCAlarmDescription
              - SCAlarmPeriod
              - SCAlarmEvalPeriods
              - SCAlarmThreshold
          - Label:
              default: "Status Check Failed System Alarm"
            Parameters:
              - StatusCheckSystemFailed
              - SCSAlarmName
              - SCSAlarmDescription
              - SCSAlarmPeriod
              - SCSAlarmEvalPeriods
              - SCSAlarmThreshold
          - Label:
              default: "High CPU Alarm"
            Parameters:
              - HighCPU
              - HighCPUAlarmPeriod
              - HighCPUAlarmEvalPeriods
              - HighCPUAlarmThreshold
          - Label:
              default: "High Memory Alarm"
            Parameters:
              - HighMemory
              - HighMemAlarmPeriod
              - HighMemAlarmEvalPeriods
              - HighMemAlarmThreshold
          - Label:
              default: "Alarm Actions"
            Parameters:
              - AlarmAction

        ParameterLabels:
          StatusCheckFailed:
            default: "Create Status Check Failed Alarm?"
          StatusCheckSystemFailed:
            default: "Create System Status Check Failed Alarm?"
          HighCPU:
            default: "Create High CPU Alarm?"
          # Added so the memory toggle is labelled like the other three.
          HighMemory:
            default: "Create High Memory Alarm?"

    # Each condition is true when the matching "create this alarm?" parameter
    # is set to 'Yes'; the alarm resources reference these by name.
    Conditions:
      StatusAlarm:
        Fn::Equals:
          - Ref: StatusCheckFailed
          - 'Yes'
      SystemStatusAlarm:
        Fn::Equals:
          - Ref: StatusCheckSystemFailed
          - 'Yes'
      HighCPUAlarm:
        Fn::Equals:
          - Ref: HighCPU
          - 'Yes'
      HighMemAlarm:
        Fn::Equals:
          - Ref: HighMemory
          - 'Yes'

    # One CloudWatch alarm per enabled toggle, all scoped to the InstanceId
    # parameter. Period / EvaluationPeriods / Threshold are taken from the
    # corresponding parameters (their defaults equal the values that used to
    # be hard-coded here), and every alarm name carries the instance ID so the
    # template can be deployed for several instances without name collisions.
    Resources:
      # Reboots the instance and notifies AlarmAction when the EC2 status
      # check metric reports a failure.
      StatusCheckFailedAlarm:
        Type: AWS::CloudWatch::Alarm
        Condition: StatusAlarm
        Properties:
          AlarmActions:
            # AWS::Partition keeps the ARN valid outside the standard "aws" partition.
            - !Sub 'arn:${AWS::Partition}:automate:${AWS::Region}:ec2:reboot'
            - !Ref AlarmAction
          AlarmDescription: !Ref SCAlarmDescription
          AlarmName: !Join ['-', [!Ref SCAlarmName, !Ref InstanceId]]
          ComparisonOperator: GreaterThanOrEqualToThreshold
          Dimensions:
            - Name: InstanceId
              Value: !Ref InstanceId
          EvaluationPeriods: !Ref SCAlarmEvalPeriods
          MetricName: StatusCheckFailed
          Namespace: AWS/EC2
          Period: !Ref SCAlarmPeriod
          Statistic: Maximum
          Threshold: !Ref SCAlarmThreshold
          TreatMissingData: missing
          Unit: Count

      # Recovers the instance and notifies AlarmAction when the system status
      # check metric reports a failure.
      StatusCheckSystemFailedAlarm:
        Type: AWS::CloudWatch::Alarm
        Condition: SystemStatusAlarm
        Properties:
          AlarmActions:
            - !Sub 'arn:${AWS::Partition}:automate:${AWS::Region}:ec2:recover'
            - !Ref AlarmAction
          AlarmDescription: !Ref SCSAlarmDescription
          AlarmName: !Join ['-', [!Ref SCSAlarmName, !Ref InstanceId]]
          ComparisonOperator: GreaterThanOrEqualToThreshold
          Dimensions:
            - Name: InstanceId
              Value: !Ref InstanceId
          EvaluationPeriods: !Ref SCSAlarmEvalPeriods
          MetricName: StatusCheckFailed_System
          Namespace: AWS/EC2
          Period: !Ref SCSAlarmPeriod
          Statistic: Maximum
          Threshold: !Ref SCSAlarmThreshold
          TreatMissingData: missing
          Unit: Count

      # Notifies AlarmAction when CPU utilization reaches the threshold.
      HighCPUAlarm:
        Type: AWS::CloudWatch::Alarm
        Condition: HighCPUAlarm
        Properties:
          AlarmActions:
            - !Ref AlarmAction
          AlarmDescription: "High CPU alert on instance"
          AlarmName: !Join ['-', ["High CPU alert on instance", !Ref InstanceId]]
          ComparisonOperator: GreaterThanOrEqualToThreshold
          Dimensions:
            - Name: InstanceId
              Value: !Ref InstanceId
          EvaluationPeriods: !Ref HighCPUAlarmEvalPeriods
          MetricName: CPUUtilization
          Namespace: AWS/EC2
          Period: !Ref HighCPUAlarmPeriod
          Statistic: Maximum
          Threshold: !Ref HighCPUAlarmThreshold
          TreatMissingData: missing
          # CPUUtilization is reported in Percent; a mismatched unit filters
          # out every datapoint and the alarm never leaves INSUFFICIENT_DATA.
          Unit: Percent

      # For memory alarms install the Cloudwatch agent on the EC2
      # https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Install-CloudWatch-Agent.html
      # NOTE(review): the alarm only matches if the agent publishes
      # mem_used_percent with InstanceId as its sole dimension (for example
      # via append_dimensions) - confirm against the agent configuration.
      HighMemAlarm:
        Type: AWS::CloudWatch::Alarm
        Condition: HighMemAlarm
        Properties:
          AlarmActions:
            - !Ref AlarmAction
          AlarmDescription: "High memory alert on instance"
          AlarmName: !Join ['-', ["High memory alert on instance", !Ref InstanceId]]
          ComparisonOperator: GreaterThanOrEqualToThreshold
          Dimensions:
            - Name: InstanceId
              Value: !Ref InstanceId
          EvaluationPeriods: !Ref HighMemAlarmEvalPeriods
          MetricName: mem_used_percent
          # The CloudWatch agent publishes to CWAgent by default, not AWS/EC2;
          # change this if the agent configuration sets a custom namespace.
          Namespace: CWAgent
          Period: !Ref HighMemAlarmPeriod
          Statistic: Maximum
          Threshold: !Ref HighMemAlarmThreshold
          TreatMissingData: missing
          Unit: Percent