"""Generate a CloudFormation template for an ECS cluster on an EC2 AutoScaling group.

The stack contains a VPC with three subnets, an application load balancer,
an ECS cluster/service/task definition, and a Lambda function that drains ECS
container instances when AutoScaling is about to terminate them.

Run as a script to print the template as JSON.
"""
import inspect
import textwrap

from troposphere import (
    AWSHelperFn, Base64, Cidr, Condition, Equals, GetAtt, Join, Not, Output, Parameter, Ref,
    Region, Select, Split, StackName, Sub, Tags, Template,
)
from troposphere.autoscaling import (
    AutoScalingGroup, LaunchTemplateSpecification, LifecycleHookSpecification,
)
from troposphere.awslambda import Code, Function, Permission
from troposphere.ec2 import (
    CreditSpecification, IamInstanceProfile, InternetGateway, LaunchTemplate, LaunchTemplateData,
    Route, RouteTable, SecurityGroup, SecurityGroupRule, Subnet, SubnetRouteTableAssociation, VPC,
    VPCGatewayAttachment,
)
from troposphere.ecs import (
    Cluster, ContainerDefinition, DeploymentConfiguration, LoadBalancer as ServiceLoadBalancer,
    LogConfiguration, PortMapping, Service, TaskDefinition,
)
from troposphere.elasticloadbalancingv2 import (
    Action as ListenerAction, Listener, LoadBalancer, LoadBalancerAttributes, TargetGroup,
    TargetGroupAttribute,
)
from troposphere.iam import InstanceProfile, Policy as NamedPolicy, Role
from troposphere.logs import LogGroup
from troposphere.sns import SubscriptionResource, Topic
from troposphere.policies import (
    AutoScalingReplacingUpdate, CreationPolicy, ResourceSignal, UpdatePolicy,
)

from awacs import autoscaling, ec2, ecs, elasticloadbalancing, route53, s3, sns, sts
from awacs.aws import Action, Allow, Policy, Principal, Statement


def lifecycle_ecs_drain_handler(event, _context):
    """Drain an ECS container instance before AutoScaling terminates it.

    Must stay self-contained (imports and helpers inside the body): its source
    is extracted with ``inspect.getsource`` and shipped as inline Lambda code,
    so keep it small as well.
    """
    import json
    import logging
    import time

    import boto3

    logger = logging.getLogger(__name__)
    logging.root.setLevel(logging.INFO)

    class RetryLater(Exception):
        """The instance is not drained yet; redeliver the message."""

    sns_record = event['Records'][0]['Sns']
    message = json.loads(sns_record['Message'])
    logger.info('Processing message: %s', message)

    if message.get('LifecycleTransition') != 'autoscaling:EC2_INSTANCE_TERMINATING':
        logger.warning('Not an instance termination message, ignoring')
        return

    metadata = json.loads(message.get('NotificationMetadata', '{}'))

    try:
        ecs_client = boto3.client('ecs')
        # Instances tag themselves with this attribute in their user data.
        response = ecs_client.list_container_instances(
            cluster=metadata['Cluster'],
            filter='attribute:ec2-instance-id=={}'.format(message['EC2InstanceId']),
        )
        if not response['containerInstanceArns']:
            logger.warning('EC2 instance %s not in cluster, ignoring', message['EC2InstanceId'])
            return
        container_instance_arn = response['containerInstanceArns'][0]
        logger.info('Container instance ARN: %s', container_instance_arn)

        response = ecs_client.describe_container_instances(
            cluster=metadata['Cluster'],
            containerInstances=[container_instance_arn],
        )
        container_instance = response['containerInstances'][0]

        if container_instance['status'] == 'ACTIVE':
            logger.info('Setting state to DRAINING')
            ecs_client.update_container_instances_state(
                cluster=metadata['Cluster'],
                containerInstances=[container_instance['containerInstanceArn']],
                status='DRAINING',
            )
            raise RetryLater('Container instance state changed to DRAINING')

        if container_instance['runningTasksCount']:
            raise RetryLater('{} tasks running'.format(container_instance['runningTasksCount']))

        logger.info('Instance drained, completing lifecycle action')
        boto3.client('autoscaling').complete_lifecycle_action(
            LifecycleHookName=message['LifecycleHookName'],
            AutoScalingGroupName=message['AutoScalingGroupName'],
            LifecycleActionResult='CONTINUE',
            InstanceId=message['EC2InstanceId'],
        )
        logger.info('Done, AutoScaling will now terminate the instance')
    except RetryLater:
        # Poll by re-sending the same message to the topic that invoked us.
        logger.exception('Retry required; republishing to SNS topic in 10 seconds')
        time.sleep(10)
        boto3.client('sns').publish(
            TopicArn=sns_record['TopicArn'],
            Message=sns_record['Message'],
        )
        logger.info('Republished')


def _service_assume_role_policy(service):
    """Return a trust policy allowing the service principal *service* to assume a role."""
    return Policy(
        Version='2012-10-17',
        Statement=[
            Statement(
                Effect=Allow,
                Principal=Principal('Service', service),
                Action=[sts.AssumeRole],
            ),
        ],
    )


def create_template():
    """Build the CloudFormation template for the ECS-on-EC2 stack.

    Returns:
        The populated troposphere ``Template``; it exposes one output,
        ``LoadBalancerEndpoint``, holding the load balancer DNS name.
    """
    t = Template(
        Description='OMG, CFN 4 ECS on EC2 ASG w/ ALB',
    )

    # ---- Parameters -------------------------------------------------------
    ami_id = t.add_parameter(Parameter(
        'ECSImageParameter',
        Default='/aws/service/ecs/optimized-ami/amazon-linux/recommended/image_id',
        # The SSM-backed parameter type must name the type of the resolved
        # value; the bare 'AWS::SSM::Parameter::Value' is not a valid type.
        Type='AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>',
    ))
    instance_type = t.add_parameter(Parameter(
        'InstanceType',
        Default='t2.small',
        Type='String',
    ))
    instance_count = t.add_parameter(Parameter(
        'InstanceCount',
        Default=3,
        Type='Number',
    ))
    task_count = t.add_parameter(Parameter(
        'TaskCount',
        Default=3,
        Type='Number',
    ))
    docker_image = t.add_parameter(Parameter(
        'DockerImage',
        Default='tutum/hello-world',
        Type='String',
    ))
    log_retention = t.add_parameter(Parameter(
        'LogRetentionDays',
        Default=90,
        Type='Number',
    ))

    # ---- Network ----------------------------------------------------------
    az_suffixes = ['a', 'b', 'c']

    vpc = t.add_resource(VPC(
        'VPC',
        CidrBlock='10.69.0.0/16',
        Tags=Tags(
            Name=StackName,
        ),
    ))
    internet_gateway = t.add_resource(InternetGateway(
        'InternetGateway',
        Tags=Tags(
            Name=StackName,
        ),
    ))
    vpc_gateway_attachment = t.add_resource(VPCGatewayAttachment(
        'VPCGatewayAttachment',
        VpcId=Ref(vpc),
        InternetGatewayId=Ref(internet_gateway),
    ))

    # One subnet, route table and default route per availability zone.
    # The logical IDs say "Private", but each subnet maps public IPs on launch
    # and routes 0.0.0.0/0 through the internet gateway.
    subnets = []
    routes = []
    for idx, az_suffix in enumerate(az_suffixes):
        subnet = t.add_resource(Subnet(
            'PrivateSubnet{}'.format(idx),
            VpcId=Ref(vpc),
            CidrBlock=Select(idx, Cidr(vpc.CidrBlock, 32, 8)),
            AvailabilityZone=Sub('${AWS::Region}${Suffix}', Suffix=az_suffix),
            MapPublicIpOnLaunch=True,
            Tags=Tags(
                Name=StackName,
            ),
        ))
        subnets.append(subnet)

        route_table = t.add_resource(RouteTable(
            'PrivateSubnetRouteTable{}'.format(idx),
            VpcId=Ref(vpc),
            Tags=Tags(
                Name=StackName,
            ),
        ))
        t.add_resource(SubnetRouteTableAssociation(
            'PrivateSubnetRouteAssociation{}'.format(idx),
            SubnetId=Ref(subnet),
            RouteTableId=Ref(route_table),
        ))
        route = t.add_resource(Route(
            'PrivateSubnetInternetRoute{}'.format(idx),
            RouteTableId=Ref(route_table),
            DestinationCidrBlock='0.0.0.0/0',
            GatewayId=Ref(internet_gateway),
            DependsOn=[vpc_gateway_attachment.title],
        ))
        routes.append(route)

    route_titles = [route.title for route in routes]
    subnet_refs = [Ref(subnet) for subnet in subnets]

    # ---- Load balancer ----------------------------------------------------
    target_group = t.add_resource(TargetGroup(
        'LoadBalancerTargetGroup',
        Port=80,
        Protocol='HTTP',
        VpcId=Ref(vpc),
        TargetGroupAttributes=[
            TargetGroupAttribute(
                Key='deregistration_delay.timeout_seconds',
                Value='30',
            ),
        ],
    ))
    load_balancer_security_group = t.add_resource(SecurityGroup(
        'LoadBalancerSecurityGroup',
        GroupDescription=Sub('${AWS::StackName} load balancer'),
        SecurityGroupIngress=[
            SecurityGroupRule(
                IpProtocol='tcp',
                FromPort='80',
                ToPort='80',
                CidrIp='0.0.0.0/0',
            ),
            SecurityGroupRule(
                IpProtocol='tcp',
                FromPort='443',
                ToPort='443',
                CidrIp='0.0.0.0/0',
            ),
        ],
        VpcId=Ref(vpc),
    ))
    load_balancer = t.add_resource(LoadBalancer(
        'LoadBalancer',
        Type='application',
        Scheme='internet-facing',
        LoadBalancerAttributes=[
            LoadBalancerAttributes(
                Key='routing.http2.enabled',
                Value='true',
            ),
        ],
        SecurityGroups=[Ref(load_balancer_security_group)],
        Subnets=subnet_refs,
        DependsOn=route_titles,
    ))
    http_listener = t.add_resource(Listener(
        'LoadBalancerHTTPListener',
        Protocol='HTTP',
        Port=80,
        LoadBalancerArn=Ref(load_balancer),
        DefaultActions=[
            ListenerAction(
                Type='forward',
                TargetGroupArn=Ref(target_group),
            ),
        ],
    ))

    # ---- Cluster and instance identity ------------------------------------
    cluster = t.add_resource(Cluster(
        'Cluster',
    ))
    instance_role = t.add_resource(Role(
        'ECSInstanceRole',
        AssumeRolePolicyDocument=_service_assume_role_policy('ec2.amazonaws.com'),
        ManagedPolicyArns=[
            'arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role',
        ],
    ))
    instance_profile = t.add_resource(InstanceProfile(
        'ECSInstanceProfile',
        Roles=[
            Ref(instance_role),
        ],
    ))

    # ---- Drain hook: SNS topic -> Lambda ----------------------------------
    drain_hook_topic = t.add_resource(Topic(
        'DrainHookInvokeTopic',
    ))
    drain_hook_role = t.add_resource(Role(
        'DrainHookExecutionRole',
        AssumeRolePolicyDocument=_service_assume_role_policy('lambda.amazonaws.com'),
        ManagedPolicyArns=[
            'arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole',
        ],
        Policies=[
            NamedPolicy(
                PolicyName='drain-ecs-policy',
                PolicyDocument=Policy(
                    Version='2012-10-17',
                    Statement=[
                        # The handler republishes to its own topic to retry.
                        Statement(
                            Effect=Allow,
                            Resource=[Ref(drain_hook_topic)],
                            Action=[sns.Publish],
                        ),
                        Statement(
                            Effect=Allow,
                            Resource=['*'],
                            Action=[
                                autoscaling.CompleteLifecycleAction,
                                ecs.DescribeContainerInstances,
                                ecs.ListContainerInstances,
                                ecs.Action('UpdateContainerInstancesState'),
                            ],
                        ),
                    ],
                ),
            ),
        ],
    ))
    drain_hook_function = t.add_resource(Function(
        'DrainHookFunction',
        # NOTE(review): confirm this runtime is still accepted by Lambda.
        Runtime='python3.6',
        Role=GetAtt(drain_hook_role, 'Arn'),
        MemorySize=256,
        Timeout=60,
        Code=Code(
            # The handler's own source text becomes the inline function code.
            ZipFile=inspect.getsource(lifecycle_ecs_drain_handler),
        ),
        Handler='.'.join(('index', lifecycle_ecs_drain_handler.__name__)),
    ))
    drain_hook_log_group = t.add_resource(LogGroup(
        'DrainHookLogGroup',
        LogGroupName=Sub('/aws/lambda/${{{}}}'.format(drain_hook_function.title)),
        RetentionInDays=Ref(log_retention),
    ))
    drain_hook_permission = t.add_resource(Permission(
        'DrainHookInvokePermission',
        Action='lambda:InvokeFunction',
        Principal='sns.amazonaws.com',
        SourceArn=Ref(drain_hook_topic),
        FunctionName=GetAtt(drain_hook_function, 'Arn'),
        DependsOn=[drain_hook_log_group.title],  # prevent invokes before log group is set up
    ))
    drain_hook_subscription = t.add_resource(SubscriptionResource(
        'DrainHookInvokeSubscription',
        Endpoint=GetAtt(drain_hook_function, 'Arn'),
        Protocol='lambda',
        TopicArn=Ref(drain_hook_topic),
        DependsOn=[drain_hook_permission.title],
    ))
    lifecycle_notification_role = t.add_resource(Role(
        'AutoScalingLifecycleNotificationRole',
        AssumeRolePolicyDocument=_service_assume_role_policy('autoscaling.amazonaws.com'),
        ManagedPolicyArns=[
            'arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole',
        ],
    ))

    # ---- Instances --------------------------------------------------------
    instance_security_group = t.add_resource(SecurityGroup(
        'InstanceSecurityGroup',
        GroupDescription=Sub('${AWS::StackName} instance'),
        SecurityGroupIngress=[
            # Instances accept any traffic, but only from the load balancer.
            SecurityGroupRule(
                IpProtocol='-1',
                SourceSecurityGroupId=Ref(load_balancer_security_group),
            ),
        ],
        VpcId=Ref(vpc),
    ))

    CreditSpecification.props['CpuCredits'] = (str, False)  # bugfix
    launch_template = t.add_resource(LaunchTemplate(
        'InstanceLaunchTemplate',
        LaunchTemplateData=LaunchTemplateData(
            CreditSpecification=CreditSpecification(
                CpuCredits='unlimited',
            ),
            ImageId=Ref(ami_id),
            InstanceType=Ref(instance_type),
            SecurityGroupIds=[Ref(instance_security_group)],
            UserData=Base64(Sub(
                # lstrip() removes the newline that follows the opening quotes
                # so the rendered user data begins with the `#!` line.
                textwrap.dedent("""
                    #!/bin/bash
                    echo ECS_CLUSTER=${ClusterName} >> /etc/ecs/ecs.config
                    INSTANCE_ID=$(curl -s 'http://169.254.169.254/latest/meta-data/instance-id')
                    ATTRIBUTES='{"ec2-instance-id": "'$INSTANCE_ID'"}'
                    echo ECS_INSTANCE_ATTRIBUTES=$ATTRIBUTES >> /etc/ecs/ecs.config
                    yum install -y aws-cfn-bootstrap
                    /opt/aws/bin/cfn-signal --success true --stack ${AWS::StackName} \
                        --resource ${AutoScalingGroupLogicalId} --region ${AWS::Region}
                """).lstrip(),
                ClusterName=Ref(cluster),
                AutoScalingGroupLogicalId='AutoScalingGroup',  # circular ref
            )),
            IamInstanceProfile=IamInstanceProfile(
                Arn=GetAtt(instance_profile, 'Arn'),
            ),
        ),
    ))

    t.add_resource(AutoScalingGroup(
        'AutoScalingGroup',
        # Creation waits for the cfn-signal sent at the end of the user data.
        CreationPolicy=CreationPolicy(
            ResourceSignal=ResourceSignal(
                Timeout='PT15M',
            ),
        ),
        UpdatePolicy=UpdatePolicy(
            AutoScalingReplacingUpdate=AutoScalingReplacingUpdate(
                WillReplace=True,
            ),
        ),
        DesiredCapacity=Ref(instance_count),
        MinSize=Ref(instance_count),
        MaxSize=Ref(instance_count),
        VPCZoneIdentifier=subnet_refs,
        LaunchTemplate=LaunchTemplateSpecification(
            LaunchTemplateId=Ref(launch_template),
            Version=GetAtt(launch_template, 'LatestVersionNumber'),
        ),
        LifecycleHookSpecificationList=[
            LifecycleHookSpecification(
                LifecycleHookName='drain-ecs-tasks',
                LifecycleTransition='autoscaling:EC2_INSTANCE_TERMINATING',
                # Read back by the drain handler to find the cluster.
                NotificationMetadata=Sub('{"Cluster":"${Cluster}"}'),
                NotificationTargetARN=Ref(drain_hook_topic),
                RoleARN=GetAtt(lifecycle_notification_role, 'Arn'),
                HeartbeatTimeout=str(30 * 60),
            ),
        ],
        DependsOn=route_titles + [drain_hook_subscription.title],
    ))

    # ---- Task definition and service --------------------------------------
    agent_role = t.add_resource(Role(
        'ECSAgentRole',
        AssumeRolePolicyDocument=_service_assume_role_policy('ecs-tasks.amazonaws.com'),
        ManagedPolicyArns=[
            'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy',
        ],
    ))
    task_role = t.add_resource(Role(
        'ECSTaskRole',
        AssumeRolePolicyDocument=_service_assume_role_policy('ecs-tasks.amazonaws.com'),
        Policies=[
            NamedPolicy(
                PolicyName='s3-bucket-access',
                PolicyDocument=Policy(
                    Version='2012-10-17',
                    Statement=[
                        # NOTE(review): grants every S3 action on every resource.
                        Statement(
                            Effect=Allow,
                            Resource=['*'],
                            Action=[s3.Action('*')],
                        ),
                    ],
                ),
            ),
        ],
    ))
    log_group = t.add_resource(LogGroup(
        'TaskLogGroup',
        LogGroupName=Sub('/${AWS::StackName}/ecs-task'),
        RetentionInDays=Ref(log_retention),
    ))
    task = t.add_resource(TaskDefinition(
        'ClusterTaskDefinition',
        Cpu='256',
        Memory='0.5GB',
        ContainerDefinitions=[
            ContainerDefinition(
                Name='service-container',
                Image=Ref(docker_image),
                PortMappings=[
                    PortMapping(
                        ContainerPort=80,
                    ),
                ],
                LogConfiguration=LogConfiguration(
                    LogDriver='awslogs',
                    Options={
                        'awslogs-group': Ref(log_group),
                        'awslogs-region': Region,
                        'awslogs-stream-prefix': 'ecs',
                    },
                ),
            ),
        ],
        ExecutionRoleArn=GetAtt(agent_role, 'Arn'),
        TaskRoleArn=GetAtt(task_role, 'Arn'),
    ))
    elb_management_role = t.add_resource(Role(
        'ECSLoadBalancerManagementRole',
        AssumeRolePolicyDocument=_service_assume_role_policy('ecs.amazonaws.com'),
        Policies=[
            NamedPolicy(
                PolicyName='copied-AmazonECSServiceRolePolicy',
                PolicyDocument=Policy(
                    Version='2012-10-17',
                    Statement=[
                        Statement(
                            Effect=Allow,
                            Resource=['*'],
                            Action=[
                                ec2.AttachNetworkInterface,
                                ec2.CreateNetworkInterface,
                                ec2.Action('CreateNetworkInterfacePermission'),
                                ec2.DeleteNetworkInterface,
                                ec2.Action('DeleteNetworkInterfacePermission'),
                                ec2.Action('Describe*'),
                                ec2.DetachNetworkInterface,
                                elasticloadbalancing.DeregisterInstancesFromLoadBalancer,
                                elasticloadbalancing.DeregisterTargets,
                                elasticloadbalancing.Action('Describe*'),
                                elasticloadbalancing.RegisterInstancesWithLoadBalancer,
                                elasticloadbalancing.RegisterTargets,
                                route53.ChangeResourceRecordSets,
                                route53.CreateHealthCheck,
                                route53.DeleteHealthCheck,
                                route53.Action('Get*'),
                                route53.Action('List*'),
                                route53.UpdateHealthCheck,
                                Action('servicediscovery', 'DeregisterInstance'),
                                Action('servicediscovery', 'Get*'),
                                Action('servicediscovery', 'List*'),
                                Action('servicediscovery', 'RegisterInstance'),
                                Action('servicediscovery', 'UpdateInstanceCustomHealthStatus'),
                            ],
                        ),
                    ],
                ),
            ),
        ],
    ))
    t.add_resource(Service(
        'ClusterService',
        Cluster=Ref(cluster),
        DeploymentConfiguration=DeploymentConfiguration(
            MinimumHealthyPercent=50,
        ),
        LoadBalancers=[
            ServiceLoadBalancer(
                ContainerName=task.ContainerDefinitions[0].Name,
                ContainerPort=80,
                TargetGroupArn=Ref(target_group),
            ),
        ],
        DesiredCount=Ref(task_count),
        TaskDefinition=Ref(task),
        DependsOn=[http_listener.title],
        Role=GetAtt(elb_management_role, 'Arn'),
    ))

    t.add_output(Output(
        'LoadBalancerEndpoint',
        Value=GetAtt(load_balancer, 'DNSName'),
    ))

    return t


if __name__ == '__main__':
    print(create_template().to_json())