When building the first iteration of the AWS provider for Anzu, I wanted to provide an easy way to deploy containers on AWS ECS. While newer versions of the provider offer low-level resources, back then I believed that one resource could deploy all related dependencies as well.
While I strongly advise avoiding deploying resources to AWS using the SDK (please use literally any other tool that removes the need to write this code), I’ve learned a lot about the cloud primitives that make up the AWS experience, from networking to compute, so I decided that it would still be worthwhile sharing the code for educational reasons (and for me to remember).
Our goal: Running containers on ECS Fargate
In the end, we want to deploy services to AWS ECS running on the Fargate platform to get that juicy isolation, as well as provisioning DNS records to route traffic to the service through an ELB instance (using Cloudflare).
Secondary Goal: Adding rollback capabilities
In case something fails or we want to tear down our resources, we need to provide an easy way to roll back everything we deployed up to that point. As it turns out, some resources may still serve traffic, so we’ll have to add some logic to gracefully drop everything.
Passing credentials
To deploy everything, we’ll need a couple of tokens from AWS and Cloudflare. We’ll store them in the following struct:
// ConfigInputs holds the provider-level credentials supplied by the user.
// All fields are pointers so that "not supplied" (nil) can be distinguished
// from an explicitly empty value.
type ConfigInputs struct {
// AWS static access key pair; both must be set (validated in buildAWSConfig).
AwsAccessKeyId *string `json:"awsAccessKeyId"`
AwsSecretAccessKey *string `json:"awsSecretAccessKey"`
// Optional AWS region override; buildAWSConfig falls back to eu-central-1 when nil.
AwsRegion *string `json:"awsRegion"`
// API token used to manage DNS records through the Cloudflare API.
CloudflareApiToken *string `json:"cloudflareApiToken"`
}
Wiring up our SDKs
To reach our goal, we’ll interact with a lot of different AWS services. We’ll create an internal struct collecting clients for us to use.
// clients bundles every service client the provider talks to, so a single
// value can be passed through the provisioning and rollback code paths.
type clients struct {
iam *iam.Client
elb *elasticloadbalancingv2.Client
ec2 *ec2.Client
acm *acm.Client
ecs *ecs.Client
s3 *s3.Client
sqs *sqs.Client
secretsManager *secretsmanager.Client
batch *batch.Client
// cf is the Cloudflare API client (non-AWS); it is populated separately,
// see CreateCloudflareClient / CreateClients.
cf *cloudflare.API
}
This is followed by helper functions to prepare the clients struct above.
// defaultAWSRegion is used when the provider configuration does not specify
// an explicit region.
const defaultAWSRegion = "eu-central-1"

// buildAWSConfig assembles an aws.Config from the static credentials in the
// provider configuration. It returns an error when either half of the access
// key pair is missing or when the SDK fails to load its default config.
func buildAWSConfig(ctx context.Context, providerConfig *ConfigInputs) (*aws.Config, error) {
	if providerConfig.AwsAccessKeyId == nil || providerConfig.AwsSecretAccessKey == nil {
		return nil, errors.New("awsAccessKeyId and awsSecretAccessKey must be set")
	}
	// Resolve the region up front instead of loading the config with a
	// hard-coded default and patching the field afterwards.
	region := defaultAWSRegion
	if providerConfig.AwsRegion != nil {
		region = *providerConfig.AwsRegion
	}
	keyCredentials := credentials.NewStaticCredentialsProvider(*providerConfig.AwsAccessKeyId, *providerConfig.AwsSecretAccessKey, "")
	cfg, err := config.LoadDefaultConfig(
		ctx,
		config.WithCredentialsProvider(keyCredentials),
		config.WithRegion(region),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to load config: %w", err)
	}
	return &cfg, nil
}
// configureECS returns an ECS client backed by the shared AWS configuration.
func configureECS(ctx context.Context, providerConfig *ConfigInputs) (*ecs.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return ecs.NewFromConfig(*cfg), nil
}
// configureS3 returns an S3 client backed by the shared AWS configuration.
func configureS3(ctx context.Context, providerConfig *ConfigInputs) (*s3.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return s3.NewFromConfig(*cfg), nil
}
// configureSQS returns an SQS client backed by the shared AWS configuration.
func configureSQS(ctx context.Context, providerConfig *ConfigInputs) (*sqs.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return sqs.NewFromConfig(*cfg), nil
}
// configureIAM returns an IAM client backed by the shared AWS configuration.
func configureIAM(ctx context.Context, providerConfig *ConfigInputs) (*iam.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return iam.NewFromConfig(*cfg), nil
}
// configureELB returns an ELBv2 client backed by the shared AWS configuration.
func configureELB(ctx context.Context, providerConfig *ConfigInputs) (*elasticloadbalancingv2.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return elasticloadbalancingv2.NewFromConfig(*cfg), nil
}
// configureEC2 returns an EC2 client backed by the shared AWS configuration.
func configureEC2(ctx context.Context, providerConfig *ConfigInputs) (*ec2.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return ec2.NewFromConfig(*cfg), nil
}
// configureACM returns an ACM client backed by the shared AWS configuration.
func configureACM(ctx context.Context, providerConfig *ConfigInputs) (*acm.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return acm.NewFromConfig(*cfg), nil
}
// configureCloudflare builds a Cloudflare API client from the provider token.
// The context parameter is unused; it exists to match the other configure
// helpers' signatures.
func configureCloudflare(_ context.Context, providerConfig *ConfigInputs) (*cloudflare.API, error) {
	token := providerConfig.CloudflareApiToken
	if token == nil {
		return nil, fmt.Errorf("missing cloudflareApiToken in provider configuration")
	}
	api, err := cloudflare.NewWithAPIToken(*token)
	if err != nil {
		return nil, fmt.Errorf("failed to build cloudflare client: %w", err)
	}
	return api, nil
}
// configureSecretsManager returns a SecretsManager client backed by the
// shared AWS configuration.
func configureSecretsManager(ctx context.Context, providerConfig *ConfigInputs) (*secretsmanager.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return secretsmanager.NewFromConfig(*cfg), nil
}
// configureBatch returns a Batch client backed by the shared AWS configuration.
func configureBatch(ctx context.Context, providerConfig *ConfigInputs) (*batch.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	return batch.NewFromConfig(*cfg), nil
}
// CreateAWSClients builds every AWS service client used by the provider.
//
// The shared aws.Config is resolved exactly once; the previous implementation
// went through one configureX helper per service, each of which re-validated
// the credentials and re-ran config.LoadDefaultConfig — nine times in total.
func CreateAWSClients(ctx context.Context, configInputs *ConfigInputs) (*clients, error) {
	awsConfig, err := buildAWSConfig(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}
	cfg := *awsConfig
	return &clients{
		iam:            iam.NewFromConfig(cfg),
		elb:            elasticloadbalancingv2.NewFromConfig(cfg),
		ec2:            ec2.NewFromConfig(cfg),
		acm:            acm.NewFromConfig(cfg),
		ecs:            ecs.NewFromConfig(cfg),
		s3:             s3.NewFromConfig(cfg),
		sqs:            sqs.NewFromConfig(cfg),
		secretsManager: secretsmanager.NewFromConfig(cfg),
		batch:          batch.NewFromConfig(cfg),
	}, nil
}
// CreateCloudflareClient returns a clients value populated only with the
// Cloudflare API client; the AWS fields are left nil.
func CreateCloudflareClient(ctx context.Context, configInputs *ConfigInputs) (*clients, error) {
	api, err := configureCloudflare(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to configure cloudflare client: %w", err)
	}
	return &clients{cf: api}, nil
}
// CreateClients wires up both the AWS service clients and the Cloudflare
// client into a single clients value.
func CreateClients(ctx context.Context, configInputs *ConfigInputs) (*clients, error) {
	all, err := CreateAWSClients(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to create aws clients: %w", err)
	}
	cfOnly, err := CreateCloudflareClient(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to configure cloudflare client: %w", err)
	}
	all.cf = cfOnly.cf
	return all, nil
}
Tracking created resources
First, we’ll introduce a simple data structure to keep track of our resources
// CreatedResource records a single provisioned resource so it can later be
// deleted again by Rollback. A slice of these acts as a reverse-ordered
// undo log of everything the provider created.
type CreatedResource struct {
// Human-readable purpose of the resource (e.g. "TaskRole"); used for logging only.
Use string `json:"use,omitempty"`
// Generic resource type + id
ResourceType string `json:"resourceType"`
ResourceId string `json:"resourceId"`
// Additional attributes needed for deleting specific resources
EcsClusterArn string `json:"ecsClusterArn,omitempty"`
CfZoneId string `json:"cfZoneId,omitempty"`
IgwVpcId string `json:"igwVpcId,omitempty"`
// rollbackErr holds the deletion error for reporting; unexported, so it is
// never serialized with the resource.
rollbackErr error
}
Graceful deletion
Once we’ve collected all resources, we can simply reverse the list and drop them step by step. In case deletion fails, I’ve decided that it’s still best to keep going to get rid of as many resources as possible. Of course, this may lead to cascading errors when the resource still exists and subsequent ones depend on it being deleted first.
// Rollback deletes the given resources in reverse creation order. Deletion
// failures do not abort the loop: we keep going to remove as many resources
// as possible, collect the failures, log them in a summary, and return a
// single error if anything could not be removed.
func Rollback(ctx context.Context, logger logrus.FieldLogger, created []CreatedResource, clients *clients) error {
	failed := make([]CreatedResource, 0)
	for i := len(created) - 1; i >= 0; i-- {
		res := created[i]
		fields := logger.WithFields(logrus.Fields{
			"type": res.ResourceType,
			"id":   res.ResourceId,
			"use":  res.Use,
		})
		fields.Infoln("Rolling back")
		if err := res.drop(ctx, logger, clients); err != nil {
			// Keep the full resource (including Use) so the summary below is
			// meaningful; previously Use was dropped and always logged empty.
			res.rollbackErr = err
			failed = append(failed, res)
			fields.Errorf("Failed to roll back: %s\n", err.Error())
		}
	}
	if len(failed) > 0 {
		for _, f := range failed {
			logger.
				WithError(f.rollbackErr).
				WithFields(logrus.Fields{
					"type": f.ResourceType,
					"id":   f.ResourceId,
					"use":  f.Use,
				}).
				Errorln("Failed to roll back")
		}
		return fmt.Errorf("failed to rollback some resources")
	}
	return nil
}
To drop resources, we simply provide a method that runs the appropriate logic depending on the resource type. For some resources, we may have to wait a bit, so I added a simple wait loop with a timeout of five minutes respecting the passed context.
// errStillInUse signals that waitUntilReleased gave up after the grace
// period while the resource was still referenced by something else.
var errStillInUse = errors.New("resource still in use after timeout")

// waitUntilReleased polls check every five seconds until it reports true,
// the context is cancelled, or five minutes pass in total.
//
// The deadline channel is created exactly once, before the loop. The previous
// inline selects created a fresh five-minute time.After per iteration, so the
// five-second case always won and the overall timeout could never fire — the
// wait loops could spin forever.
func waitUntilReleased(ctx context.Context, check func() (bool, error)) error {
	deadline := time.After(time.Minute * 5)
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-deadline:
			return errStillInUse
		case <-time.After(time.Second * 5):
		}
		released, err := check()
		if err != nil {
			return err
		}
		if released {
			return nil
		}
	}
}

// drop deletes the resource described by c using the matching service client.
// Resources that can still be referenced by others (security groups, ACM
// certificates, ECS clusters) are polled via waitUntilReleased until they are
// free, with a five-minute timeout. Unknown resource types return an error.
func (c *CreatedResource) drop(ctx context.Context, logger logrus.FieldLogger, clients *clients) error {
	switch c.ResourceType {
	case "iam:role":
		// Managed policies must be detached before the role can be deleted.
		attached, err := clients.iam.ListAttachedRolePolicies(ctx, &iam.ListAttachedRolePoliciesInput{
			RoleName: &c.ResourceId,
		})
		if err != nil {
			return fmt.Errorf("failed to list attached policies for role %s: %w", c.ResourceId, err)
		}
		for _, policy := range attached.AttachedPolicies {
			_, err = clients.iam.DetachRolePolicy(ctx, &iam.DetachRolePolicyInput{
				PolicyArn: policy.PolicyArn,
				RoleName:  &c.ResourceId,
			})
			if err != nil {
				return fmt.Errorf("failed to detach policy %s from role %s: %w", *policy.PolicyArn, c.ResourceId, err)
			}
		}
		_, err = clients.iam.DeleteRole(ctx, &iam.DeleteRoleInput{RoleName: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete iam role: %w", err)
		}
	case "ecs:task-definition":
		_, err := clients.ecs.DeregisterTaskDefinition(ctx, &ecs.DeregisterTaskDefinitionInput{TaskDefinition: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ecs task definition: %w", err)
		}
	case "ec2:vpc":
		_, err := clients.ec2.DeleteVpc(ctx, &ec2.DeleteVpcInput{VpcId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 vpc: %w", err)
		}
	case "ec2:subnet":
		_, err := clients.ec2.DeleteSubnet(ctx, &ec2.DeleteSubnetInput{SubnetId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 subnet: %w", err)
		}
	case "ec2:security-group":
		// Network interfaces are released asynchronously after tasks and load
		// balancers go away; wait until none reference this group anymore.
		err := waitUntilReleased(ctx, func() (bool, error) {
			attached, err := clients.ec2.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{
				Filters: []typesEc2.Filter{
					{
						Name:   aws.String("group-id"),
						Values: []string{c.ResourceId},
					},
				},
			})
			if err != nil {
				return false, fmt.Errorf("failed to describe network interfaces: %w", err)
			}
			for _, networkInterface := range attached.NetworkInterfaces {
				logger.
					WithField("type", networkInterface.InterfaceType).
					WithField("arn", networkInterface.NetworkInterfaceId).
					WithField("sg", c.ResourceId).
					Debugf("security group still attached to network interface")
			}
			return len(attached.NetworkInterfaces) == 0, nil
		})
		if errors.Is(err, errStillInUse) {
			return fmt.Errorf("failed to delete security group %s: security group is still attached to resources", c.ResourceId)
		}
		if err != nil {
			return err
		}
		if _, err := clients.ec2.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{GroupId: &c.ResourceId}); err != nil {
			return fmt.Errorf("failed to delete ec2 security group: %w", err)
		}
	case "elb:load-balancer":
		_, err := clients.elb.DeleteLoadBalancer(ctx, &elasticloadbalancingv2.DeleteLoadBalancerInput{LoadBalancerArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb load balancer: %w", err)
		}
	case "elb:target-group":
		_, err := clients.elb.DeleteTargetGroup(ctx, &elasticloadbalancingv2.DeleteTargetGroupInput{TargetGroupArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb target group: %w", err)
		}
	case "acm:certificate":
		// ACM refuses to delete a certificate while anything (e.g. a listener)
		// still references it, so wait until InUseBy drains.
		err := waitUntilReleased(ctx, func() (bool, error) {
			currentCert, err := clients.acm.DescribeCertificate(ctx, &acm.DescribeCertificateInput{
				CertificateArn: &c.ResourceId,
			})
			if err != nil {
				return false, fmt.Errorf("failed to get certificate: %w", err)
			}
			for _, s := range currentCert.Certificate.InUseBy {
				logger.WithField("arn", s).Debugf("Certificate still being used")
			}
			return len(currentCert.Certificate.InUseBy) == 0, nil
		})
		if errors.Is(err, errStillInUse) {
			return fmt.Errorf("timeout waiting for certificate to be verified")
		}
		if err != nil {
			return err
		}
		if _, err := clients.acm.DeleteCertificate(ctx, &acm.DeleteCertificateInput{CertificateArn: &c.ResourceId}); err != nil {
			return fmt.Errorf("failed to delete acm certificate: %w", err)
		}
	case "elb:listener":
		_, err := clients.elb.DeleteListener(ctx, &elasticloadbalancingv2.DeleteListenerInput{ListenerArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb listener: %w", err)
		}
	case "ecs:cluster":
		// A cluster cannot be deleted while tasks are still running in it.
		err := waitUntilReleased(ctx, func() (bool, error) {
			list, err := clients.ecs.ListTasks(ctx, &ecs.ListTasksInput{
				Cluster: &c.ResourceId,
			})
			if err != nil {
				return false, fmt.Errorf("failed to list tasks: %w", err)
			}
			if len(list.TaskArns) > 0 {
				logger.Debugf("Tasks still not deleted")
				return false, nil
			}
			return true, nil
		})
		if errors.Is(err, errStillInUse) {
			return fmt.Errorf("timeout waiting for cluster to be deleted")
		}
		if err != nil {
			return err
		}
		if _, err := clients.ecs.DeleteCluster(ctx, &ecs.DeleteClusterInput{Cluster: &c.ResourceId}); err != nil {
			return fmt.Errorf("failed to delete ecs cluster: %w", err)
		}
	case "ecs:service":
		_, err := clients.ecs.DeleteService(ctx, &ecs.DeleteServiceInput{Cluster: &c.EcsClusterArn, Service: &c.ResourceId, Force: aws.Bool(true)})
		if err != nil {
			return fmt.Errorf("failed to delete ecs service: %w", err)
		}
	case "cf:record":
		err := clients.cf.DeleteDNSRecord(ctx, c.CfZoneId, c.ResourceId)
		if err != nil {
			return fmt.Errorf("failed to delete cloudflare record: %w", err)
		}
	case "elb:rule":
		_, err := clients.elb.DeleteRule(ctx, &elasticloadbalancingv2.DeleteRuleInput{RuleArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb rule: %w", err)
		}
	case "ec2:internet-gateway":
		// The gateway must be detached from its VPC before deletion.
		_, err := clients.ec2.DetachInternetGateway(ctx, &ec2.DetachInternetGatewayInput{
			InternetGatewayId: &c.ResourceId,
			VpcId:             &c.IgwVpcId,
		})
		if err != nil {
			return fmt.Errorf("failed to detach internet gateway: %w", err)
		}
		_, err = clients.ec2.DeleteInternetGateway(ctx, &ec2.DeleteInternetGatewayInput{InternetGatewayId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 internet gateway: %w", err)
		}
	case "ec2:route-table":
		_, err := clients.ec2.DeleteRouteTable(ctx, &ec2.DeleteRouteTableInput{RouteTableId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 route table: %w", err)
		}
	case "secretsmanager:secret":
		_, err := clients.secretsManager.DeleteSecret(ctx, &secretsmanager.DeleteSecretInput{SecretId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete secrets manager secret: %w", err)
		}
	default:
		return fmt.Errorf("no rollback rule for %s %s", c.ResourceType, c.ResourceId)
	}
	return nil
}
Random suffixes
To generate unique resources in our AWS account (which allows us to provision multiple instances of our setup), we add a random suffix to our resource names.
// randomSuffix returns a five-character random lowercase string used to
// namespace AWS resource names so multiple deployments can coexist in one
// account.
func randomSuffix() string {
	const length = 5
	const allowedChars = "abcdefghijklmnopqrstuvwxyz"
	random := rand.New(rand.NewSource(time.Now().UnixNano()))
	// strings.Builder avoids the quadratic reallocation of += in a loop.
	var b strings.Builder
	b.Grow(length)
	for i := 0; i < length; i++ {
		b.WriteByte(allowedChars[random.Intn(len(allowedChars))])
	}
	return b.String()
}
Loading availability zones
To create subnets later on, we’ll have to know the availability zones we want to deploy those subnets into.
// loadAZs loads up to limit AZs from the given region
// loadAZs returns the names of up to limit available availability zones in
// the given region.
func loadAZs(ctx context.Context, logger logrus.FieldLogger, region string, clients *clients, limit int) ([]string, error) {
	logger.WithField("region", region).Debugf("Loading AZs")
	azs, err := clients.ec2.DescribeAvailabilityZones(ctx, &ec2.DescribeAvailabilityZonesInput{
		Filters: []typesEc2.Filter{
			{Name: aws.String("region-name"), Values: []string{region}},
			{Name: aws.String("state"), Values: []string{"available"}},
		},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to describe availability zones: %w", err)
	}
	// The result was previously hard-coded to make([]string, 3): limit > 3
	// panicked with an index out of range, limit < 3 returned trailing empty
	// strings, and regions with fewer AZs than limit also panicked. Clamp
	// limit to what is actually available and size the slice accordingly.
	if limit > len(azs.AvailabilityZones) {
		limit = len(azs.AvailabilityZones)
	}
	availabilityZones := make([]string, 0, limit)
	for _, az := range azs.AvailabilityZones[:limit] {
		availabilityZones = append(availabilityZones, *az.ZoneName)
	}
	return availabilityZones, nil
}
Creating roles
Up next, we’ll provision all roles needed for AWS ECS.
// createRolesResp carries the ARNs of the two IAM roles created for ECS.
type createRolesResp struct {
// Role assumed by the running task itself (application-level AWS access).
taskRoleArn string
// Role used by ECS to pull images and push logs on the task's behalf.
taskExecutionRoleArn string
}
// createRoles provisions the IAM roles required to run services on ECS:
// it ensures the account-wide AWSServiceRoleForECS role exists, then creates
// a randomly-suffixed task execution role (with the managed
// AmazonECSTaskExecutionRolePolicy attached) and a task role.
// It returns the resources it created (for rollback) and the role ARNs.
//
// NOTE(review): AWSServiceRoleForECS is normally a service-linked role that
// AWS creates via CreateServiceLinkedRole; creating it here with CreateRole
// yields a regular role of the same name — confirm this is intended.
// NOTE(review): the AWSServiceRoleForECS role is deliberately (?) not added
// to createdResources, so rollback never deletes the shared role — confirm.
func createRoles(ctx context.Context, logger logrus.FieldLogger, clients *clients) ([]CreatedResource, *createRolesResp, error) {
createdResources := make([]CreatedResource, 0)
logger.Debugf("Ensuring AWSServiceRoleForECS exists")
// Check if AWSServiceRoleForECS exists (so AWS ECS can interact with other AWS services)
// This is usually created by AWS ECS when using it for the first time, but it might not exist
_, err := clients.iam.GetRole(ctx, &iam.GetRoleInput{RoleName: aws.String("AWSServiceRoleForECS")})
if err != nil {
// Check if the error is a not found error; any other failure is fatal.
var roleNotFound *iamTypes.NoSuchEntityException
if !errors.As(err, &roleNotFound) {
return nil, nil, fmt.Errorf("failed to get role: %w", err)
}
logger.Debugf("AWSServiceRoleForECS does not exist, creating")
// Create the role, allowing the ECS service principal to assume it.
_, err = clients.iam.CreateRole(ctx, &iam.CreateRoleInput{
AssumeRolePolicyDocument: aws.String(`{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ecs.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}`),
RoleName: aws.String("AWSServiceRoleForECS"),
})
if err != nil {
return nil, nil, fmt.Errorf("failed to create AWSServiceRoleForECS role: %w", err)
}
}
// Trust policy shared by both roles below: ECS tasks may assume them.
assumeRolePolicy := `{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}`
logger.Debugf("Creating ECS task execution rule")
// Create new namespaced role for execution (to pull images and push logs)
createdExecutionRole, err := clients.iam.CreateRole(ctx, &iam.CreateRoleInput{
RoleName: aws.String(fmt.Sprintf("ecsTaskExecutionRole-%s", randomSuffix())),
AssumeRolePolicyDocument: &assumeRolePolicy,
})
if err != nil {
return createdResources, nil, fmt.Errorf("failed to create role: %w", err)
}
// Track the role so Rollback can delete it again.
createdResources = append(createdResources, CreatedResource{
Use: "TaskExecutionRole",
ResourceType: "iam:role",
ResourceId: *createdExecutionRole.Role.RoleName,
})
// Attach managed ECS task execution role policy
managedECSPolicy := "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
_, err = clients.iam.AttachRolePolicy(ctx, &iam.AttachRolePolicyInput{
PolicyArn: aws.String(managedECSPolicy),
RoleName: createdExecutionRole.Role.RoleName,
})
if err != nil {
return createdResources, nil, fmt.Errorf("failed to attach role policy: %w", err)
}
logger.Debugf("Creating ECS task role")
// Create a new namespaced role for the task to assume (to access AWS services with specific permissions from inside Fargate)
createdTaskRole, err := clients.iam.CreateRole(ctx, &iam.CreateRoleInput{
RoleName: aws.String(fmt.Sprintf("ecsTaskRole-%s", randomSuffix())),
AssumeRolePolicyDocument: &assumeRolePolicy,
})
if err != nil {
return createdResources, nil, fmt.Errorf("failed to create role: %w", err)
}
createdResources = append(createdResources, CreatedResource{
Use: "TaskRole",
ResourceType: "iam:role",
ResourceId: *createdTaskRole.Role.RoleName,
})
return createdResources, &createRolesResp{
taskRoleArn: *createdTaskRole.Role.Arn,
taskExecutionRoleArn: *createdExecutionRole.Role.Arn,
}, nil
}
Storing secrets for private image registries
This is an extra to support fetching images from private registries: We’ll simply store the credentials in SecretsManager and provide the secret ARN to our container definition later on.
// createPrivateRegistryAuthArgs bundles the inputs for createPrivateRegistryAuth.
type createPrivateRegistryAuthArgs struct {
// Name of the ECS task execution role that needs read access to the secret.
taskExecutionRoleName string
// Registry credentials to store in SecretsManager.
userName string
password string
// When non-empty, an existing secret to update instead of creating a new one.
privateRegistrySecretArn string
}
// createPrivateRegistryAuthResp is the result of createPrivateRegistryAuth.
type createPrivateRegistryAuthResp struct {
// ARN of the SecretsManager secret holding the registry credentials.
registryAuthSecretArn string
}
// createPrivateRegistryAuth stores private container-registry credentials in
// SecretsManager and grants the ECS task execution role permission to read
// them. When an existing secret ARN is supplied it is updated in place;
// otherwise a new secret plus an access policy are created, and the policy is
// attached to the task execution role.
// Returns the created resources (for rollback) and the secret ARN.
func createPrivateRegistryAuth(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createPrivateRegistryAuthArgs) ([]CreatedResource, *createPrivateRegistryAuthResp, error) {
	createdResources := make([]CreatedResource, 0)
	if args.privateRegistrySecretArn != "" {
		// Update secret in place; policy and role attachment already exist.
		logger.Debugf("Updating registry auth secret")
		// NOTE(review): credentials containing `"` or `\` produce invalid
		// JSON here — consider json.Marshal for the secret payload.
		_, err := clients.secretsManager.UpdateSecret(ctx, &secretsmanager.UpdateSecretInput{
			SecretId:     aws.String(args.privateRegistrySecretArn),
			SecretString: aws.String(fmt.Sprintf("{\"username\": \"%s\", \"password\": \"%s\"}", args.userName, args.password)),
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to update registry auth secret: %w", err)
		}
		return createdResources, &createPrivateRegistryAuthResp{
			registryAuthSecretArn: args.privateRegistrySecretArn,
		}, nil
	}
	// Create secret holding username and password
	logger.Debugf("Creating registry auth secret")
	createdSecret, err := clients.secretsManager.CreateSecret(ctx, &secretsmanager.CreateSecretInput{
		Name:         aws.String(fmt.Sprintf("ecs-registry-auth-%s", randomSuffix())),
		SecretString: aws.String(fmt.Sprintf("{\"username\": \"%s\", \"password\": \"%s\"}", args.userName, args.password)),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create registry auth secret: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		Use:          "PrivateRegistryCredentials",
		ResourceType: "secretsmanager:secret",
		ResourceId:   *createdSecret.ARN,
	})
	// Create policy allowing the execution role to read the secret.
	// The previous policy document had a trailing comma after the resource
	// ARN, which is invalid JSON and rejected by IAM (MalformedPolicyDocument).
	logger.Debugf("Creating private registry policy")
	createdPolicy, err := clients.iam.CreatePolicy(ctx, &iam.CreatePolicyInput{
		PolicyDocument: aws.String(fmt.Sprintf(`{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"secretsmanager:GetSecretValue"
],
"Resource": [
"%s"
]
}
]
}
`, *createdSecret.ARN)),
		PolicyName: aws.String(fmt.Sprintf("privateRegistryAuthPolicy-%s", randomSuffix())),
	})
	if err != nil {
		// Return the resources created so far so the caller can roll back the
		// secret (previously nil was returned here, losing the undo log).
		return createdResources, nil, fmt.Errorf("failed to create policy: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		Use:          "PrivateRegistryCredentials",
		ResourceType: "iam:policy",
		ResourceId:   *createdPolicy.Policy.PolicyName,
	})
	// Attach the policy to the task execution role
	logger.Debugf("Attaching policy to task execution role")
	_, err = clients.iam.AttachRolePolicy(ctx, &iam.AttachRolePolicyInput{
		PolicyArn: aws.String(*createdPolicy.Policy.Arn),
		RoleName:  aws.String(args.taskExecutionRoleName),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to attach policy to task execution role: %w", err)
	}
	return createdResources, &createPrivateRegistryAuthResp{
		registryAuthSecretArn: *createdSecret.ARN,
	}, nil
}
Preparing the network
Now it gets interesting! To expose our services, we’ll have to create public subnets. This means we either need to run an expensive NAT gateway, or we simply allocate public IP addresses in our relevant subnets.
// createNetworkingResp collects the identifiers produced by createNetworking.
type createNetworkingResp struct {
// IDs of the public subnets, one per requested availability zone.
subnetIds []string
// Security group IDs for ECS tasks and for the load balancer, respectively.
ecsSecurityGroupIds []string
lbSecurityGroupIds []string
// ID of the VPC everything above lives in.
vpcId string
}
// createNetworking creates a VPC, public subnets for every availability zone,
// and security groups for traffic to the load balancer as well as traffic of
// ECS tasks. Subnets are made public by routing the VPC's main route table to
// a new internet gateway and auto-assigning public IPv4 addresses (avoiding
// the cost of a NAT gateway). Returns the created resources (for rollback)
// and the IDs needed by later provisioning steps.
func createNetworking(ctx context.Context, logger logrus.FieldLogger, clients *clients, vpcName string, availabilityZones []string) ([]CreatedResource, *createNetworkingResp, error) {
	createdResources := make([]CreatedResource, 0)
	logger.Debugf("Creating VPC")
	// https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Subnets.html
	vpcCIDR := "10.0.0.0/16"
	createdVpc, err := clients.ec2.CreateVpc(ctx, &ec2.CreateVpcInput{
		CidrBlock: aws.String(vpcCIDR),
		TagSpecifications: []typesEc2.TagSpecification{
			{
				ResourceType: typesEc2.ResourceTypeVpc,
				Tags: []typesEc2.Tag{
					{
						Key:   aws.String("Name"),
						Value: aws.String(vpcName),
					},
				},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create vpc: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:vpc",
		ResourceId:   *createdVpc.Vpc.VpcId,
	})
	logger.Debugf("Creating internet gateway")
	// https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Internet_Gateway.html
	createdIgw, err := clients.ec2.CreateInternetGateway(ctx, &ec2.CreateInternetGatewayInput{
		TagSpecifications: []typesEc2.TagSpecification{
			{
				ResourceType: typesEc2.ResourceTypeInternetGateway,
				Tags: []typesEc2.Tag{
					{
						Key:   aws.String("Name"),
						Value: aws.String(fmt.Sprintf("%s-igw", vpcName)),
					},
				},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create internet gateway: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:internet-gateway",
		ResourceId:   *createdIgw.InternetGateway.InternetGatewayId,
		// The VPC ID is needed to detach the gateway before deletion.
		IgwVpcId: *createdVpc.Vpc.VpcId,
	})
	logger.Debugf("Attaching internet gateway to VPC")
	_, err = clients.ec2.AttachInternetGateway(ctx, &ec2.AttachInternetGatewayInput{
		InternetGatewayId: createdIgw.InternetGateway.InternetGatewayId,
		VpcId:             createdVpc.Vpc.VpcId,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to attach internet gateway: %w", err)
	}
	// Get the VPC's main route table (created implicitly with the VPC).
	routeTables, err := clients.ec2.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{
		Filters: []typesEc2.Filter{
			{
				Name:   aws.String("vpc-id"),
				Values: []string{*createdVpc.Vpc.VpcId},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to describe route tables: %w", err)
	}
	// Guard against an empty result instead of panicking on index 0.
	if len(routeTables.RouteTables) == 0 {
		return createdResources, nil, fmt.Errorf("no route table found for vpc %s", *createdVpc.Vpc.VpcId)
	}
	mainRouteTable := routeTables.RouteTables[0]
	logger.Debugf("Creating route to internet")
	// Create IPv4 route to internet
	// https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Route_Tables.html#route-table-routes
	// https://docs.aws.amazon.com/vpc/latest/userguide/route-table-options.html#route-tables-internet-gateway
	_, err = clients.ec2.CreateRoute(ctx, &ec2.CreateRouteInput{
		RouteTableId:         mainRouteTable.RouteTableId,
		DestinationCidrBlock: aws.String("0.0.0.0/0"),
		GatewayId:            createdIgw.InternetGateway.InternetGatewayId,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create route to internet: %w", err)
	}
	// Create IPv6 route to internet. This is best-effort: the VPC is created
	// above without an IPv6 CIDR block, so this call may legitimately fail.
	// The previous code silently ignored the error; surface it at debug level.
	_, err = clients.ec2.CreateRoute(ctx, &ec2.CreateRouteInput{
		RouteTableId:             mainRouteTable.RouteTableId,
		DestinationIpv6CidrBlock: aws.String("::/0"),
		GatewayId:                createdIgw.InternetGateway.InternetGatewayId,
	})
	if err != nil {
		logger.WithError(err).Debugf("failed to create IPv6 route to internet")
	}
	// Create subnets for each AZ
	subnetIds := make([]string, 0, len(availabilityZones))
	for i, zone := range availabilityZones {
		// Start with 10.0.1.0/24 [10.0.1.1;10.0.1.254]
		cidr := fmt.Sprintf("10.0.%d.0/24", i+1)
		logger.WithField("cidr", cidr).WithField("zone", zone).Debugf("Creating subnet")
		createdSubnet, err := clients.ec2.CreateSubnet(ctx, &ec2.CreateSubnetInput{
			VpcId:            createdVpc.Vpc.VpcId,
			AvailabilityZone: &zone,
			CidrBlock:        aws.String(cidr),
			TagSpecifications: []typesEc2.TagSpecification{
				{
					ResourceType: typesEc2.ResourceTypeSubnet,
					Tags: []typesEc2.Tag{
						{
							Key:   aws.String("Name"),
							Value: aws.String(fmt.Sprintf("%s-%s", vpcName, zone)),
						},
					},
				},
			},
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to create subnet: %w", err)
		}
		// Auto-assign public IPv4 address to the subnet
		_, err = clients.ec2.ModifySubnetAttribute(ctx, &ec2.ModifySubnetAttributeInput{
			SubnetId: createdSubnet.Subnet.SubnetId,
			MapPublicIpOnLaunch: &typesEc2.AttributeBooleanValue{
				Value: aws.Bool(true),
			},
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to assign public IPv4 address to subnet: %w", err)
		}
		createdResources = append(createdResources, CreatedResource{
			ResourceType: "ec2:subnet",
			ResourceId:   *createdSubnet.Subnet.SubnetId,
		})
		subnetIds = append(subnetIds, *createdSubnet.Subnet.SubnetId)
	}
	logger.Debugf("Creating security group to LB")
	securityGroupToLB, err := clients.ec2.CreateSecurityGroup(ctx, &ec2.CreateSecurityGroupInput{
		GroupName:   aws.String(fmt.Sprintf("%s-to-lb", vpcName)),
		Description: aws.String("Security group for load balancer"),
		VpcId:       createdVpc.Vpc.VpcId,
		TagSpecifications: []typesEc2.TagSpecification{
			{
				ResourceType: typesEc2.ResourceTypeSecurityGroup,
				Tags: []typesEc2.Tag{
					{
						Key:   aws.String("Name"),
						Value: aws.String(fmt.Sprintf("%s-to-lb", vpcName)),
					},
				},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create security group to LB: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:security-group",
		ResourceId:   *securityGroupToLB.GroupId,
	})
	// Allow all HTTP/HTTPS ingress traffic from IPv4/IPv6
	// (IpProtocol "6" is TCP by protocol number).
	_, err = clients.ec2.AuthorizeSecurityGroupIngress(ctx, &ec2.AuthorizeSecurityGroupIngressInput{
		GroupId: securityGroupToLB.GroupId,
		IpPermissions: []typesEc2.IpPermission{
			{
				IpProtocol: aws.String("6"),
				IpRanges: []typesEc2.IpRange{
					{
						CidrIp: aws.String("0.0.0.0/0"),
					},
				},
				FromPort: aws.Int32(80),
				ToPort:   aws.Int32(80),
			},
			{
				IpProtocol: aws.String("6"),
				Ipv6Ranges: []typesEc2.Ipv6Range{
					{
						CidrIpv6: aws.String("::/0"),
					},
				},
				FromPort: aws.Int32(80),
				ToPort:   aws.Int32(80),
			},
			{
				IpProtocol: aws.String("6"),
				IpRanges: []typesEc2.IpRange{
					{
						CidrIp: aws.String("0.0.0.0/0"),
					},
				},
				FromPort: aws.Int32(443),
				ToPort:   aws.Int32(443),
			},
			{
				IpProtocol: aws.String("6"),
				Ipv6Ranges: []typesEc2.Ipv6Range{
					{
						CidrIpv6: aws.String("::/0"),
					},
				},
				FromPort: aws.Int32(443),
				ToPort:   aws.Int32(443),
			},
		},
	})
	if err != nil {
		// Message fixed: this is an ingress authorization, not egress.
		return createdResources, nil, fmt.Errorf("failed to authorize security group ingress: %w", err)
	}
	logger.Debugf("Creating security group for ECS")
	// Allow all outgoing traffic
	securityGroupECS, err := clients.ec2.CreateSecurityGroup(ctx, &ec2.CreateSecurityGroupInput{
		GroupName: aws.String(fmt.Sprintf("ecsSecurityGroup-%s", randomSuffix())),
		VpcId:     createdVpc.Vpc.VpcId,
		Description: aws.String(
			"Security group for ECS cluster",
		),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create security group for ECS: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:security-group",
		ResourceId:   *securityGroupECS.GroupId,
	})
	// Allow incoming traffic from same VPC (LB -> ECS)
	_, err = clients.ec2.AuthorizeSecurityGroupIngress(ctx, &ec2.AuthorizeSecurityGroupIngressInput{
		GroupId: securityGroupECS.GroupId,
		IpPermissions: []typesEc2.IpPermission{
			{
				FromPort:   aws.Int32(0),
				ToPort:     aws.Int32(65535),
				IpProtocol: aws.String("6"),
				IpRanges: []typesEc2.IpRange{
					{
						CidrIp:      aws.String(vpcCIDR),
						Description: aws.String("VPC CIDR"),
					},
				},
			},
		},
	})
	// This error was previously assigned but never checked, silently masking
	// a security group that would block LB -> ECS traffic.
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to authorize security group ingress for ECS: %w", err)
	}
	return createdResources, &createNetworkingResp{
		subnetIds:           subnetIds,
		lbSecurityGroupIds:  []string{*securityGroupToLB.GroupId},
		ecsSecurityGroupIds: []string{*securityGroupECS.GroupId},
		vpcId:               *createdVpc.Vpc.VpcId,
	}, nil
}
Provisioning certificates
To serve our traffic securely, we’ll provision public TLS certificates using AWS Certificate Manager. These certificates come entirely for free, which is great. We’ll wait until the certificate is properly provisioned (including the validation step), as our load balancer will not accept pending certificates.
// createCertificateArgs bundles the inputs for createCertificate.
type createCertificateArgs struct {
// certificateDomain is the fully-qualified domain name the ACM certificate is requested for.
certificateDomain string
// zoneId is the Cloudflare zone in which DNS validation records are created.
zoneId string
}
// createCertificateResp carries the outcome of a successful createCertificate call.
type createCertificateResp struct {
// certificateArn identifies the issued ACM certificate.
certificateArn string
}
// CreateRecord upserts a Cloudflare DNS record in the given zone: when a
// record with the same name and type already exists it is updated in place,
// otherwise a fresh record is created. The touched record is returned as a
// CreatedResource so callers can roll it back later.
func CreateRecord(ctx context.Context, logger logrus.FieldLogger, c *clients, zoneId string, kind, fullNameIncludingDomain, content string) ([]CreatedResource, error) {
	touched := make([]CreatedResource, 0)
	// Remove trailing dot as Cloudflare doesn't save it
	if trimmed := strings.TrimSuffix(fullNameIncludingDomain, "."); trimmed != fullNameIncludingDomain {
		logger.WithField("previous", fullNameIncludingDomain).Debugln("Removing trailing dot from record name")
		fullNameIncludingDomain = trimmed
	}
	existing, err := c.cf.DNSRecords(ctx, zoneId, cloudflare.DNSRecord{
		Name: fullNameIncludingDomain,
		Type: kind,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to get existing record: %w", err)
	}
	record := cloudflare.DNSRecord{
		Name:    fullNameIncludingDomain,
		Type:    kind,
		Content: content,
		// TTL in seconds
		TTL: 60,
	}
	recordLogger := logger.WithFields(logrus.Fields{
		"zoneId":  zoneId,
		"type":    kind,
		"name":    fullNameIncludingDomain,
		"content": content,
	})
	if len(existing) > 0 {
		// A matching record exists: update it rather than creating a duplicate.
		recordLogger.WithField("recordId", existing[0].ID).Debugln("Updating existing DNS record")
		if err := c.cf.UpdateDNSRecord(ctx, zoneId, existing[0].ID, record); err != nil {
			return touched, fmt.Errorf("failed to update record: %w", err)
		}
		touched = append(touched, CreatedResource{
			ResourceType: "cf:record",
			ResourceId:   existing[0].ID,
			CfZoneId:     zoneId,
		})
		return touched, nil
	}
	recordLogger.Debugln("Creating DNS record")
	created, err := c.cf.CreateDNSRecord(ctx, zoneId, record)
	if err != nil {
		return nil, fmt.Errorf("failed to create record: %w", err)
	}
	touched = append(touched, CreatedResource{
		ResourceType: "cf:record",
		ResourceId:   created.Result.ID,
		CfZoneId:     zoneId,
	})
	return touched, nil
}
// createCertificate requests a public TLS certificate from ACM for
// args.certificateDomain, creates the required DNS validation records in the
// Cloudflare zone args.zoneId and blocks until the certificate is issued.
// Every resource created along the way is returned even on error, so the
// caller can roll the deployment back.
func createCertificate(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createCertificateArgs) ([]CreatedResource, *createCertificateResp, error) {
	createdResources := make([]CreatedResource, 0)
	logger.WithField("domain", args.certificateDomain).Debugf("Requesting certificate")
	createdCert, err := clients.acm.RequestCertificate(ctx, &acm.RequestCertificateInput{
		DomainName:       aws.String(args.certificateDomain),
		ValidationMethod: typesAcm.ValidationMethodDns,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to request certificate: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "acm:certificate",
		ResourceId:   *createdCert.CertificateArn,
	})
	// Wait and refresh the certificate until a validation resource record is
	// present; this might take a couple of seconds.
	// The deadline channel is armed ONCE before the loop: a time.After created
	// inside the select would be re-created (and therefore reset) on every
	// iteration, so the five-minute timeout could never fire.
	var certificate *acm.DescribeCertificateOutput
	recordDeadline := time.After(time.Minute * 5)
	for {
		select {
		case <-ctx.Done():
			return createdResources, nil, fmt.Errorf("context canceled while waiting for certificate resource record: %w", ctx.Err())
		case <-recordDeadline:
			// ctx is still live here, so ctx.Err() would be nil — don't wrap it.
			return createdResources, nil, fmt.Errorf("certificate did not provide resource record in time")
		case <-time.After(time.Second * 5):
			// Poll again below.
		}
		certificate, err = clients.acm.DescribeCertificate(ctx, &acm.DescribeCertificateInput{
			CertificateArn: createdCert.CertificateArn,
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to describe certificate: %w", err)
		}
		// Sometimes ACM caches verification statuses, so the certificate might be completely issued without a need for verification
		if certificate.Certificate.Status == typesAcm.CertificateStatusIssued {
			break
		}
		// Otherwise, we'd expect the certificate to be in the process of being verified
		if certificate.Certificate.Status != typesAcm.CertificateStatusPendingValidation {
			return createdResources, nil, fmt.Errorf("certificate status is not pending validation: %s", certificate.Certificate.Status)
		}
		// NOTE(review): this compares the validation record NAME (usually of the
		// form _token.<domain>.) against the certificate domain — confirm this
		// match is ever true; DomainValidationOption.DomainName may be intended.
		var hasResourceRecord bool
		for _, validationOption := range certificate.Certificate.DomainValidationOptions {
			if validationOption.ValidationMethod == typesAcm.ValidationMethodDns && validationOption.ResourceRecord != nil && *validationOption.ResourceRecord.Name == args.certificateDomain {
				hasResourceRecord = true
				break
			}
		}
		if hasResourceRecord {
			break
		}
		logger.Debugf("Waiting for certificate to be ready")
	}
	// If certificate is issued, return early
	if certificate.Certificate.Status == typesAcm.CertificateStatusIssued {
		return createdResources, &createCertificateResp{
			certificateArn: *createdCert.CertificateArn,
		}, nil
	}
	// Otherwise verify certificate by creating the DNS records ACM asks for
	logger.WithField("validationOptions", len(certificate.Certificate.DomainValidationOptions)).Debugf("Creating records for domain validation")
	for _, option := range certificate.Certificate.DomainValidationOptions {
		if option.ValidationMethod != typesAcm.ValidationMethodDns || option.ResourceRecord == nil {
			continue
		}
		// Create Cloudflare Record
		created, err := CreateRecord(ctx, logger, clients, args.zoneId, "CNAME", *option.ResourceRecord.Name, *option.ResourceRecord.Value)
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to create cname: %w", err)
		}
		createdResources = append(createdResources, created...)
	}
	// Wait until certificate is verified; again, arm the deadline once so the
	// timeout actually takes effect.
	issueDeadline := time.After(time.Minute * 5)
	for {
		select {
		case <-ctx.Done():
			return createdResources, nil, fmt.Errorf("context cancelled")
		case <-issueDeadline:
			return createdResources, nil, fmt.Errorf("timeout waiting for certificate to be verified")
		case <-time.After(time.Second * 5):
		}
		currentCert, err := clients.acm.DescribeCertificate(ctx, &acm.DescribeCertificateInput{
			CertificateArn: createdCert.CertificateArn,
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to get certificate: %w", err)
		}
		if currentCert.Certificate.Status == typesAcm.CertificateStatusIssued {
			break
		}
		logger.Debugf("Waiting for certificate to be verified")
	}
	return createdResources, &createCertificateResp{
		certificateArn: *createdCert.CertificateArn,
	}, nil
}
Setting up Load Balancing
Next, we’ll expose our service to the internet through an Application Load Balancer instance. This terminates TLS using our provisioned certificate and routes all traffic to an available container (determined by using the /healthz
health check endpoint).
// createLoadBalancingResp carries the identifiers produced by createLoadBalancing.
type createLoadBalancingResp struct {
// targetGroupArn is attached to the ECS service so tasks register as targets.
targetGroupArn string
// loadBalancerArn identifies the created Application Load Balancer.
loadBalancerArn string
}
// loadBalancingArgs bundles the inputs for createLoadBalancing.
type loadBalancingArgs struct {
serviceName string
// certificateArn is the issued ACM certificate used to terminate TLS.
certificateArn string
vpcId string
// serviceHostname is the public hostname traffic is routed for (host-header rule + CNAME).
serviceHostname string
// lbSecurityGroupIds are attached to the load balancer itself.
lbSecurityGroupIds []string
// subnetIds are the subnets the load balancer is placed into.
subnetIds []string
// cloudflareZoneId is the zone in which the service CNAME is created.
cloudflareZoneId string
}
// createLoadBalancing provisions the public entry point for the service: an
// internet-facing Application Load Balancer, a Cloudflare CNAME pointing at
// it, a target group with /healthz health checks, an HTTPS listener that
// terminates TLS with the given certificate, an HTTP listener that redirects
// to HTTPS, and a host-header rule forwarding matching traffic to the target
// group. All created resources are returned (even on error) for rollback.
func createLoadBalancing(ctx context.Context, logger logrus.FieldLogger, clients *clients, args loadBalancingArgs) ([]CreatedResource, *createLoadBalancingResp, error) {
	createdResources := make([]CreatedResource, 0)
	logger.Debugf("Creating load balancer")
	// Create internet-facing IPv4 load balancer
	// TODO Check if we can get a dualstack Load Balancer running (still haven't given up on IPv6)
	createdLBs, err := clients.elb.CreateLoadBalancer(ctx, &elasticloadbalancingv2.CreateLoadBalancerInput{
		Name:           aws.String(args.serviceName),
		IpAddressType:  elbTypes.IpAddressTypeIpv4,
		Scheme:         elbTypes.LoadBalancerSchemeEnumInternetFacing,
		SecurityGroups: args.lbSecurityGroupIds,
		Subnets:        args.subnetIds,
		Type:           elbTypes.LoadBalancerTypeEnumApplication,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create load balancer: %w", err)
	}
	createdLB := createdLBs.LoadBalancers[0]
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:load-balancer",
		ResourceId:   *createdLB.LoadBalancerArn,
	})
	logger.Debugf("Creating record for load balancer")
	// Create record to point traffic to load balancer (will use one of the possible availability zones/subnets randomly)
	created, err := CreateRecord(ctx, logger, clients, args.cloudflareZoneId, "CNAME", args.serviceHostname, *createdLB.DNSName)
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create cname: %w", err)
	}
	createdResources = append(createdResources, created...)
	logger.Debugf("Creating target group")
	// Target group includes all deployed services that could handle traffic
	createdTargetGroup, err := clients.elb.CreateTargetGroup(ctx, &elasticloadbalancingv2.CreateTargetGroupInput{
		Name:       aws.String(args.serviceName),
		TargetType: elbTypes.TargetTypeEnumIp,
		// Health check allows to dynamically deregister unhealthy services from accepting traffic
		HealthCheckPath:       aws.String("/healthz"),
		HealthyThresholdCount: aws.Int32(2),
		// Port is completely ignored for IP target type / ECS
		Port:            aws.Int32(80),
		Protocol:        elbTypes.ProtocolEnumHttp,
		ProtocolVersion: aws.String("HTTP1"),
		VpcId:           aws.String(args.vpcId),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create target group: %w", err)
	}
	targetGroup := createdTargetGroup.TargetGroups[0]
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:target-group",
		ResourceId:   *targetGroup.TargetGroupArn,
	})
	logger.Debugf("Creating HTTPS listener")
	// Accept HTTPS traffic
	createdHTTPSListener, err := clients.elb.CreateListener(ctx, &elasticloadbalancingv2.CreateListenerInput{
		DefaultActions: []elbTypes.Action{
			// When a request comes in and isn't caught by another rule, serve fallback 404
			{
				Type: elbTypes.ActionTypeEnumFixedResponse,
				FixedResponseConfig: &elbTypes.FixedResponseActionConfig{
					StatusCode:  aws.String("404"),
					ContentType: aws.String("application/json"),
					MessageBody: aws.String("{\"message\": \"Not Found\"}"),
				},
			},
		},
		LoadBalancerArn: createdLB.LoadBalancerArn,
		Certificates: []elbTypes.Certificate{
			{
				CertificateArn: &args.certificateArn,
			},
		},
		Port:      aws.Int32(443),
		Protocol:  elbTypes.ProtocolEnumHttps,
		SslPolicy: aws.String("ELBSecurityPolicy-2016-08"),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create listener: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:listener",
		ResourceId:   *createdHTTPSListener.Listeners[0].ListenerArn,
	})
	logger.Debugf("Creating HTTP listener")
	// Accept and redirect HTTP traffic
	createdHTTPListener, err := clients.elb.CreateListener(ctx, &elasticloadbalancingv2.CreateListenerInput{
		DefaultActions: []elbTypes.Action{
			// Always redirect to HTTPS. Redirect components that are not set keep
			// their original values, so the port must be specified explicitly —
			// otherwise the redirect target would be https://host:80/... and fail.
			{
				Type: elbTypes.ActionTypeEnumRedirect,
				RedirectConfig: &elbTypes.RedirectActionConfig{
					StatusCode: elbTypes.RedirectActionStatusCodeEnumHttp301,
					Protocol:   aws.String("HTTPS"),
					Port:       aws.String("443"),
				},
			},
		},
		LoadBalancerArn: createdLB.LoadBalancerArn,
		Port:            aws.Int32(80),
		Protocol:        elbTypes.ProtocolEnumHttp,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create listener: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:listener",
		ResourceId:   *createdHTTPListener.Listeners[0].ListenerArn,
	})
	logger.WithField("host", args.serviceHostname).Debugf("Creating forward rule")
	// Forward all HTTPS traffic for our hostname to the created target group
	createdRule, err := clients.elb.CreateRule(ctx, &elasticloadbalancingv2.CreateRuleInput{
		Actions: []elbTypes.Action{
			{
				Type:           elbTypes.ActionTypeEnumForward,
				TargetGroupArn: targetGroup.TargetGroupArn,
			},
		},
		Conditions: []elbTypes.RuleCondition{
			{
				Field: aws.String("host-header"),
				HostHeaderConfig: &elbTypes.HostHeaderConditionConfig{
					Values: []string{args.serviceHostname},
				},
			},
		},
		ListenerArn: createdHTTPSListener.Listeners[0].ListenerArn,
		Priority:    aws.Int32(1),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create rule: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:rule",
		ResourceId:   *createdRule.Rules[0].RuleArn,
	})
	return createdResources, &createLoadBalancingResp{
		targetGroupArn:  *targetGroup.TargetGroupArn,
		loadBalancerArn: *createdLB.LoadBalancerArn,
	}, nil
}
Setting up the ECS service
We’re all set now, so we can finally deploy the ECS service. First, we’ll create a helper function to register a new task definition.
// createTaskDefinitionArgs bundles the inputs for createTaskDefinition.
type createTaskDefinitionArgs struct {
// envMap holds plain environment variables passed to the container.
envMap map[string]string
// port the container listens on; also injected as the PORT env variable.
port int
image *ServiceImageInputs
executionRoleArn string
command []string
entrypoint []string
mainContainerName string
serviceName string
taskRoleArn string
// privateRegistrySecretArn, when set, reuses an existing Secrets Manager secret for registry auth.
privateRegistrySecretArn string
}
// createTaskDefinitionResp carries the outcome of a successful createTaskDefinition call.
type createTaskDefinitionResp struct {
taskDefinitionArn string
}
// ServiceImageRegistryInputs holds credentials for a private image registry.
type ServiceImageRegistryInputs struct {
Username string `json:"username"`
Password string `json:"password"`
}
// ServiceImageInputs describes the container image to deploy.
// Registry is nil for public images.
type ServiceImageInputs struct {
Name string `json:"name"`
Registry *ServiceImageRegistryInputs `json:"registry"`
}
// createTaskDefinition registers a new ECS task definition for a single
// Fargate container: it builds the environment variable list (always adding
// PORT), optionally wires up private-registry credentials, and registers the
// definition with fixed 1 vCPU / 2 GB sizing. Created resources are returned
// (even on error) so the caller can roll back.
func createTaskDefinition(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createTaskDefinitionArgs) ([]CreatedResource, *createTaskDefinitionResp, error) {
createdResources := make([]CreatedResource, 0)
// Convert the env map into the key/value pair list ECS expects.
env := make([]types.KeyValuePair, 0, len(args.envMap))
for key, value := range args.envMap {
env = append(env, types.KeyValuePair{
Name: aws.String(key),
Value: aws.String(value),
})
}
// Expose the container port to the application via the PORT variable.
env = append(env, types.KeyValuePair{
Name: aws.String("PORT"),
Value: aws.String(fmt.Sprintf("%d", args.port)),
})
logger.Debugf("Creating ECS task definition")
var repositoryCredentials *types.RepositoryCredentials
if args.image.Registry != nil {
// Private registry: provision auth material and have ECS pull with it.
// NOTE(review): the field is named taskExecutionRoleName but receives an
// ARN here — confirm what createPrivateRegistryAuth actually expects.
created, resp, err := createPrivateRegistryAuth(ctx, logger, clients, createPrivateRegistryAuthArgs{
taskExecutionRoleName: args.executionRoleArn,
userName: args.image.Registry.Username,
password: args.image.Registry.Password,
privateRegistrySecretArn: args.privateRegistrySecretArn,
})
if err != nil {
return createdResources, nil, fmt.Errorf("failed to create private registry auth: %w", err)
}
createdResources = append(createdResources, created...)
repositoryCredentials = &types.RepositoryCredentials{
CredentialsParameter: &resp.registryAuthSecretArn,
}
}
createdTaskDef, err := clients.ecs.RegisterTaskDefinition(ctx, &ecs.RegisterTaskDefinitionInput{
ContainerDefinitions: []types.ContainerDefinition{
{
Command: args.command,
EntryPoint: args.entrypoint,
Environment: env,
// Essential: if this container exits, the whole task is stopped.
Essential: aws.Bool(true),
Image: &args.image.Name,
Name: aws.String(args.mainContainerName),
PortMappings: []types.PortMapping{
{
ContainerPort: aws.Int32(int32(args.port)),
},
},
RepositoryCredentials: repositoryCredentials,
},
},
Family: aws.String(args.serviceName),
// Fixed task sizing for now: 1 vCPU / 2 GB per task.
Cpu: aws.String("1 vCPU"),
ExecutionRoleArn: &args.executionRoleArn,
Memory: aws.String("2 GB"),
// awsvpc networking is required for Fargate tasks.
NetworkMode: "awsvpc",
RequiresCompatibilities: []types.Compatibility{types.CompatibilityFargate},
TaskRoleArn: &args.taskRoleArn,
})
if err != nil {
return createdResources, nil, fmt.Errorf("failed to register task definition: %w", err)
}
createdResources = append(createdResources, CreatedResource{
ResourceType: "ecs:task-definition",
ResourceId: *createdTaskDef.TaskDefinition.TaskDefinitionArn,
})
return createdResources, &createTaskDefinitionResp{
taskDefinitionArn: *createdTaskDef.TaskDefinition.TaskDefinitionArn,
}, nil
}
Next, we’ll create a helper to create an ECS service.
// createEcsServiceArgs bundles the inputs for createEcsService.
type createEcsServiceArgs struct {
envMap map[string]string
port int
command []string
entrypoint []string
mainContainerName string
serviceName string
image *ServiceImageInputs
executionRoleArn string
taskRoleArn string
// targetGroupArn attaches the service to the load balancer's target group.
targetGroupArn string
subnetIds []string
ecsServiceSecurityGroupIds []string
// instanceCount is the desired number of running tasks.
instanceCount int
}
// createEcsServiceResp carries the outcome of a successful createEcsService call.
type createEcsServiceResp struct {
serviceArn string
taskDefinitionArn string
}
// createEcsService creates the ECS cluster, registers the task definition and
// starts a Fargate service behind the given target group. Created resources
// are returned (even on error) so the caller can roll back.
func createEcsService(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createEcsServiceArgs) ([]CreatedResource, *createEcsServiceResp, error) {
	createdResources := make([]CreatedResource, 0)
	logger.Debugf("Creating ECS cluster")
	createdCluster, err := clients.ecs.CreateCluster(ctx, &ecs.CreateClusterInput{
		ClusterName: aws.String(args.serviceName),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create cluster: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ecs:cluster",
		// NOTE(review): this records the cluster NAME, not its ARN — confirm the
		// rollback code resolves clusters by name.
		ResourceId: *createdCluster.Cluster.ClusterName,
	})
	created, taskDefResp, err := createTaskDefinition(ctx, logger, clients, createTaskDefinitionArgs{
		envMap:            args.envMap,
		port:              args.port,
		image:             args.image,
		executionRoleArn:  args.executionRoleArn,
		command:           args.command,
		entrypoint:        args.entrypoint,
		mainContainerName: args.mainContainerName,
		serviceName:       args.serviceName,
		taskRoleArn:       args.taskRoleArn,
	})
	createdResources = append(createdResources, created...)
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create task definition: %w", err)
	}
	logger.Debugf("Creating ECS service")
	// Honor the requested instance count: previously DesiredCount was
	// hard-coded to 1 and args.instanceCount was silently ignored. Fall back
	// to a single task for non-positive counts to preserve the old default.
	desiredCount := int32(args.instanceCount)
	if desiredCount < 1 {
		desiredCount = 1
	}
	createdSvc, err := clients.ecs.CreateService(ctx, &ecs.CreateServiceInput{
		Cluster:     createdCluster.Cluster.ClusterArn,
		ServiceName: aws.String(args.serviceName),
		DeploymentConfiguration: &types.DeploymentConfiguration{
			// Automatically roll back deployments that fail their health checks.
			DeploymentCircuitBreaker: &types.DeploymentCircuitBreaker{
				Enable:   true,
				Rollback: true,
			},
			MaximumPercent:        aws.Int32(200),
			MinimumHealthyPercent: aws.Int32(100),
		},
		DeploymentController: &types.DeploymentController{
			Type: types.DeploymentControllerTypeEcs,
		},
		DesiredCount:                  aws.Int32(desiredCount),
		HealthCheckGracePeriodSeconds: aws.Int32(10),
		LaunchType:                    types.LaunchTypeFargate,
		LoadBalancers: []types.LoadBalancer{
			{
				ContainerName:  aws.String(args.mainContainerName),
				ContainerPort:  aws.Int32(int32(args.port)),
				TargetGroupArn: &args.targetGroupArn,
			},
		},
		NetworkConfiguration: &types.NetworkConfiguration{
			AwsvpcConfiguration: &types.AwsVpcConfiguration{
				// Presumably needed so tasks can pull images without a NAT gateway
				// — confirm the subnets are public.
				AssignPublicIp: types.AssignPublicIpEnabled,
				Subnets:        args.subnetIds,
				SecurityGroups: args.ecsServiceSecurityGroupIds,
			},
		},
		TaskDefinition: &taskDefResp.taskDefinitionArn,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create service: %w", err)
	}
	createdResources = append(createdResources, CreatedResource{
		// Tagged so UpdateService can find the service in the stored state.
		Use:           "ECSService",
		ResourceType:  "ecs:service",
		ResourceId:    *createdSvc.Service.ServiceArn,
		EcsClusterArn: *createdSvc.Service.ClusterArn,
	})
	return createdResources, &createEcsServiceResp{
		serviceArn:        *createdSvc.Service.ServiceArn,
		taskDefinitionArn: taskDefResp.taskDefinitionArn,
	}, nil
}
Finally, we can create a public function that puts it all together, provisioning roles, networking, certificates, load balancing, followed by task definition and service.
const (
// mainContainerName is the name of the single application container in each task.
mainContainerName string = "main"
// containerPort is the fixed port the application container must listen on (also injected as PORT).
containerPort int = 8080
)
// ServiceInputs is the user-facing configuration for deploying a service.
type ServiceInputs struct {
// Hostname is the public hostname the service is served under (also the certificate domain).
Hostname string `json:"hostname"`
// InstanceCount is the desired number of tasks; nil defaults to 1.
InstanceCount *int `json:"instanceCount"`
Env map[string]string `json:"env"`
Command []string `json:"command"`
Entrypoint []string `json:"entrypoint"`
Image *ServiceImageInputs `json:"image"`
}
// CreateService provisions everything required to run a container on ECS
// Fargate: IAM roles, networking, an ACM certificate, load balancing, and
// finally the ECS service itself. The returned slice lists every resource
// created so far — it is returned even on error so callers can roll back.
func CreateService(ctx context.Context, logger logrus.FieldLogger, serviceName, region, zoneId string, inputs *ServiceInputs, clients *clients) ([]CreatedResource, error) {
	createdResources := make([]CreatedResource, 0)
	// track records partial progress; resources are recorded before the error
	// of each step is inspected so failed steps still get rolled back.
	track := func(res []CreatedResource) {
		createdResources = append(createdResources, res...)
	}
	availabilityZones, err := loadAZs(ctx, logger, region, clients, 3)
	if err != nil {
		return createdResources, fmt.Errorf("failed to load availability zones: %w", err)
	}
	res, roles, err := createRoles(ctx, logger, clients)
	track(res)
	if err != nil {
		return createdResources, fmt.Errorf("failed to create roles: %w", err)
	}
	res, networking, err := createNetworking(ctx, logger, clients, serviceName, availabilityZones)
	track(res)
	if err != nil {
		return createdResources, fmt.Errorf("failed to create networking: %w", err)
	}
	res, cert, err := createCertificate(ctx, logger, clients, createCertificateArgs{
		certificateDomain: inputs.Hostname,
		zoneId:            zoneId,
	})
	track(res)
	if err != nil {
		return createdResources, fmt.Errorf("failed to create certificate: %w", err)
	}
	res, loadBalancing, err := createLoadBalancing(ctx, logger, clients, loadBalancingArgs{
		serviceName:        serviceName,
		certificateArn:     cert.certificateArn,
		vpcId:              networking.vpcId,
		cloudflareZoneId:   zoneId,
		serviceHostname:    inputs.Hostname,
		subnetIds:          networking.subnetIds,
		lbSecurityGroupIds: networking.lbSecurityGroupIds,
	})
	track(res)
	if err != nil {
		return createdResources, fmt.Errorf("failed to create load balancing: %w", err)
	}
	// Default to a single task unless the caller asked for more.
	desiredInstances := 1
	if override := inputs.InstanceCount; override != nil {
		desiredInstances = *override
	}
	res, _, err = createEcsService(ctx, logger, clients, createEcsServiceArgs{
		envMap:                     inputs.Env,
		port:                       containerPort,
		command:                    inputs.Command,
		entrypoint:                 inputs.Entrypoint,
		mainContainerName:          mainContainerName,
		serviceName:                serviceName,
		image:                      inputs.Image,
		executionRoleArn:           roles.taskExecutionRoleArn,
		taskRoleArn:                roles.taskRoleArn,
		targetGroupArn:             loadBalancing.targetGroupArn,
		subnetIds:                  networking.subnetIds,
		ecsServiceSecurityGroupIds: networking.ecsSecurityGroupIds,
		instanceCount:              desiredInstances,
	})
	track(res)
	if err != nil {
		return createdResources, fmt.Errorf("failed to create ecs service: %w", err)
	}
	return createdResources, nil
}
We’ll also add a way to update our service. To know which service we need to update, we’ll simply go through the list of created resources, which acts as a store for our deployment state.
// UpdateServiceArgs bundles the changeable parts of a deployed service.
type UpdateServiceArgs struct {
EnvMap map[string]string
Command []string
Entrypoint []string
Image *ServiceImageInputs
// InstanceCount is the desired number of running tasks after the update.
InstanceCount int
}
// UpdateService rolls out new service details (image, env, command, scale) by
// registering a fresh task definition and pointing the existing ECS service at
// it. The previously stored deployment state is scanned to recover the role,
// service and cluster identifiers.
func UpdateService(ctx context.Context, logger logrus.FieldLogger, state []CreatedResource, clients *clients, args UpdateServiceArgs) ([]CreatedResource, error) {
	createdResources := make([]CreatedResource, 0)
	// Also protects the state[0] access below.
	if len(state) == 0 {
		return createdResources, fmt.Errorf("no deployment state provided")
	}
	var executionRoleArn string
	var taskRoleArn string
	var privateRegistrySecretArn string
	var serviceArn string
	var clusterArn string
	// BUG FIX: this loop previously iterated over the freshly created (empty)
	// createdResources slice, so none of the ARNs were ever populated. The
	// deployment state passed by the caller is what we need to scan.
	// NOTE(review): privateRegistrySecretArn is never assigned from state —
	// confirm which Use value carries it.
	for _, resource := range state {
		switch resource.Use {
		case "TaskExecutionRole":
			executionRoleArn = resource.ResourceId
		case "TaskRole":
			taskRoleArn = resource.ResourceId
		case "ECSService":
			serviceArn = resource.ResourceId
			clusterArn = resource.EcsClusterArn
		}
	}
	// Create new task definition with updated details
	created, resp, err := createTaskDefinition(ctx, logger, clients, createTaskDefinitionArgs{
		envMap:                   args.EnvMap,
		port:                     containerPort,
		image:                    args.Image,
		privateRegistrySecretArn: privateRegistrySecretArn,
		executionRoleArn:         executionRoleArn,
		command:                  args.Command,
		entrypoint:               args.Entrypoint,
		mainContainerName:        mainContainerName,
		// NOTE(review): uses the first state entry's id as the task family —
		// confirm state[0] is the resource carrying the service name.
		serviceName: state[0].ResourceId,
		taskRoleArn: taskRoleArn,
	})
	if err != nil {
		return createdResources, fmt.Errorf("failed to create task definition: %w", err)
	}
	createdResources = append(createdResources, created...)
	// Update service with new task definition
	_, err = clients.ecs.UpdateService(ctx, &ecs.UpdateServiceInput{
		Service:            &serviceArn,
		Cluster:            &clusterArn,
		DesiredCount:       aws.Int32(int32(args.InstanceCount)),
		ForceNewDeployment: true,
		TaskDefinition:     &resp.taskDefinitionArn,
	})
	if err != nil {
		return createdResources, fmt.Errorf("failed to update service: %w", err)
	}
	return createdResources, nil
}
That was quite intense, but we managed to deploy a container to ECS, with scalable load balancing, multi-AZ availability, automatic TLS, and support for custom image registries. As I said earlier, you probably wouldn’t want to go through this yourself; instead, I’d recommend using a managed service, like managed containers on Anzu.