Aug 7th, 2022

Deploying Containers on AWS ECS Using the SDK

When building the first iteration of the AWS provider for Anzu, I wanted to provide an easy way to deploy containers on AWS ECS. While newer versions of the provider offer low-level resources, back then I believed that one resource could deploy all related dependencies as well.

While I strongly advise avoiding deploying resources to AWS using the SDK (please use literally any other tool that removes the need to write this code), I’ve learned a lot about the cloud primitives that make up the AWS experience, from networking to compute, so I decided that it would still be worthwhile sharing the code for educational reasons (and for me to remember).

Our goal: Running containers on ECS Fargate

In the end, we want to deploy services to AWS ECS running on the Fargate platform to get that juicy isolation, as well as provisioning DNS records to route traffic to the service through an ELB instance (using Cloudflare).

Secondary Goal: Adding rollback capabilities

In case something fails or we want to tear down our resources, we need to provide an easy way to roll back everything we deployed up to that point. As it turns out, some resources may still serve traffic, so we’ll have to add some logic to gracefully drop everything.

Passing credentials

To deploy everything, we’ll need a couple of tokens from AWS and Cloudflare. We’ll store them in the following struct:

// ConfigInputs holds the credentials used to build the AWS and Cloudflare
// clients. Every field is a pointer; a nil pointer means the value was not set.
type ConfigInputs struct {
	// AwsAccessKeyId and AwsSecretAccessKey are both required: buildAWSConfig
	// returns an error when either one is nil.
	AwsAccessKeyId     *string `json:"awsAccessKeyId"`
	AwsSecretAccessKey *string `json:"awsSecretAccessKey"`
	// AwsRegion is optional; when nil, buildAWSConfig keeps "eu-central-1".
	AwsRegion          *string `json:"awsRegion"`

	// CloudflareApiToken is required by configureCloudflare, which returns an
	// error when it is nil.
	CloudflareApiToken *string `json:"cloudflareApiToken"`
}

Wiring up our SDKs

To reach our goal, we’ll interact with a lot of different AWS services. We’ll create an internal struct collecting clients for us to use.

// clients bundles every service client used by the provisioning and rollback
// code, so a single value can be passed around instead of one argument per
// service.
type clients struct {
	// AWS service clients, populated by CreateAWSClients.
	iam            *iam.Client
	elb            *elasticloadbalancingv2.Client
	ec2            *ec2.Client
	acm            *acm.Client
	ecs            *ecs.Client
	s3             *s3.Client
	sqs            *sqs.Client
	secretsManager *secretsmanager.Client
	batch          *batch.Client

	// cf is the Cloudflare API client, populated by CreateCloudflareClient
	// (and copied over by CreateClients).
	cf *cloudflare.API
}

This is followed by helper functions to prepare the clients struct above.

// buildAWSConfig loads an AWS SDK configuration that authenticates with the
// static access key pair found in providerConfig.
//
// The configuration is loaded with the region "eu-central-1"; if
// providerConfig.AwsRegion is set, it replaces that region afterwards.
// An error is returned when either key is missing or the SDK fails to load
// its default configuration.
func buildAWSConfig(ctx context.Context, providerConfig *ConfigInputs) (*aws.Config, error) {
	accessKeyID := providerConfig.AwsAccessKeyId
	secretAccessKey := providerConfig.AwsSecretAccessKey
	if accessKeyID == nil || secretAccessKey == nil {
		return nil, errors.New("awsAccessKeyId and awsSecretAccessKey must be set")
	}

	// The third argument is the session token, which is left empty.
	staticKeys := credentials.NewStaticCredentialsProvider(*accessKeyID, *secretAccessKey, "")

	loaded, err := config.LoadDefaultConfig(ctx, config.WithCredentialsProvider(staticKeys), config.WithRegion("eu-central-1"))
	if err != nil {
		return nil, fmt.Errorf("failed to load config: %w", err)
	}

	if region := providerConfig.AwsRegion; region != nil {
		loaded.Region = *region
	}

	return &loaded, nil
}

// configureECS returns an ECS client built on the AWS configuration derived
// from providerConfig.
func configureECS(ctx context.Context, providerConfig *ConfigInputs) (*ecs.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return ecs.NewFromConfig(*cfg), nil
}

// configureS3 returns an S3 client built on the AWS configuration derived
// from providerConfig.
func configureS3(ctx context.Context, providerConfig *ConfigInputs) (*s3.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return s3.NewFromConfig(*cfg), nil
}

// configureSQS returns an SQS client built on the AWS configuration derived
// from providerConfig.
func configureSQS(ctx context.Context, providerConfig *ConfigInputs) (*sqs.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return sqs.NewFromConfig(*cfg), nil
}

// configureIAM returns an IAM client built on the AWS configuration derived
// from providerConfig.
func configureIAM(ctx context.Context, providerConfig *ConfigInputs) (*iam.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return iam.NewFromConfig(*cfg), nil
}

// configureELB returns an Elastic Load Balancing v2 client built on the AWS
// configuration derived from providerConfig.
func configureELB(ctx context.Context, providerConfig *ConfigInputs) (*elasticloadbalancingv2.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return elasticloadbalancingv2.NewFromConfig(*cfg), nil
}

// configureEC2 returns an EC2 client built on the AWS configuration derived
// from providerConfig.
func configureEC2(ctx context.Context, providerConfig *ConfigInputs) (*ec2.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return ec2.NewFromConfig(*cfg), nil
}

// configureACM returns a Certificate Manager client built on the AWS
// configuration derived from providerConfig.
func configureACM(ctx context.Context, providerConfig *ConfigInputs) (*acm.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return acm.NewFromConfig(*cfg), nil
}

// configureCloudflare builds a Cloudflare API client from the API token in
// providerConfig. The context argument is accepted for symmetry with the AWS
// helpers but is not used. An error is returned when the token is missing or
// the client cannot be constructed.
func configureCloudflare(_ context.Context, providerConfig *ConfigInputs) (*cloudflare.API, error) {
	token := providerConfig.CloudflareApiToken
	if token == nil {
		return nil, fmt.Errorf("missing cloudflareApiToken in provider configuration")
	}

	api, err := cloudflare.NewWithAPIToken(*token)
	if err != nil {
		return nil, fmt.Errorf("failed to build cloudflare client: %w", err)
	}

	return api, nil
}

// configureSecretsManager returns a Secrets Manager client built on the AWS
// configuration derived from providerConfig.
func configureSecretsManager(ctx context.Context, providerConfig *ConfigInputs) (*secretsmanager.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return secretsmanager.NewFromConfig(*cfg), nil
}

// configureBatch returns a Batch client built on the AWS configuration
// derived from providerConfig.
func configureBatch(ctx context.Context, providerConfig *ConfigInputs) (*batch.Client, error) {
	cfg, err := buildAWSConfig(ctx, providerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	return batch.NewFromConfig(*cfg), nil
}

// CreateAWSClients builds every AWS service client used by the provider and
// returns them bundled in a clients value. The cf field is left nil.
//
// The AWS configuration is loaded exactly once and shared by all clients;
// building it is the only step that can fail, so a single error path covers
// all of them. An error is returned when the credentials are missing or the
// SDK configuration cannot be loaded.
func CreateAWSClients(ctx context.Context, configInputs *ConfigInputs) (*clients, error) {
	awsConfig, err := buildAWSConfig(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to build aws config: %w", err)
	}

	cfg := *awsConfig

	return &clients{
		iam:            iam.NewFromConfig(cfg),
		elb:            elasticloadbalancingv2.NewFromConfig(cfg),
		ec2:            ec2.NewFromConfig(cfg),
		acm:            acm.NewFromConfig(cfg),
		ecs:            ecs.NewFromConfig(cfg),
		s3:             s3.NewFromConfig(cfg),
		sqs:            sqs.NewFromConfig(cfg),
		secretsManager: secretsmanager.NewFromConfig(cfg),
		batch:          batch.NewFromConfig(cfg),
	}, nil
}

// CreateCloudflareClient returns a clients value in which only the Cloudflare
// client (cf) is populated; all AWS clients are left nil.
func CreateCloudflareClient(ctx context.Context, configInputs *ConfigInputs) (*clients, error) {
	api, err := configureCloudflare(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to configure cloudflare client: %w", err)
	}

	bundle := new(clients)
	bundle.cf = api

	return bundle, nil
}

// CreateClients builds the AWS clients and the Cloudflare client and returns
// them merged into one clients value. It fails as soon as either half cannot
// be created.
func CreateClients(ctx context.Context, configInputs *ConfigInputs) (*clients, error) {
	combined, err := CreateAWSClients(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to create aws clients: %w", err)
	}

	cloudflareOnly, err := CreateCloudflareClient(ctx, configInputs)
	if err != nil {
		return nil, fmt.Errorf("failed to configure cloudflare client: %w", err)
	}

	// Fold the Cloudflare client into the AWS bundle so callers get one value.
	combined.cf = cloudflareOnly.cf

	return combined, nil
}

Tracking created resources

First, we’ll introduce a simple data structure to keep track of the resources we create:

// CreatedResource records a single provisioned resource together with the
// attributes needed to delete it again during rollback.
type CreatedResource struct {
	// Use is an optional label for what the resource is used for
	// (for example "TaskExecutionRole").
	Use string `json:"use,omitempty"`

	// Generic resource type + id
	// ResourceType selects the deletion rule in drop (for example "iam:role"
	// or "ec2:vpc"); ResourceId is the identifier handed to the delete call.
	ResourceType string `json:"resourceType"`
	ResourceId   string `json:"resourceId"`

	// Additional attributes needed for deleting specific resources
	// EcsClusterArn is read when dropping "ecs:service", CfZoneId when
	// dropping "cf:record", and IgwVpcId when detaching "ec2:internet-gateway".
	EcsClusterArn string `json:"ecsClusterArn,omitempty"`
	CfZoneId      string `json:"cfZoneId,omitempty"`
	IgwVpcId      string `json:"igwVpcId,omitempty"`

	// rollbackErr is set by Rollback on entries whose deletion failed. It is
	// unexported and therefore not part of the JSON representation.
	rollbackErr error
}

Graceful deletion

Once we’ve collected all resources, we can simply reverse the list and drop them step by step. In case deletion fails, I’ve decided that it’s still best to keep going to get rid of as many resources as possible. Of course, this may lead to cascading errors when the resource still exists and subsequent ones depend on it being deleted first.

// Rollback deletes the given resources in reverse creation order.
//
// A failed deletion does not stop the rollback: the failure is logged and the
// remaining resources are still attempted. After the pass, every resource
// that could not be deleted is logged once more together with its error, and
// a generic error is returned. Rollback returns nil when every resource was
// deleted.
func Rollback(ctx context.Context, logger logrus.FieldLogger, created []CreatedResource, clients *clients) error {
	var failed []CreatedResource

	for i := len(created) - 1; i >= 0; i-- {
		// Work on a copy so the caller's slice is never modified.
		resource := created[i]

		fields := logger.WithFields(logrus.Fields{
			"type": resource.ResourceType,
			"id":   resource.ResourceId,
			"use":  resource.Use,
		})

		fields.Infoln("Rolling back")
		if err := resource.drop(ctx, logger, clients); err != nil {
			// Keep the complete resource (including Use and EcsClusterArn) so
			// the summary below can identify it and it could be retried.
			resource.rollbackErr = err
			failed = append(failed, resource)
			fields.Errorf("Failed to roll back: %s\n", err.Error())
		}
	}

	if len(failed) == 0 {
		return nil
	}

	for _, resource := range failed {
		logger.
			WithFields(logrus.Fields{
				"type": resource.ResourceType,
				"id":   resource.ResourceId,
				"use":  resource.Use,
			}).
			WithError(resource.rollbackErr).
			Errorln("Failed to roll back")
	}

	return fmt.Errorf("failed to rollback some resources")
}

To drop resources, we simply provide a method that runs the appropriate logic depending on the resource type. For some resources, we may have to wait a bit, so I added a simple wait loop with a timeout of five minutes respecting the passed context.

// drop deletes the resource described by c, choosing the deletion logic by
// c.ResourceType.
//
// Security groups, ACM certificates and ECS clusters cannot be removed while
// other resources still reference them, so for those types drop first polls
// (every five seconds, for at most five minutes, honouring ctx) until the
// references are gone.
//
// It returns an error when the underlying API call fails, when waiting times
// out or ctx is cancelled, or when there is no rule for c.ResourceType.
func (c *CreatedResource) drop(ctx context.Context, logger logrus.FieldLogger, clients *clients) error {
	switch c.ResourceType {
	case "iam:role":
		// Detach every managed policy before deleting the role itself.
		attached, err := clients.iam.ListAttachedRolePolicies(ctx, &iam.ListAttachedRolePoliciesInput{
			RoleName: &c.ResourceId,
		})
		if err != nil {
			return fmt.Errorf("failed to list attached policies for role %s: %w", c.ResourceId, err)
		}

		for _, policy := range attached.AttachedPolicies {
			_, err = clients.iam.DetachRolePolicy(ctx, &iam.DetachRolePolicyInput{
				PolicyArn: policy.PolicyArn,
				RoleName:  &c.ResourceId,
			})
			if err != nil {
				return fmt.Errorf("failed to detach policy %s from role %s: %w", aws.ToString(policy.PolicyArn), c.ResourceId, err)
			}
		}

		_, err = clients.iam.DeleteRole(ctx, &iam.DeleteRoleInput{RoleName: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete iam role: %w", err)
		}
	case "ecs:task-definition":
		_, err := clients.ecs.DeregisterTaskDefinition(ctx, &ecs.DeregisterTaskDefinitionInput{TaskDefinition: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ecs task definition: %w", err)
		}
	case "ec2:vpc":
		_, err := clients.ec2.DeleteVpc(ctx, &ec2.DeleteVpcInput{VpcId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 vpc: %w", err)
		}
	case "ec2:subnet":
		_, err := clients.ec2.DeleteSubnet(ctx, &ec2.DeleteSubnetInput{SubnetId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 subnet: %w", err)
		}
	case "ec2:security-group":
		// Wait until no network interface references the security group.
		stillAttached := fmt.Errorf("failed to delete security group %s: security group is still attached to resources", c.ResourceId)
		err := waitForRelease(ctx, stillAttached, func() (bool, error) {
			attached, err := clients.ec2.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{
				Filters: []typesEc2.Filter{
					{
						Name:   aws.String("group-id"),
						Values: []string{c.ResourceId},
					},
				},
			})
			if err != nil {
				return false, fmt.Errorf("failed to describe network interfaces: %w", err)
			}

			for _, networkInterface := range attached.NetworkInterfaces {
				logger.
					WithField("type", networkInterface.InterfaceType).
					WithField("arn", aws.ToString(networkInterface.NetworkInterfaceId)).
					WithField("sg", c.ResourceId).
					Debugf("security group still attached to network interface")
			}

			return len(attached.NetworkInterfaces) == 0, nil
		})
		if err != nil {
			return err
		}

		_, err = clients.ec2.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{GroupId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 security group: %w", err)
		}
	case "elb:load-balancer":
		_, err := clients.elb.DeleteLoadBalancer(ctx, &elasticloadbalancingv2.DeleteLoadBalancerInput{LoadBalancerArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb load balancer: %w", err)
		}
	case "elb:target-group":
		_, err := clients.elb.DeleteTargetGroup(ctx, &elasticloadbalancingv2.DeleteTargetGroupInput{TargetGroupArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb target group: %w", err)
		}
	case "acm:certificate":
		// Wait until certificate is no longer in use
		stillInUse := fmt.Errorf("timeout waiting for certificate %s to no longer be in use", c.ResourceId)
		err := waitForRelease(ctx, stillInUse, func() (bool, error) {
			currentCert, err := clients.acm.DescribeCertificate(ctx, &acm.DescribeCertificateInput{
				CertificateArn: &c.ResourceId,
			})
			if err != nil {
				return false, fmt.Errorf("failed to get certificate: %w", err)
			}

			for _, s := range currentCert.Certificate.InUseBy {
				logger.WithField("arn", s).Debugf("Certificate still being used")
			}

			return len(currentCert.Certificate.InUseBy) == 0, nil
		})
		if err != nil {
			return err
		}

		_, err = clients.acm.DeleteCertificate(ctx, &acm.DeleteCertificateInput{CertificateArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete acm certificate: %w", err)
		}
	case "elb:listener":
		_, err := clients.elb.DeleteListener(ctx, &elasticloadbalancingv2.DeleteListenerInput{ListenerArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb listener: %w", err)
		}
	case "ecs:cluster":
		// Wait until the cluster no longer lists any tasks.
		tasksRemain := fmt.Errorf("timeout waiting for cluster to be deleted")
		err := waitForRelease(ctx, tasksRemain, func() (bool, error) {
			list, err := clients.ecs.ListTasks(ctx, &ecs.ListTasksInput{
				Cluster: &c.ResourceId,
			})
			if err != nil {
				return false, fmt.Errorf("failed to list tasks: %w", err)
			}

			if len(list.TaskArns) == 0 {
				return true, nil
			}

			logger.Debugf("Tasks still not deleted")
			return false, nil
		})
		if err != nil {
			return err
		}

		_, err = clients.ecs.DeleteCluster(ctx, &ecs.DeleteClusterInput{Cluster: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ecs cluster: %w", err)
		}
	case "ecs:service":
		_, err := clients.ecs.DeleteService(ctx, &ecs.DeleteServiceInput{Cluster: &c.EcsClusterArn, Service: &c.ResourceId, Force: aws.Bool(true)})
		if err != nil {
			return fmt.Errorf("failed to delete ecs service: %w", err)
		}
	case "cf:record":
		err := clients.cf.DeleteDNSRecord(ctx, c.CfZoneId, c.ResourceId)
		if err != nil {
			return fmt.Errorf("failed to delete cloudflare record: %w", err)
		}
	case "elb:rule":
		_, err := clients.elb.DeleteRule(ctx, &elasticloadbalancingv2.DeleteRuleInput{RuleArn: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete elb rule: %w", err)
		}
	case "ec2:internet-gateway":
		// The gateway has to be detached from its VPC before it can be deleted.
		_, err := clients.ec2.DetachInternetGateway(ctx, &ec2.DetachInternetGatewayInput{
			InternetGatewayId: &c.ResourceId,
			VpcId:             &c.IgwVpcId,
		})
		if err != nil {
			return fmt.Errorf("failed to detach internet gateway: %w", err)
		}

		_, err = clients.ec2.DeleteInternetGateway(ctx, &ec2.DeleteInternetGatewayInput{InternetGatewayId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 internet gateway: %w", err)
		}
	case "ec2:route-table":
		_, err := clients.ec2.DeleteRouteTable(ctx, &ec2.DeleteRouteTableInput{RouteTableId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete ec2 route table: %w", err)
		}
	case "secretsmanager:secret":
		_, err := clients.secretsManager.DeleteSecret(ctx, &secretsmanager.DeleteSecretInput{SecretId: &c.ResourceId})
		if err != nil {
			return fmt.Errorf("failed to delete secrets manager secret: %w", err)
		}
	default:
		// NOTE(review): createPrivateRegistryAuth records resources of type
		// "iam:policy", which have no rule here and therefore end up in this
		// branch — confirm whether a rule should be added.
		return fmt.Errorf("no rollback rule for %s %s", c.ResourceType, c.ResourceId)
	}

	return nil
}

// waitForRelease calls check every five seconds until it reports true.
//
// It returns nil once check reports true, the error from check if it fails,
// ctx.Err() if ctx is cancelled first, and timeoutErr if five minutes pass
// without success. The deadline timer is created once, outside the loop, so
// that it keeps counting across iterations.
func waitForRelease(ctx context.Context, timeoutErr error, check func() (bool, error)) error {
	deadline := time.NewTimer(5 * time.Minute)
	defer deadline.Stop()

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-deadline.C:
			return timeoutErr
		case <-ticker.C:
		}

		released, err := check()
		if err != nil {
			return err
		}
		if released {
			return nil
		}
	}
}

Random suffixes

To generate unique resources in our AWS account (which allows us to provision multiple instances of our setup side by side), we add a random suffix to our resource names.

// randomSuffix returns five random lowercase ASCII letters, used to make
// resource names unique. Each call seeds its own generator from the current
// time in nanoseconds.
func randomSuffix() string {
	const (
		suffixLength = 5
		alphabet     = "abcdefghijklmnopqrstuvwxyz"
	)

	generator := rand.New(rand.NewSource(time.Now().UnixNano()))

	suffix := make([]byte, suffixLength)
	for pos := range suffix {
		suffix[pos] = alphabet[generator.Intn(len(alphabet))]
	}

	return string(suffix)
}

Loading availability zones

To create subnets later on, we’ll have to know the availability zones we want to deploy those subnets into.

// loadAZs returns the names of up to limit availability zones of the given
// region that are in the "available" state.
//
// Fewer than limit names are returned when the region offers fewer zones; a
// limit of zero or less yields an empty slice. The result never contains
// empty placeholder entries. An error is returned when the zones cannot be
// described.
func loadAZs(ctx context.Context, logger logrus.FieldLogger, region string, clients *clients, limit int) ([]string, error) {
	logger.WithField("region", region).Debugf("Loading AZs")

	azs, err := clients.ec2.DescribeAvailabilityZones(ctx, &ec2.DescribeAvailabilityZonesInput{
		Filters: []typesEc2.Filter{
			{Name: aws.String("region-name"), Values: []string{region}},
			{Name: aws.String("state"), Values: []string{"available"}},
		},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to describe availability zones: %w", err)
	}

	// Size the result by what is actually available, capped at limit.
	capacity := len(azs.AvailabilityZones)
	if limit < capacity {
		capacity = limit
	}
	if capacity < 0 {
		capacity = 0
	}

	availabilityZones := make([]string, 0, capacity)
	for _, az := range azs.AvailabilityZones {
		if len(availabilityZones) >= limit {
			break
		}
		// Skip entries without a name instead of dereferencing a nil pointer.
		if az.ZoneName == nil {
			continue
		}
		availabilityZones = append(availabilityZones, *az.ZoneName)
	}

	return availabilityZones, nil
}

Creating roles

Up next, we’ll provision all roles needed for AWS ECS.

// createRolesResp carries the ARNs of the two IAM roles created by createRoles.
type createRolesResp struct {
	// taskRoleArn is the ARN of the role created as "ecsTaskRole-<suffix>".
	taskRoleArn          string
	// taskExecutionRoleArn is the ARN of the role created as
	// "ecsTaskExecutionRole-<suffix>".
	taskExecutionRoleArn string
}

// createRoles provisions the IAM roles needed to run tasks on ECS.
//
// It first makes sure a role named AWSServiceRoleForECS exists, creating it
// when the lookup reports that it does not. It then creates two roles with a
// random name suffix, both assumable by ecs-tasks.amazonaws.com: a task
// execution role, to which the managed AmazonECSTaskExecutionRolePolicy is
// attached, and a task role.
//
// The returned slice lists the roles created so far, also on failure, so the
// caller can roll them back. AWSServiceRoleForECS is never added to it.
func createRoles(ctx context.Context, logger logrus.FieldLogger, clients *clients) ([]CreatedResource, *createRolesResp, error) {
	const (
		serviceRoleName     = "AWSServiceRoleForECS"
		executionPolicyArn  = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
		executionRolePrefix = "ecsTaskExecutionRole"
		taskRolePrefix      = "ecsTaskRole"
	)

	tracked := []CreatedResource{}

	logger.Debugf("Ensuring AWSServiceRoleForECS exists")

	// The service role lets ECS interact with other AWS services. ECS usually
	// creates it on first use, but it may be missing.
	if _, lookupErr := clients.iam.GetRole(ctx, &iam.GetRoleInput{RoleName: aws.String(serviceRoleName)}); lookupErr != nil {
		// Anything other than "no such entity" is a real failure.
		var missing *iamTypes.NoSuchEntityException
		if !errors.As(lookupErr, &missing) {
			return nil, nil, fmt.Errorf("failed to get role: %w", lookupErr)
		}

		logger.Debugf("AWSServiceRoleForECS does not exist, creating")

		_, createErr := clients.iam.CreateRole(ctx, &iam.CreateRoleInput{
			AssumeRolePolicyDocument: aws.String(`{
				"Version": "2012-10-17",
				"Statement": [
					{
						"Effect": "Allow",
						"Principal": {
							"Service": "ecs.amazonaws.com"
						},
						"Action": "sts:AssumeRole"
					}
				]
			}`),
			RoleName: aws.String(serviceRoleName),
		})
		if createErr != nil {
			return nil, nil, fmt.Errorf("failed to create AWSServiceRoleForECS role: %w", createErr)
		}
	}

	// Trust policy shared by both task roles.
	assumeRolePolicy := `{
	  "Version": "2012-10-17",
	  "Statement": [
		{
		  "Effect": "Allow",
		  "Principal": {
			"Service": "ecs-tasks.amazonaws.com"
		  },
		  "Action": "sts:AssumeRole"
		}
	  ]
	}`

	logger.Debugf("Creating ECS task execution rule")

	// Execution role: used to pull images and push logs.
	executionRoleName := fmt.Sprintf("%s-%s", executionRolePrefix, randomSuffix())
	executionRole, err := clients.iam.CreateRole(ctx, &iam.CreateRoleInput{
		RoleName:                 aws.String(executionRoleName),
		AssumeRolePolicyDocument: &assumeRolePolicy,
	})
	if err != nil {
		return tracked, nil, fmt.Errorf("failed to create role: %w", err)
	}
	tracked = append(tracked, CreatedResource{
		Use:          "TaskExecutionRole",
		ResourceType: "iam:role",
		ResourceId:   *executionRole.Role.RoleName,
	})

	// Attach the AWS-managed execution policy to the execution role.
	attachInput := &iam.AttachRolePolicyInput{
		PolicyArn: aws.String(executionPolicyArn),
		RoleName:  executionRole.Role.RoleName,
	}
	if _, err = clients.iam.AttachRolePolicy(ctx, attachInput); err != nil {
		return tracked, nil, fmt.Errorf("failed to attach role policy: %w", err)
	}

	logger.Debugf("Creating ECS task role")

	// Task role: assumed by the task itself to access AWS services with
	// specific permissions from inside Fargate.
	taskRoleName := fmt.Sprintf("%s-%s", taskRolePrefix, randomSuffix())
	taskRole, err := clients.iam.CreateRole(ctx, &iam.CreateRoleInput{
		RoleName:                 aws.String(taskRoleName),
		AssumeRolePolicyDocument: &assumeRolePolicy,
	})
	if err != nil {
		return tracked, nil, fmt.Errorf("failed to create role: %w", err)
	}
	tracked = append(tracked, CreatedResource{
		Use:          "TaskRole",
		ResourceType: "iam:role",
		ResourceId:   *taskRole.Role.RoleName,
	})

	resp := &createRolesResp{
		taskRoleArn:          *taskRole.Role.Arn,
		taskExecutionRoleArn: *executionRole.Role.Arn,
	}

	return tracked, resp, nil
}

Storing secrets for private image registries

This is an extra to support fetching images from private registries: We’ll simply store the credentials in SecretsManager and provide the secret ARN to our container definition later on.

// createPrivateRegistryAuthArgs is the input of createPrivateRegistryAuth.
type createPrivateRegistryAuthArgs struct {
	// taskExecutionRoleName is the role the secret-access policy is attached to.
	taskExecutionRoleName    string
	// userName and password are the registry credentials stored in the secret.
	userName                 string
	password                 string
	// privateRegistrySecretArn, when non-empty, names an existing secret that is
	// updated in place instead of creating a new secret and policy.
	privateRegistrySecretArn string
}

// createPrivateRegistryAuthResp is the result of createPrivateRegistryAuth.
type createPrivateRegistryAuthResp struct {
	// registryAuthSecretArn is the ARN of the secret holding the registry
	// credentials (either the updated existing one or the newly created one).
	registryAuthSecretArn string
}

// createPrivateRegistryAuth stores private registry credentials in Secrets
// Manager and lets the task execution role read them.
//
// When args.privateRegistrySecretArn is set, the existing secret is updated
// and nothing else is created. Otherwise a new secret is created, together
// with an IAM policy allowing secretsmanager:GetSecretValue on that secret,
// and the policy is attached to args.taskExecutionRoleName.
//
// The returned slice lists everything created so far, also on failure, so the
// caller can roll it back.
func createPrivateRegistryAuth(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createPrivateRegistryAuthArgs) ([]CreatedResource, *createPrivateRegistryAuthResp, error) {
	createdResources := make([]CreatedResource, 0)

	// NOTE(review): the values are inserted without JSON escaping, so a user
	// name or password containing a double quote or backslash produces an
	// invalid secret document — consider encoding/json here.
	secretValue := fmt.Sprintf("{\"username\": \"%s\", \"password\": \"%s\"}", args.userName, args.password)

	if args.privateRegistrySecretArn != "" {
		// Update secret
		logger.Debugf("Updating registry auth secret")

		_, err := clients.secretsManager.UpdateSecret(ctx, &secretsmanager.UpdateSecretInput{
			SecretId:     aws.String(args.privateRegistrySecretArn),
			SecretString: aws.String(secretValue),
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to update registry auth secret: %w", err)
		}

		return createdResources, &createPrivateRegistryAuthResp{
			registryAuthSecretArn: args.privateRegistrySecretArn,
		}, nil
	}

	// Create secret holding username and password
	logger.Debugf("Creating registry auth secret")

	createdSecret, err := clients.secretsManager.CreateSecret(ctx, &secretsmanager.CreateSecretInput{
		Name:         aws.String(fmt.Sprintf("ecs-registry-auth-%s", randomSuffix())),
		SecretString: aws.String(secretValue),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create registry auth secret: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		Use:          "PrivateRegistryCredentials",
		ResourceType: "secretsmanager:secret",
		ResourceId:   *createdSecret.ARN,
	})

	// Create policy for the private registry
	logger.Debugf("Creating private registry policy")

	// The document must be strict JSON: no trailing comma after the last
	// element of the Resource array.
	policyDocument := fmt.Sprintf(`{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "secretsmanager:GetSecretValue"
      ],
      "Resource": [
        "%s"
      ]
    }
  ]
}
`, *createdSecret.ARN)

	createdPolicy, err := clients.iam.CreatePolicy(ctx, &iam.CreatePolicyInput{
		PolicyDocument: aws.String(policyDocument),
		PolicyName:     aws.String(fmt.Sprintf("privateRegistryAuthPolicy-%s", randomSuffix())),
	})
	if err != nil {
		// Hand back the secret so the caller can still roll it back.
		return createdResources, nil, fmt.Errorf("failed to create policy: %w", err)
	}

	// NOTE(review): drop has no rollback rule for "iam:policy", so this entry
	// currently fails during Rollback — confirm how policies should be removed.
	createdResources = append(createdResources, CreatedResource{
		Use:          "PrivateRegistryCredentials",
		ResourceType: "iam:policy",
		ResourceId:   *createdPolicy.Policy.PolicyName,
	})

	// Attach the policy to the task execution role
	logger.Debugf("Attaching policy to task execution role")

	_, err = clients.iam.AttachRolePolicy(ctx, &iam.AttachRolePolicyInput{
		PolicyArn: createdPolicy.Policy.Arn,
		RoleName:  aws.String(args.taskExecutionRoleName),
	})
	if err != nil {
		// Hand back the secret and policy so the caller can still roll them back.
		return createdResources, nil, fmt.Errorf("failed to attach policy to task execution role: %w", err)
	}

	return createdResources, &createPrivateRegistryAuthResp{
		registryAuthSecretArn: *createdSecret.ARN,
	}, nil
}

Preparing the network

Now it gets interesting! To expose our services, we’ll have to create public subnets. This means we either need to run an expensive NAT gateway, or we simply allocate public IP addresses in our relevant subnets.

// createNetworkingResp carries the identifiers of the network resources
// created by createNetworking.
type createNetworkingResp struct {
	// subnetIds holds the IDs of the subnets created, one per availability zone.
	subnetIds           []string
	// ecsSecurityGroupIds holds the ID of the security group created for ECS.
	ecsSecurityGroupIds []string
	// lbSecurityGroupIds holds the ID of the security group created for the
	// load balancer.
	lbSecurityGroupIds  []string
	// vpcId is the ID of the created VPC.
	vpcId               string
}

// createNetworking creates a VPC (10.0.0.0/16) with an attached internet
// gateway, default routes to that gateway, one public subnet per entry of
// availabilityZones (10.0.1.0/24, 10.0.2.0/24, ...), and two security groups:
// one admitting HTTP/HTTPS from anywhere for the load balancer, and one
// admitting TCP traffic from inside the VPC for the ECS tasks.
//
// The returned slice lists every resource created so far, also on failure, so
// the caller can roll it back. A failure to create the IPv6 default route is
// logged as a warning and does not abort provisioning; every other failure is
// returned as an error.
func createNetworking(ctx context.Context, logger logrus.FieldLogger, clients *clients, vpcName string, availabilityZones []string) ([]CreatedResource, *createNetworkingResp, error) {
	createdResources := make([]CreatedResource, 0)

	logger.Debugf("Creating VPC")

	// https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Subnets.html
	vpcCIDR := "10.0.0.0/16"
	createdVpc, err := clients.ec2.CreateVpc(ctx, &ec2.CreateVpcInput{
		CidrBlock: aws.String(vpcCIDR),
		TagSpecifications: []typesEc2.TagSpecification{
			{
				ResourceType: typesEc2.ResourceTypeVpc,
				Tags: []typesEc2.Tag{
					{
						Key:   aws.String("Name"),
						Value: aws.String(vpcName),
					},
				},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create vpc: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:vpc",
		ResourceId:   *createdVpc.Vpc.VpcId,
	})

	logger.Debugf("Creating internet gateway")

	// https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Internet_Gateway.html
	createdIgw, err := clients.ec2.CreateInternetGateway(ctx, &ec2.CreateInternetGatewayInput{
		TagSpecifications: []typesEc2.TagSpecification{
			{
				ResourceType: typesEc2.ResourceTypeInternetGateway,
				Tags: []typesEc2.Tag{
					{
						Key:   aws.String("Name"),
						Value: aws.String(fmt.Sprintf("%s-igw", vpcName)),
					},
				},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create internet gateway: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:internet-gateway",
		ResourceId:   *createdIgw.InternetGateway.InternetGatewayId,
		IgwVpcId:     *createdVpc.Vpc.VpcId,
	})

	logger.Debugf("Attaching internet gateway to VPC")

	_, err = clients.ec2.AttachInternetGateway(ctx, &ec2.AttachInternetGatewayInput{
		InternetGatewayId: createdIgw.InternetGateway.InternetGatewayId,
		VpcId:             createdVpc.Vpc.VpcId,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to attach internet gateway: %w", err)
	}

	// Get main route table
	routeTables, err := clients.ec2.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{
		Filters: []typesEc2.Filter{
			{
				Name:   aws.String("vpc-id"),
				Values: []string{*createdVpc.Vpc.VpcId},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to describe route tables: %w", err)
	}
	// Guard the index below: an empty result would otherwise panic.
	if len(routeTables.RouteTables) == 0 {
		return createdResources, nil, fmt.Errorf("no route table found for vpc %s", *createdVpc.Vpc.VpcId)
	}

	mainRouteTable := routeTables.RouteTables[0]

	logger.Debugf("Creating route to internet")

	// Create IPv4 route to internet
	// https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Route_Tables.html#route-table-routes
	// https://docs.aws.amazon.com/vpc/latest/userguide/route-table-options.html#route-tables-internet-gateway
	_, err = clients.ec2.CreateRoute(ctx, &ec2.CreateRouteInput{
		RouteTableId:         mainRouteTable.RouteTableId,
		DestinationCidrBlock: aws.String("0.0.0.0/0"),
		GatewayId:            createdIgw.InternetGateway.InternetGatewayId,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create route to internet: %w", err)
	}

	// Create IPv6 route to internet. This step has always been best-effort, so
	// a failure is surfaced in the log instead of aborting provisioning.
	_, err = clients.ec2.CreateRoute(ctx, &ec2.CreateRouteInput{
		RouteTableId:             mainRouteTable.RouteTableId,
		DestinationIpv6CidrBlock: aws.String("::/0"),
		GatewayId:                createdIgw.InternetGateway.InternetGatewayId,
	})
	if err != nil {
		logger.WithError(err).Warn("failed to create IPv6 route to internet, continuing without it")
	}

	// Create subnets for each AZ
	subnetIds := make([]string, 0, len(availabilityZones))

	for i, zone := range availabilityZones {
		// start with 10.0.1.0/24 [10.0.1.1;10.0.1.254]
		cidr := fmt.Sprintf("10.0.%d.0/24", i+1)
		logger.WithField("cidr", cidr).WithField("zone", zone).Debugf("Creating subnet")
		createdSubnet, err := clients.ec2.CreateSubnet(ctx, &ec2.CreateSubnetInput{
			VpcId:            createdVpc.Vpc.VpcId,
			AvailabilityZone: aws.String(zone),
			CidrBlock:        aws.String(cidr),
			TagSpecifications: []typesEc2.TagSpecification{
				{
					ResourceType: typesEc2.ResourceTypeSubnet,
					Tags: []typesEc2.Tag{
						{
							Key:   aws.String("Name"),
							Value: aws.String(fmt.Sprintf("%s-%s", vpcName, zone)),
						},
					},
				},
			},
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to create subnet: %w", err)
		}

		// Track the subnet right away so it is rolled back even when the
		// attribute change below fails.
		createdResources = append(createdResources, CreatedResource{
			ResourceType: "ec2:subnet",
			ResourceId:   *createdSubnet.Subnet.SubnetId,
		})

		// Auto-assign public IPv4 address to the subnet
		_, err = clients.ec2.ModifySubnetAttribute(ctx, &ec2.ModifySubnetAttributeInput{
			SubnetId: createdSubnet.Subnet.SubnetId,
			MapPublicIpOnLaunch: &typesEc2.AttributeBooleanValue{
				Value: aws.Bool(true),
			},
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to assign public IPv4 address to subnet: %w", err)
		}

		subnetIds = append(subnetIds, *createdSubnet.Subnet.SubnetId)
	}

	logger.Debugf("Creating security group to LB")

	lbGroupName := fmt.Sprintf("%s-to-lb", vpcName)
	securityGroupToLB, err := clients.ec2.CreateSecurityGroup(ctx, &ec2.CreateSecurityGroupInput{
		GroupName:   aws.String(lbGroupName),
		Description: aws.String("Security group for load balancer"),
		VpcId:       createdVpc.Vpc.VpcId,
		TagSpecifications: []typesEc2.TagSpecification{
			{
				ResourceType: typesEc2.ResourceTypeSecurityGroup,
				Tags: []typesEc2.Tag{
					{
						Key:   aws.String("Name"),
						Value: aws.String(lbGroupName),
					},
				},
			},
		},
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create security group to LB: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:security-group",
		ResourceId:   *securityGroupToLB.GroupId,
	})

	// Allow all HTTP/HTTPS ingress traffic from IPv4/IPv6. Protocol "6" is TCP.
	lbIngress := make([]typesEc2.IpPermission, 0, 4)
	for _, port := range []int32{80, 443} {
		lbIngress = append(lbIngress,
			typesEc2.IpPermission{
				IpProtocol: aws.String("6"),
				IpRanges: []typesEc2.IpRange{
					{
						CidrIp: aws.String("0.0.0.0/0"),
					},
				},
				FromPort: aws.Int32(port),
				ToPort:   aws.Int32(port),
			},
			typesEc2.IpPermission{
				IpProtocol: aws.String("6"),
				Ipv6Ranges: []typesEc2.Ipv6Range{
					{
						CidrIpv6: aws.String("::/0"),
					},
				},
				FromPort: aws.Int32(port),
				ToPort:   aws.Int32(port),
			},
		)
	}

	_, err = clients.ec2.AuthorizeSecurityGroupIngress(ctx, &ec2.AuthorizeSecurityGroupIngressInput{
		GroupId:       securityGroupToLB.GroupId,
		IpPermissions: lbIngress,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to authorize security group ingress for LB: %w", err)
	}

	logger.Debugf("Creating security group for ECS")

	// No egress rules are configured for this group here.
	securityGroupECS, err := clients.ec2.CreateSecurityGroup(ctx, &ec2.CreateSecurityGroupInput{
		GroupName: aws.String(fmt.Sprintf("ecsSecurityGroup-%s", randomSuffix())),
		VpcId:     createdVpc.Vpc.VpcId,
		Description: aws.String(
			"Security group for ECS cluster",
		),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create security group for ECS: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ec2:security-group",
		ResourceId:   *securityGroupECS.GroupId,
	})

	// Allow incoming traffic from same VPC (LB -> ECS)
	_, err = clients.ec2.AuthorizeSecurityGroupIngress(ctx, &ec2.AuthorizeSecurityGroupIngressInput{
		GroupId: securityGroupECS.GroupId,
		IpPermissions: []typesEc2.IpPermission{
			{
				FromPort:   aws.Int32(0),
				ToPort:     aws.Int32(65535),
				IpProtocol: aws.String("6"),
				IpRanges: []typesEc2.IpRange{
					{
						CidrIp:      aws.String(vpcCIDR),
						Description: aws.String("VPC CIDR"),
					},
				},
			},
		},
	})
	if err != nil {
		// Without this rule the load balancer cannot reach the tasks, so the
		// failure must not go unnoticed.
		return createdResources, nil, fmt.Errorf("failed to authorize security group ingress for ECS: %w", err)
	}

	return createdResources, &createNetworkingResp{
		subnetIds:           subnetIds,
		lbSecurityGroupIds:  []string{*securityGroupToLB.GroupId},
		ecsSecurityGroupIds: []string{*securityGroupECS.GroupId},
		vpcId:               *createdVpc.Vpc.VpcId,
	}, nil
}

Provisioning certificates

To serve our traffic securely, we’ll provision public TLS certificates using AWS Certificate Manager. These certificates come entirely for free, which is great. We’ll wait until the certificate is properly provisioned (including the validation step), as our load balancer will not accept pending certificates.

// createCertificateArgs bundles the inputs of createCertificate:
// certificateDomain is the domain name the ACM certificate is requested for,
// and zoneId is the Cloudflare zone in which the DNS validation records are
// created.
type createCertificateArgs struct {
	certificateDomain string
	zoneId            string
}

// createCertificateResp is the result of createCertificate; certificateArn is
// the ARN ACM returned for the requested certificate.
type createCertificateResp struct {
	certificateArn string
}

// CreateRecord makes sure a DNS record of the given kind and name exists in
// the Cloudflare zone zoneId and carries the given content.
//
// The zone is first searched for a record with the same name and type. If
// none is found a new record is created; otherwise the first match is
// overwritten with the desired values. In both cases the affected record is
// reported back as a "cf:record" resource so it can be tracked as deployment
// state.
//
// On lookup or creation failure the returned slice is nil; on update failure
// it is empty.
func CreateRecord(ctx context.Context, logger logrus.FieldLogger, c *clients, zoneId string, kind, fullNameIncludingDomain, content string) ([]CreatedResource, error) {
	tracked := make([]CreatedResource, 0)

	// Cloudflare stores names without the trailing dot, so strip it up front
	// to make the lookup below match.
	if withoutDot := strings.TrimSuffix(fullNameIncludingDomain, "."); withoutDot != fullNameIncludingDomain {
		logger.WithField("previous", fullNameIncludingDomain).Debugln("Removing trailing dot from record name")
		fullNameIncludingDomain = withoutDot
	}

	lookup := cloudflare.DNSRecord{
		Name: fullNameIncludingDomain,
		Type: kind,
	}
	matches, err := c.cf.DNSRecords(ctx, zoneId, lookup)
	if err != nil {
		return nil, fmt.Errorf("failed to get existing record: %w", err)
	}

	desired := cloudflare.DNSRecord{
		Name:    fullNameIncludingDomain,
		Type:    kind,
		Content: content,
		// TTL is expressed in seconds.
		TTL: 60,
	}

	recordLogger := logger.WithFields(logrus.Fields{
		"zoneId":  zoneId,
		"type":    kind,
		"name":    fullNameIncludingDomain,
		"content": content,
	})

	var recordID string
	if len(matches) > 0 {
		// A record with this name and type already exists: overwrite it.
		recordID = matches[0].ID
		recordLogger.WithField("recordId", recordID).Debugln("Updating existing DNS record")

		if err := c.cf.UpdateDNSRecord(ctx, zoneId, recordID, desired); err != nil {
			return tracked, fmt.Errorf("failed to update record: %w", err)
		}
	} else {
		recordLogger.Debugln("Creating DNS record")

		response, err := c.cf.CreateDNSRecord(ctx, zoneId, desired)
		if err != nil {
			return nil, fmt.Errorf("failed to create record: %w", err)
		}
		recordID = response.Result.ID
	}

	tracked = append(tracked, CreatedResource{
		ResourceType: "cf:record",
		ResourceId:   recordID,
		CfZoneId:     zoneId,
	})

	return tracked, nil
}

// createCertificate requests a DNS-validated certificate from ACM for
// args.certificateDomain, publishes the validation CNAME records in the
// Cloudflare zone args.zoneId, and blocks until ACM reports the certificate
// as issued.
//
// The function works in two polling phases, each bounded by its own timeout:
//  1. wait until ACM exposes the DNS validation record for the domain (or the
//     certificate is already issued, in which case it returns right away),
//  2. after creating the validation records, wait until the certificate
//     status switches to issued.
//
// The returned slice always contains every resource created up to the point
// of return, including on error, so callers can roll back. The response is
// nil whenever an error is returned.
func createCertificate(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createCertificateArgs) ([]CreatedResource, *createCertificateResp, error) {
	const (
		// pollInterval is the pause between two DescribeCertificate calls.
		pollInterval = 5 * time.Second
		// waitTimeout bounds each of the two waiting phases.
		waitTimeout = 5 * time.Minute
	)

	createdResources := make([]CreatedResource, 0)

	logger.WithField("domain", args.certificateDomain).Debugf("Requesting certificate")

	createdCert, err := clients.acm.RequestCertificate(ctx, &acm.RequestCertificateInput{
		DomainName:       aws.String(args.certificateDomain),
		ValidationMethod: typesAcm.ValidationMethodDns,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to request certificate: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "acm:certificate",
		ResourceId:   *createdCert.CertificateArn,
	})

	poll := time.NewTicker(pollInterval)
	defer poll.Stop()

	// The deadline timer has to live outside the loop: a timer created anew on
	// every iteration always loses against the much shorter poll interval and
	// therefore never fires.
	recordDeadline := time.NewTimer(waitTimeout)
	defer recordDeadline.Stop()

	// Wait and refresh certificate until resource record is present, this might take a couple of seconds
	var certificate *acm.DescribeCertificateOutput
	for {
		select {
		case <-ctx.Done():
			return createdResources, nil, fmt.Errorf("context canceled while waiting for certificate resource record: %w", ctx.Err())
		case <-recordDeadline.C:
			// The context is still alive here, so there is no error to wrap.
			return createdResources, nil, fmt.Errorf("certificate did not provide resource record in time")
		case <-poll.C:
		}

		certificate, err = clients.acm.DescribeCertificate(ctx, &acm.DescribeCertificateInput{
			CertificateArn: createdCert.CertificateArn,
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to describe certificate: %w", err)
		}

		// Sometimes ACM caches verification statuses, so the certificate might be completely issued without a need for verification
		if certificate.Certificate.Status == typesAcm.CertificateStatusIssued {
			break
		}

		// Otherwise, we'd expect the certificate to be in the process of being verified
		if certificate.Certificate.Status != typesAcm.CertificateStatusPendingValidation {
			return createdResources, nil, fmt.Errorf("certificate status is not pending validation: %s", certificate.Certificate.Status)
		}

		// Look for the validation entry that belongs to our domain. The match
		// is done on the entry's domain name: the resource record name itself
		// is a generated validation host name and never equals the domain.
		var hasResourceRecord bool
		for _, validationOption := range certificate.Certificate.DomainValidationOptions {
			if validationOption.ValidationMethod != typesAcm.ValidationMethodDns || validationOption.ResourceRecord == nil || validationOption.DomainName == nil {
				continue
			}

			if strings.EqualFold(*validationOption.DomainName, args.certificateDomain) {
				hasResourceRecord = true
				break
			}
		}

		if hasResourceRecord {
			break
		}

		logger.Debugf("Waiting for certificate to be ready")
	}

	// If certificate is issued, return early
	if certificate.Certificate.Status == typesAcm.CertificateStatusIssued {
		return createdResources, &createCertificateResp{
			certificateArn: *createdCert.CertificateArn,
		}, nil
	}

	// Otherwise verify certificate
	logger.WithField("validationOptions", len(certificate.Certificate.DomainValidationOptions)).Debugf("Creating records for domain validation")

	for _, option := range certificate.Certificate.DomainValidationOptions {
		if option.ValidationMethod != typesAcm.ValidationMethodDns || option.ResourceRecord == nil {
			continue
		}

		// Create Cloudflare Record
		created, err := CreateRecord(ctx, logger, clients, args.zoneId, "CNAME", *option.ResourceRecord.Name, *option.ResourceRecord.Value)
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to create cname: %w", err)
		}

		createdResources = append(createdResources, created...)
	}

	// The second phase gets a fresh deadline, again created once outside the loop.
	issueDeadline := time.NewTimer(waitTimeout)
	defer issueDeadline.Stop()

	// Wait until certificate is verified
	for {
		select {
		case <-ctx.Done():
			return createdResources, nil, fmt.Errorf("context cancelled: %w", ctx.Err())
		case <-issueDeadline.C:
			return createdResources, nil, fmt.Errorf("timeout waiting for certificate to be verified")
		case <-poll.C:
		}

		currentCert, err := clients.acm.DescribeCertificate(ctx, &acm.DescribeCertificateInput{
			CertificateArn: createdCert.CertificateArn,
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to get certificate: %w", err)
		}

		if currentCert.Certificate.Status == typesAcm.CertificateStatusIssued {
			break
		}

		logger.Debugf("Waiting for certificate to be verified")
	}

	return createdResources, &createCertificateResp{
		certificateArn: *createdCert.CertificateArn,
	}, nil
}

Setting up Load Balancing

Next, we’ll expose our service to the internet through an Application Load Balancer instance. This terminates TLS using our provisioned certificate and routes all traffic to an available container (determined by using the /healthz health check endpoint).

// createLoadBalancingResp is the result of createLoadBalancing and carries
// the ARNs of the target group and the load balancer that were created.
type createLoadBalancingResp struct {
	targetGroupArn  string
	loadBalancerArn string
}

// loadBalancingArgs bundles the inputs of createLoadBalancing.
//
// serviceName is used as the name of both the load balancer and the target
// group. certificateArn is attached to the HTTPS listener. vpcId is the VPC
// the target group is created in. serviceHostname is the host name that gets
// a CNAME pointing at the load balancer and that the forward rule matches on.
// lbSecurityGroupIds and subnetIds are attached to the load balancer.
// cloudflareZoneId is the Cloudflare zone the CNAME record is created in.
type loadBalancingArgs struct {
	serviceName        string
	certificateArn     string
	vpcId              string
	serviceHostname    string
	lbSecurityGroupIds []string
	subnetIds          []string
	cloudflareZoneId   string
}

// createLoadBalancing exposes the service through an internet-facing
// Application Load Balancer. In order, it creates:
//   - the load balancer itself (IPv4, in the given subnets/security groups),
//   - a Cloudflare CNAME record pointing args.serviceHostname at the load
//     balancer's DNS name,
//   - an IP-based target group health-checked via /healthz,
//   - an HTTPS listener (port 443) answering 404 by default,
//   - an HTTP listener (port 80) redirecting everything to HTTPS on port 443,
//   - a rule on the HTTPS listener forwarding requests for
//     args.serviceHostname to the target group.
//
// The returned slice always contains every resource created up to the point
// of return, including on error, so callers can roll back. The response is
// nil whenever an error is returned.
func createLoadBalancing(ctx context.Context, logger logrus.FieldLogger, clients *clients, args loadBalancingArgs) ([]CreatedResource, *createLoadBalancingResp, error) {
	createdResources := make([]CreatedResource, 0)

	logger.Debugf("Creating load balancer")

	// Create internet-facing IPv4 load balancer
	// TODO Check if we can get a dualstack Load Balancer running (still haven't given up on IPv6)
	createdLBs, err := clients.elb.CreateLoadBalancer(ctx, &elasticloadbalancingv2.CreateLoadBalancerInput{
		Name:           aws.String(args.serviceName),
		IpAddressType:  elbTypes.IpAddressTypeIpv4,
		Scheme:         elbTypes.LoadBalancerSchemeEnumInternetFacing,
		SecurityGroups: args.lbSecurityGroupIds,
		Subnets:        args.subnetIds,
		Type:           elbTypes.LoadBalancerTypeEnumApplication,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create load balancer: %w", err)
	}
	// Guard the index access below instead of panicking on an empty response.
	if len(createdLBs.LoadBalancers) == 0 {
		return createdResources, nil, fmt.Errorf("failed to create load balancer: no load balancer returned")
	}

	createdLB := createdLBs.LoadBalancers[0]
	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:load-balancer",
		ResourceId:   *createdLB.LoadBalancerArn,
	})

	logger.Debugf("Creating record for load balancer")

	// Create record to point traffic to load balancer (will use one of the possible availability zones/subnets randomly)
	created, err := CreateRecord(ctx, logger, clients, args.cloudflareZoneId, "CNAME", args.serviceHostname, *createdLB.DNSName)
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create cname: %w", err)
	}

	createdResources = append(createdResources, created...)

	logger.Debugf("Creating target group")

	// Target group includes all deployed services that could handle traffic
	createdTargetGroup, err := clients.elb.CreateTargetGroup(ctx, &elasticloadbalancingv2.CreateTargetGroupInput{
		Name:       aws.String(args.serviceName),
		TargetType: elbTypes.TargetTypeEnumIp,

		// Health check allows unhealthy services to be deregistered dynamically so they stop receiving traffic
		HealthCheckPath:       aws.String("/healthz"),
		HealthyThresholdCount: aws.Int32(2),

		// Port is completely ignored for IP target type / ECS
		Port: aws.Int32(80),

		Protocol:        elbTypes.ProtocolEnumHttp,
		ProtocolVersion: aws.String("HTTP1"),

		VpcId: aws.String(args.vpcId),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create target group: %w", err)
	}
	if len(createdTargetGroup.TargetGroups) == 0 {
		return createdResources, nil, fmt.Errorf("failed to create target group: no target group returned")
	}
	targetGroup := createdTargetGroup.TargetGroups[0]

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:target-group",
		ResourceId:   *targetGroup.TargetGroupArn,
	})

	logger.Debugf("Creating HTTPS listener")

	// Accept HTTPS traffic
	createdHTTPSListener, err := clients.elb.CreateListener(ctx, &elasticloadbalancingv2.CreateListenerInput{
		DefaultActions: []elbTypes.Action{
			// When a request comes in and isn't caught by another rule, serve fallback 404
			{
				Type: elbTypes.ActionTypeEnumFixedResponse,
				FixedResponseConfig: &elbTypes.FixedResponseActionConfig{
					StatusCode:  aws.String("404"),
					ContentType: aws.String("application/json"),
					MessageBody: aws.String("{\"message\": \"Not Found\"}"),
				},
			},
		},
		LoadBalancerArn: createdLB.LoadBalancerArn,
		Certificates: []elbTypes.Certificate{
			{
				CertificateArn: &args.certificateArn,
			},
		},
		Port:      aws.Int32(443),
		Protocol:  elbTypes.ProtocolEnumHttps,
		SslPolicy: aws.String("ELBSecurityPolicy-2016-08"),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create listener: %w", err)
	}
	if len(createdHTTPSListener.Listeners) == 0 {
		return createdResources, nil, fmt.Errorf("failed to create listener: no HTTPS listener returned")
	}
	httpsListener := createdHTTPSListener.Listeners[0]

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:listener",
		ResourceId:   *httpsListener.ListenerArn,
	})

	logger.Debugf("Creating HTTP listener")

	// Accept and redirect HTTP traffic
	createdHTTPListener, err := clients.elb.CreateListener(ctx, &elasticloadbalancingv2.CreateListenerInput{
		DefaultActions: []elbTypes.Action{
			// Always redirect to HTTPS
			{
				Type: elbTypes.ActionTypeEnumRedirect,
				RedirectConfig: &elbTypes.RedirectActionConfig{
					StatusCode: elbTypes.RedirectActionStatusCodeEnumHttp301,
					Protocol:   aws.String("HTTPS"),
					// URL components that are not set keep the value of the
					// incoming request, so without an explicit port clients
					// would be sent to https://host:80.
					Port: aws.String("443"),
				},
			},
		},
		LoadBalancerArn: createdLB.LoadBalancerArn,
		Port:            aws.Int32(80),
		Protocol:        elbTypes.ProtocolEnumHttp,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create listener: %w", err)
	}
	if len(createdHTTPListener.Listeners) == 0 {
		return createdResources, nil, fmt.Errorf("failed to create listener: no HTTP listener returned")
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:listener",
		ResourceId:   *createdHTTPListener.Listeners[0].ListenerArn,
	})

	logger.WithField("host", args.serviceHostname).Debugf("Creating forward rule")

	// Forward all HTTPS traffic to created target group
	createdRule, err := clients.elb.CreateRule(ctx, &elasticloadbalancingv2.CreateRuleInput{
		Actions: []elbTypes.Action{
			{
				Type:           elbTypes.ActionTypeEnumForward,
				TargetGroupArn: targetGroup.TargetGroupArn,
			},
		},
		Conditions: []elbTypes.RuleCondition{
			{
				Field: aws.String("host-header"),
				HostHeaderConfig: &elbTypes.HostHeaderConditionConfig{
					Values: []string{args.serviceHostname},
				},
			},
		},
		ListenerArn: httpsListener.ListenerArn,
		Priority:    aws.Int32(1),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create rule: %w", err)
	}
	if len(createdRule.Rules) == 0 {
		return createdResources, nil, fmt.Errorf("failed to create rule: no rule returned")
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "elb:rule",
		ResourceId:   *createdRule.Rules[0].RuleArn,
	})

	return createdResources, &createLoadBalancingResp{
		targetGroupArn:  *targetGroup.TargetGroupArn,
		loadBalancerArn: *createdLB.LoadBalancerArn,
	}, nil
}

Setting up the ECS service

We’re all set now, so we can finally deploy the ECS service. First, we’ll create a helper function to register a new task definition.

// createTaskDefinitionArgs bundles the inputs of createTaskDefinition.
//
// envMap holds the environment variables passed to the container; port is the
// container port, which is also exposed to the container as the PORT
// variable. image names the container image and optional registry
// credentials. command and entrypoint are forwarded to the container
// definition. mainContainerName is the name given to the container and
// serviceName is used as the task definition family. executionRoleArn and
// taskRoleArn are set as the execution and task role of the task definition.
// privateRegistrySecretArn is handed to createPrivateRegistryAuth when
// registry credentials are present.
type createTaskDefinitionArgs struct {
	envMap                   map[string]string
	port                     int
	image                    *ServiceImageInputs
	executionRoleArn         string
	command                  []string
	entrypoint               []string
	mainContainerName        string
	serviceName              string
	taskRoleArn              string
	privateRegistrySecretArn string
}

// createTaskDefinitionResp is the result of createTaskDefinition;
// taskDefinitionArn is the ARN of the newly registered task definition.
type createTaskDefinitionResp struct {
	taskDefinitionArn string
}

// ServiceImageRegistryInputs holds the user name and password used to set up
// authentication against the registry hosting the container image.
type ServiceImageRegistryInputs struct {
	Username string `json:"username"`
	Password string `json:"password"`
}

// ServiceImageInputs describes the container image to run. Name is the image
// reference placed in the container definition. Registry is optional: when it
// is non-nil, registry credentials are provisioned and attached to the
// container definition.
type ServiceImageInputs struct {
	Name     string                      `json:"name"`
	Registry *ServiceImageRegistryInputs `json:"registry"`
}

// createTaskDefinition registers a new Fargate task definition (awsvpc
// networking, 1 vCPU, 2 GB memory) in the family args.serviceName, containing
// a single essential container built from args.
//
// The container receives every entry of args.envMap plus a PORT variable set
// to args.port. PORT is owned by this function: a user-supplied PORT entry is
// skipped so the variable is not defined twice and always matches the mapped
// container port.
//
// When args.image.Registry is set, registry credentials are provisioned via
// createPrivateRegistryAuth and attached to the container definition.
//
// The returned slice always contains every resource created up to the point
// of return, including on error. The response is nil whenever an error is
// returned. An error is returned if args.image is nil.
func createTaskDefinition(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createTaskDefinitionArgs) ([]CreatedResource, *createTaskDefinitionResp, error) {
	createdResources := make([]CreatedResource, 0)

	// The image is dereferenced below; fail with an error instead of a panic.
	if args.image == nil {
		return createdResources, nil, fmt.Errorf("failed to register task definition: image is required")
	}

	env := make([]types.KeyValuePair, 0, len(args.envMap)+1)

	for key, value := range args.envMap {
		// PORT is injected below; skip a user-supplied one to avoid a duplicate entry.
		if key == "PORT" {
			continue
		}

		env = append(env, types.KeyValuePair{
			Name:  aws.String(key),
			Value: aws.String(value),
		})
	}

	env = append(env, types.KeyValuePair{
		Name:  aws.String("PORT"),
		Value: aws.String(fmt.Sprintf("%d", args.port)),
	})

	logger.Debugf("Creating ECS task definition")

	var repositoryCredentials *types.RepositoryCredentials
	if args.image.Registry != nil {
		created, resp, err := createPrivateRegistryAuth(ctx, logger, clients, createPrivateRegistryAuthArgs{
			taskExecutionRoleName:    args.executionRoleArn,
			userName:                 args.image.Registry.Username,
			password:                 args.image.Registry.Password,
			privateRegistrySecretArn: args.privateRegistrySecretArn,
		})
		if err != nil {
			return createdResources, nil, fmt.Errorf("failed to create private registry auth: %w", err)
		}

		createdResources = append(createdResources, created...)

		repositoryCredentials = &types.RepositoryCredentials{
			CredentialsParameter: &resp.registryAuthSecretArn,
		}
	}

	createdTaskDef, err := clients.ecs.RegisterTaskDefinition(ctx, &ecs.RegisterTaskDefinitionInput{
		ContainerDefinitions: []types.ContainerDefinition{
			{
				Command:     args.command,
				EntryPoint:  args.entrypoint,
				Environment: env,
				Essential:   aws.Bool(true),
				Image:       &args.image.Name,
				Name:        aws.String(args.mainContainerName),
				PortMappings: []types.PortMapping{
					{
						ContainerPort: aws.Int32(int32(args.port)),
					},
				},
				RepositoryCredentials: repositoryCredentials,
			},
		},
		Family:                  aws.String(args.serviceName),
		Cpu:                     aws.String("1 vCPU"),
		ExecutionRoleArn:        &args.executionRoleArn,
		Memory:                  aws.String("2 GB"),
		NetworkMode:             "awsvpc",
		RequiresCompatibilities: []types.Compatibility{types.CompatibilityFargate},
		TaskRoleArn:             &args.taskRoleArn,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to register task definition: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ecs:task-definition",
		ResourceId:   *createdTaskDef.TaskDefinition.TaskDefinitionArn,
	})

	return createdResources, &createTaskDefinitionResp{
		taskDefinitionArn: *createdTaskDef.TaskDefinition.TaskDefinitionArn,
	}, nil
}

Next, we’ll create a helper to create an ECS service.

// createEcsServiceArgs bundles the inputs of createEcsService.
//
// envMap, port, command, entrypoint, mainContainerName, image,
// executionRoleArn and taskRoleArn are forwarded to createTaskDefinition.
// serviceName names the ECS cluster, the task definition family and the
// service. targetGroupArn is the load balancer target group the main
// container is registered with. subnetIds and ecsServiceSecurityGroupIds make
// up the network configuration of the service. instanceCount is the number of
// tasks requested by the caller.
type createEcsServiceArgs struct {
	envMap                     map[string]string
	port                       int
	command                    []string
	entrypoint                 []string
	mainContainerName          string
	serviceName                string
	image                      *ServiceImageInputs
	executionRoleArn           string
	taskRoleArn                string
	targetGroupArn             string
	subnetIds                  []string
	ecsServiceSecurityGroupIds []string
	instanceCount              int
}

// createEcsServiceResp is the result of createEcsService and carries the ARN
// of the created service and of the task definition it runs.
type createEcsServiceResp struct {
	serviceArn        string
	taskDefinitionArn string
}

// createEcsService creates an ECS cluster named args.serviceName, registers a
// task definition for the container described by args, and starts a Fargate
// service in that cluster running args.instanceCount tasks.
//
// The service is wired to the load balancer target group args.targetGroupArn,
// runs in the given subnets and security groups with public IPs assigned, and
// uses the ECS rolling deployment controller with the deployment circuit
// breaker (including automatic rollback) enabled.
//
// The returned slice always contains every resource created up to the point
// of return, including on error, so callers can roll back. The response is
// nil whenever an error is returned.
func createEcsService(ctx context.Context, logger logrus.FieldLogger, clients *clients, args createEcsServiceArgs) ([]CreatedResource, *createEcsServiceResp, error) {
	createdResources := make([]CreatedResource, 0)

	logger.Debugf("Creating ECS cluster")

	createdCluster, err := clients.ecs.CreateCluster(ctx, &ecs.CreateClusterInput{
		ClusterName: aws.String(args.serviceName),
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create cluster: %w", err)
	}

	createdResources = append(createdResources, CreatedResource{
		ResourceType: "ecs:cluster",
		ResourceId:   *createdCluster.Cluster.ClusterName,
	})

	created, taskDefResp, err := createTaskDefinition(ctx, logger, clients, createTaskDefinitionArgs{
		envMap:            args.envMap,
		port:              args.port,
		image:             args.image,
		executionRoleArn:  args.executionRoleArn,
		command:           args.command,
		entrypoint:        args.entrypoint,
		mainContainerName: args.mainContainerName,
		serviceName:       args.serviceName,
		taskRoleArn:       args.taskRoleArn,
	})
	// Track partially created resources before looking at the error.
	createdResources = append(createdResources, created...)
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create task definition: %w", err)
	}

	logger.Debugf("Creating ECS service")

	createdSvc, err := clients.ecs.CreateService(ctx, &ecs.CreateServiceInput{
		Cluster:     createdCluster.Cluster.ClusterArn,
		ServiceName: aws.String(args.serviceName),
		DeploymentConfiguration: &types.DeploymentConfiguration{
			DeploymentCircuitBreaker: &types.DeploymentCircuitBreaker{
				Enable:   true,
				Rollback: true,
			},
			MaximumPercent:        aws.Int32(200),
			MinimumHealthyPercent: aws.Int32(100),
		},
		DeploymentController: &types.DeploymentController{
			Type: types.DeploymentControllerTypeEcs,
		},
		// Honor the requested instance count instead of always starting one task.
		DesiredCount:                  aws.Int32(int32(args.instanceCount)),
		HealthCheckGracePeriodSeconds: aws.Int32(10),
		LaunchType:                    types.LaunchTypeFargate,
		LoadBalancers: []types.LoadBalancer{
			{
				ContainerName:  aws.String(args.mainContainerName),
				ContainerPort:  aws.Int32(int32(args.port)),
				TargetGroupArn: &args.targetGroupArn,
			},
		},
		NetworkConfiguration: &types.NetworkConfiguration{
			AwsvpcConfiguration: &types.AwsVpcConfiguration{
				AssignPublicIp: types.AssignPublicIpEnabled,
				Subnets:        args.subnetIds,
				SecurityGroups: args.ecsServiceSecurityGroupIds,
			},
		},
		TaskDefinition: &taskDefResp.taskDefinitionArn,
	})
	if err != nil {
		return createdResources, nil, fmt.Errorf("failed to create service: %w", err)
	}

	// The Use tag and cluster ARN let UpdateService find this service again later.
	createdResources = append(createdResources, CreatedResource{
		Use:           "ECSService",
		ResourceType:  "ecs:service",
		ResourceId:    *createdSvc.Service.ServiceArn,
		EcsClusterArn: *createdSvc.Service.ClusterArn,
	})

	return createdResources, &createEcsServiceResp{
		serviceArn:        *createdSvc.Service.ServiceArn,
		taskDefinitionArn: taskDefResp.taskDefinitionArn,
	}, nil
}

Finally, we can create a public function that puts it all together, provisioning roles, networking, certificates, load balancing, followed by task definition and service.

// Fixed settings shared by service creation and updates: mainContainerName is
// the name given to the service's container, and containerPort is the port
// passed on as the container port.
const (
	mainContainerName string = "main"
	containerPort     int    = 8080
)

// ServiceInputs is the user-facing description of a service to deploy.
//
// Hostname is the domain the certificate is issued for and the load balancer
// routes on. InstanceCount is optional; CreateService falls back to 1 when it
// is nil. Env, Command, Entrypoint and Image are forwarded to the container's
// task definition.
type ServiceInputs struct {
	Hostname      string              `json:"hostname"`
	InstanceCount *int                `json:"instanceCount"`
	Env           map[string]string   `json:"env"`
	Command       []string            `json:"command"`
	Entrypoint    []string            `json:"entrypoint"`
	Image         *ServiceImageInputs `json:"image"`
}

// CreateService deploys a complete service: it loads availability zones for
// the region, then provisions IAM roles, networking, a TLS certificate, load
// balancing and finally the ECS service itself, in that order.
//
// Each step's resources are recorded before its error is inspected, so the
// returned slice lists everything that was created even when a later step
// fails and can be used to roll back.
func CreateService(ctx context.Context, logger logrus.FieldLogger, serviceName, region, zoneId string, inputs *ServiceInputs, clients *clients) ([]CreatedResource, error) {
	tracked := make([]CreatedResource, 0)

	zones, err := loadAZs(ctx, logger, region, clients, 3)
	if err != nil {
		return tracked, fmt.Errorf("failed to load availability zones: %w", err)
	}

	// IAM roles for task execution and for the task itself.
	roleResources, roles, err := createRoles(ctx, logger, clients)
	tracked = append(tracked, roleResources...)
	if err != nil {
		return tracked, fmt.Errorf("failed to create roles: %w", err)
	}

	// Networking across the loaded availability zones.
	networkResources, networking, err := createNetworking(ctx, logger, clients, serviceName, zones)
	tracked = append(tracked, networkResources...)
	if err != nil {
		return tracked, fmt.Errorf("failed to create networking: %w", err)
	}

	// Certificate for the service's hostname.
	certArgs := createCertificateArgs{
		certificateDomain: inputs.Hostname,
		zoneId:            zoneId,
	}
	certResources, cert, err := createCertificate(ctx, logger, clients, certArgs)
	tracked = append(tracked, certResources...)
	if err != nil {
		return tracked, fmt.Errorf("failed to create certificate: %w", err)
	}

	// Load balancer, listeners and DNS record.
	lbArgs := loadBalancingArgs{
		serviceName:        serviceName,
		certificateArn:     cert.certificateArn,
		vpcId:              networking.vpcId,
		cloudflareZoneId:   zoneId,
		serviceHostname:    inputs.Hostname,
		subnetIds:          networking.subnetIds,
		lbSecurityGroupIds: networking.lbSecurityGroupIds,
	}
	lbResources, loadBalancing, err := createLoadBalancing(ctx, logger, clients, lbArgs)
	tracked = append(tracked, lbResources...)
	if err != nil {
		return tracked, fmt.Errorf("failed to create load balancing: %w", err)
	}

	// A single instance is used unless the caller asked for a specific count.
	desiredInstances := 1
	if requested := inputs.InstanceCount; requested != nil {
		desiredInstances = *requested
	}

	// Cluster, task definition and service.
	ecsArgs := createEcsServiceArgs{
		envMap:                     inputs.Env,
		port:                       containerPort,
		command:                    inputs.Command,
		entrypoint:                 inputs.Entrypoint,
		mainContainerName:          mainContainerName,
		serviceName:                serviceName,
		image:                      inputs.Image,
		executionRoleArn:           roles.taskExecutionRoleArn,
		taskRoleArn:                roles.taskRoleArn,
		targetGroupArn:             loadBalancing.targetGroupArn,
		subnetIds:                  networking.subnetIds,
		ecsServiceSecurityGroupIds: networking.ecsSecurityGroupIds,
		instanceCount:              desiredInstances,
	}
	ecsResources, _, err := createEcsService(ctx, logger, clients, ecsArgs)
	tracked = append(tracked, ecsResources...)
	if err != nil {
		return tracked, fmt.Errorf("failed to create ecs service: %w", err)
	}

	return tracked, nil
}

We’ll also add a way to update our service. To know which service we need to update, we’ll simply go through the list of created resources, which acts as a store for our deployment state.

// UpdateServiceArgs describes the new desired configuration of an existing
// service. EnvMap, Command, Entrypoint and Image go into the new task
// definition; InstanceCount becomes the service's desired task count.
type UpdateServiceArgs struct {
	EnvMap        map[string]string
	Command       []string
	Entrypoint    []string
	Image         *ServiceImageInputs
	InstanceCount int
}

// UpdateService rolls out a new configuration for an already deployed
// service. It registers a new task definition from args and then updates the
// ECS service to use it, forcing a new deployment with args.InstanceCount
// desired tasks.
//
// state is the list of resources recorded when the service was created. It is
// searched for the task execution role, the task role and the ECS service
// (identified by their Use tag), and for the ECS cluster, whose name is the
// service name and therefore the task definition family. If no cluster entry
// is present, the first state entry's resource ID is used as the family name.
//
// The returned slice contains the resources created by this call (the new
// task definition and anything created alongside it), including when the
// final service update fails. An error is returned without creating anything
// if state holds no ECS service or no usable service name.
func UpdateService(ctx context.Context, logger logrus.FieldLogger, state []CreatedResource, clients *clients, args UpdateServiceArgs) ([]CreatedResource, error) {
	createdResources := make([]CreatedResource, 0)

	var (
		executionRoleArn string
		taskRoleArn      string
		// NOTE(review): nothing in the recorded state is mapped to the
		// private registry secret here, so this is always passed on empty;
		// confirm which resource should populate it.
		privateRegistrySecretArn string
		serviceArn               string
		clusterArn               string
		serviceName              string
	)

	// Read the previously recorded deployment state, not the (still empty)
	// list of resources created by this call.
	for _, resource := range state {
		switch resource.Use {
		case "TaskExecutionRole":
			executionRoleArn = resource.ResourceId
		case "TaskRole":
			taskRoleArn = resource.ResourceId
		case "ECSService":
			serviceArn = resource.ResourceId
			clusterArn = resource.EcsClusterArn
		}

		// The cluster is created under the service name, which is also the
		// task definition family used at creation time.
		if resource.ResourceType == "ecs:cluster" {
			serviceName = resource.ResourceId
		}
	}

	// Fall back to the first state entry when no cluster was recorded.
	if serviceName == "" && len(state) > 0 {
		serviceName = state[0].ResourceId
	}

	// Validate before registering anything so a bad state does not leave an
	// orphaned task definition behind.
	if serviceName == "" {
		return createdResources, fmt.Errorf("failed to update service: no service name found in state")
	}
	if serviceArn == "" {
		return createdResources, fmt.Errorf("failed to update service: no ECS service found in state")
	}

	// Create new task definition with updated details
	created, resp, err := createTaskDefinition(ctx, logger, clients, createTaskDefinitionArgs{
		envMap:                   args.EnvMap,
		port:                     containerPort,
		image:                    args.Image,
		privateRegistrySecretArn: privateRegistrySecretArn,
		executionRoleArn:         executionRoleArn,
		command:                  args.Command,
		entrypoint:               args.Entrypoint,
		mainContainerName:        mainContainerName,
		serviceName:              serviceName,
		taskRoleArn:              taskRoleArn,
	})
	// Track partially created resources before looking at the error.
	createdResources = append(createdResources, created...)
	if err != nil {
		return createdResources, fmt.Errorf("failed to create task definition: %w", err)
	}

	// Update service with new task definition
	_, err = clients.ecs.UpdateService(ctx, &ecs.UpdateServiceInput{
		Service:            &serviceArn,
		Cluster:            &clusterArn,
		DesiredCount:       aws.Int32(int32(args.InstanceCount)),
		ForceNewDeployment: true,
		TaskDefinition:     &resp.taskDefinitionArn,
	})
	if err != nil {
		return createdResources, fmt.Errorf("failed to update service: %w", err)
	}

	return createdResources, nil
}

That was quite intense, but we managed to deploy a container to ECS, with scalable load balancing, multi-AZ availability, automatic TLS, and support for custom image registries. As I said earlier, you probably wouldn’t want to go through this yourself; instead, I’d recommend using a managed service, like managed containers on Anzu.

Bruno Scheufler

Software Engineering, Management

Projects

On other platforms