provider/aws: Wait for Spot Fleet to drain before removing from state (#8938)

* provider/aws: Wait for Spot Fleet to drain before removing from state

Ensures the spot fleet is drained before reporting successful destroy
and moving on

* remove unreachable code

* hack to sleep and test regression/leak

* fix broken english in warning
Clint authored 2016-09-22 15:22:27 -05:00, committed by GitHub
parent b0e751129a
commit becdfef87b
2 changed files with 48 additions and 2 deletions
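
The change the commit message describes comes down to two steps: confirm that EC2 actually accepted the cancellation, then wait for the fleet's instances to drain before treating the destroy as complete. As a point of reference, the first step looks roughly like this as a standalone aws-sdk-go function (a sketch only; the package name, the cancelSpotFleet helper, and its signature are illustrative and not part of the commit):

package spotfleetexample

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/ec2"
)

// cancelSpotFleet cancels a Spot Fleet request and confirms that EC2 reported
// the cancellation as successful, mirroring the check added in this commit.
func cancelSpotFleet(conn *ec2.EC2, id string, terminateInstances bool) error {
	resp, err := conn.CancelSpotFleetRequests(&ec2.CancelSpotFleetRequestsInput{
		SpotFleetRequestIds: []*string{aws.String(id)},
		TerminateInstances:  aws.Bool(terminateInstances),
	})
	if err != nil {
		return fmt.Errorf("error cancelling spot fleet request (%s): %s", id, err)
	}

	// CancelSpotFleetRequests reports per-request results, so check that our
	// ID is in the successful set rather than assuming the call worked.
	for _, s := range resp.SuccessfulFleetRequests {
		if aws.StringValue(s.SpotFleetRequestId) == id {
			return nil
		}
	}
	return fmt.Errorf("spot fleet request (%s) was not reported as successfully canceled", id)
}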

resource_aws_spot_fleet_request.go

@@ -936,7 +936,7 @@ func resourceAwsSpotFleetRequestDelete(d *schema.ResourceData, meta interface{})
conn := meta.(*AWSClient).ec2conn
log.Printf("[INFO] Cancelling spot fleet request: %s", d.Id())
_, err := conn.CancelSpotFleetRequests(&ec2.CancelSpotFleetRequestsInput{
resp, err := conn.CancelSpotFleetRequests(&ec2.CancelSpotFleetRequestsInput{
SpotFleetRequestIds: []*string{aws.String(d.Id())},
TerminateInstances: aws.Bool(d.Get("terminate_instances_with_expiration").(bool)),
})
@@ -945,7 +945,36 @@ func resourceAwsSpotFleetRequestDelete(d *schema.ResourceData, meta interface{})
return fmt.Errorf("Error cancelling spot request (%s): %s", d.Id(), err)
}
return nil
// check response successfulFleetRequestSet to make sure our request was canceled
var found bool
for _, s := range resp.SuccessfulFleetRequests {
if *s.SpotFleetRequestId == d.Id() {
found = true
}
}
if !found {
return fmt.Errorf("[ERR] Spot Fleet request (%s) was not found to be successfully canceled, dangling resources may exit", d.Id())
}
return resource.Retry(5*time.Minute, func() *resource.RetryError {
resp, err := conn.DescribeSpotFleetInstances(&ec2.DescribeSpotFleetInstancesInput{
SpotFleetRequestId: aws.String(d.Id()),
})
if err != nil {
return resource.NonRetryableError(err)
}
if len(resp.ActiveInstances) == 0 {
log.Printf("[DEBUG] Active instance count is 0 for Spot Fleet Request (%s), removing", d.Id())
return nil
}
log.Printf("[DEBUG] Active instance count in Spot Fleet Request (%s): %d", d.Id(), len(resp.ActiveInstances))
return resource.RetryableError(
fmt.Errorf("fleet still has (%d) running instances", len(resp.ActiveInstances)))
})
}
func hashEphemeralBlockDevice(v interface{}) int {

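The resource.Retry call above is Terraform's helper for polling with a timeout: the closure is re-invoked while it returns a RetryableError, stops immediately on a NonRetryableError, and succeeds when it returns nil. For readers unfamiliar with the helper/resource package, the drain wait is roughly equivalent to this plain polling loop (a sketch only; the package name, the waitForSpotFleetDrain helper, and the 15-second poll interval are illustrative, since resource.Retry manages its own retry cadence):

package spotfleetexample

import (
	"fmt"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/ec2"
)

// waitForSpotFleetDrain polls DescribeSpotFleetInstances until the fleet
// reports zero active instances or the timeout elapses, mirroring the
// resource.Retry block in the diff above.
func waitForSpotFleetDrain(conn *ec2.EC2, id string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		resp, err := conn.DescribeSpotFleetInstances(&ec2.DescribeSpotFleetInstancesInput{
			SpotFleetRequestId: aws.String(id),
		})
		if err != nil {
			// Treat API errors as permanent, like NonRetryableError above.
			return err
		}
		if len(resp.ActiveInstances) == 0 {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("spot fleet request (%s) still has %d active instances after %s",
				id, len(resp.ActiveInstances), timeout)
		}
		time.Sleep(15 * time.Second) // illustrative poll interval
	}
}

In the resource itself the same decisions are expressed through the helper: API errors become NonRetryableError and surface immediately, while a non-empty ActiveInstances set becomes a RetryableError so polling continues until the 5-minute window elapses.
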
resource_aws_spot_fleet_request_test.go

@@ -3,7 +3,9 @@ package aws
import (
"encoding/base64"
"fmt"
"log"
"testing"
"time"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
@@ -248,6 +250,20 @@ func TestAccAWSSpotFleetRequest_withWeightedCapacity(t *testing.T) {
var sfr ec2.SpotFleetRequestConfig
rName := acctest.RandString(10)
fulfillSleep := func() resource.TestCheckFunc {
// sleep so that EC2 can fulfill the request. We do this to guard against a
// regression and possible leak where we'd destroy the request and the
// associated IAM role before anything is actually provisioned and running,
// thus leaking the newly started instances when their destruction is attempted
// See https://github.com/hashicorp/terraform/pull/8938
return func(s *terraform.State) error {
log.Printf("[DEBUG] Test: Sleep to allow EC2 to actually begin fulfilling TestAccAWSSpotFleetRequest_withWeightedCapacity request")
time.Sleep(1 * time.Minute)
return nil
}
}
resource.Test(t, resource.TestCase{
PreCheck: func() { testAccPreCheck(t) },
Providers: testAccProviders,
@@ -256,6 +272,7 @@ func TestAccAWSSpotFleetRequest_withWeightedCapacity(t *testing.T) {
resource.TestStep{
Config: testAccAWSSpotFleetRequestConfigWithWeightedCapacity(rName),
Check: resource.ComposeAggregateTestCheckFunc(
fulfillSleep(),
testAccCheckAWSSpotFleetRequestExists(
"aws_spot_fleet_request.foo", &sfr),
resource.TestCheckResourceAttr(