diff --git a/backend/remote-state/consul/backend.go b/backend/remote-state/consul/backend.go index ce7c2ee1f..271a60b63 100644 --- a/backend/remote-state/consul/backend.go +++ b/backend/remote-state/consul/backend.go @@ -2,7 +2,9 @@ package consul import ( "context" + "net" "strings" + "time" consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/terraform/backend" @@ -100,6 +102,7 @@ type Backend struct { *schema.Backend // The fields below are set from configure + client *consulapi.Client configData *schema.ResourceData lock bool } @@ -111,16 +114,14 @@ func (b *Backend) configure(ctx context.Context) error { // Store the lock information b.lock = b.configData.Get("lock").(bool) - // Initialize a client to test config - _, err := b.clientRaw() - return err -} - -func (b *Backend) clientRaw() (*consulapi.Client, error) { data := b.configData // Configure the client config := consulapi.DefaultConfig() + + // replace the default Transport Dialer to reduce the KeepAlive + config.Transport.DialContext = dialContext + if v, ok := data.GetOk("access_token"); ok && v.(string) != "" { config.Token = v.(string) } @@ -162,5 +163,18 @@ func (b *Backend) clientRaw() (*consulapi.Client, error) { } } - return consulapi.NewClient(config) + client, err := consulapi.NewClient(config) + if err != nil { + return err + } + + b.client = client + return nil } + +// dialContext is the DialContext function for the consul client transport. +// This is stored in a package var to inject a different dialer for tests. +var dialContext = (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 17 * time.Second, +}).DialContext diff --git a/backend/remote-state/consul/backend_state.go b/backend/remote-state/consul/backend_state.go index e77729446..95010aa0e 100644 --- a/backend/remote-state/consul/backend_state.go +++ b/backend/remote-state/consul/backend_state.go @@ -15,15 +15,9 @@ const ( ) func (b *Backend) States() ([]string, error) { - // Get the Consul client - client, err := b.clientRaw() - if err != nil { - return nil, err - } - // List our raw path prefix := b.configData.Get("path").(string) + keyEnvPrefix - keys, _, err := client.KV().Keys(prefix, "/", nil) + keys, _, err := b.client.KV().Keys(prefix, "/", nil) if err != nil { return nil, err } @@ -60,28 +54,16 @@ func (b *Backend) DeleteState(name string) error { return fmt.Errorf("can't delete default state") } - // Get the Consul API client - client, err := b.clientRaw() - if err != nil { - return err - } - // Determine the path of the data path := b.path(name) // Delete it. We just delete it without any locking since // the DeleteState API is documented as such. - _, err = client.KV().Delete(path, nil) + _, err := b.client.KV().Delete(path, nil) return err } func (b *Backend) State(name string) (state.State, error) { - // Get the Consul API client - client, err := b.clientRaw() - if err != nil { - return nil, err - } - // Determine the path of the data path := b.path(name) @@ -91,7 +73,7 @@ func (b *Backend) State(name string) (state.State, error) { // Build the state client var stateMgr state.State = &remote.State{ Client: &RemoteClient{ - Client: client, + Client: b.client, Path: path, GZip: gzip, lockState: b.lock, @@ -103,6 +85,11 @@ func (b *Backend) State(name string) (state.State, error) { stateMgr = &state.LockDisabled{Inner: stateMgr} } + // the default state always exists + if name == backend.DefaultStateName { + return stateMgr, nil + } + // Grab a lock, we use this to write an empty state if one doesn't // exist already. We have to write an empty state as a sentinel value // so States() knows it exists. diff --git a/backend/remote-state/consul/client.go b/backend/remote-state/consul/client.go index fe14e2c0c..bd37712f3 100644 --- a/backend/remote-state/consul/client.go +++ b/backend/remote-state/consul/client.go @@ -32,6 +32,8 @@ const ( lockReacquireInterval = 2 * time.Second ) +var lostLockErr = errors.New("consul lock was lost") + // RemoteClient is a remote client that stores data in Consul. type RemoteClient struct { Client *consulapi.Client @@ -228,6 +230,9 @@ func (c *RemoteClient) lock() (string, error) { return "", err } + // store the session ID for correlation with consul logs + c.info.Info = "consul session: " + lockSession + opts := &consulapi.LockOptions{ Key: c.Path + lockSuffix, Session: lockSession, @@ -406,7 +411,7 @@ func (c *RemoteClient) unlock(id string) error { select { case <-c.lockCh: - return errors.New("consul lock was lost") + return lostLockErr default: } diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go index 910c5e9eb..ef3c5b187 100644 --- a/backend/remote-state/consul/client_test.go +++ b/backend/remote-state/consul/client_test.go @@ -1,7 +1,10 @@ package consul import ( + "context" "fmt" + "net" + "sync" "testing" "time" @@ -173,6 +176,7 @@ func TestConsul_lostLock(t *testing.T) { reLocked := make(chan struct{}) testLockHook = func() { close(reLocked) + testLockHook = nil } // now we use the second client to break the lock @@ -188,3 +192,100 @@ func TestConsul_lostLock(t *testing.T) { t.Fatal(err) } } + +func TestConsul_lostLockConnection(t *testing.T) { + srv := newConsulTestServer(t) + defer srv.Stop() + + // create an "unreliable" network by closing all the consul client's + // network connections + conns := &unreliableConns{} + origDialFn := dialContext + defer func() { + dialContext = origDialFn + }() + dialContext = conns.DialContext + + path := fmt.Sprintf("tf-unit/%s", time.Now().String()) + + b := backend.TestBackendConfig(t, New(), map[string]interface{}{ + "address": srv.HTTPAddr, + "path": path, + }) + + s, err := b.State(backend.DefaultStateName) + if err != nil { + t.Fatal(err) + } + + info := state.NewLockInfo() + info.Operation = "test-lost-lock-connection" + id, err := s.Lock(info) + if err != nil { + t.Fatal(err) + } + + // set a callback to know when the monitor loop re-connects + dialed := make(chan struct{}) + conns.dialCallback = func() { + close(dialed) + conns.dialCallback = nil + } + + // kill any open connections + // once the consul client is fixed, we should loop over this a few time to + // be sure, since we can't hook into the client's internal lock monitor + // loop. + conns.Kill() + // wait for a new connection to be dialed, and kill it again + <-dialed + conns.Kill() + + // since the lock monitor loop is hidden in the consul api client, we can + // only wait a bit to make sure we were notified of the failure + time.Sleep(time.Second) + + // once the consul client can reconnect properly, there will no longer be + // an error here + //if err := s.Unlock(id); err != nil { + if err := s.Unlock(id); err != lostLockErr { + t.Fatalf("expected lost lock error, got %v", err) + } +} + +type unreliableConns struct { + sync.Mutex + conns []net.Conn + dialCallback func() +} + +func (u *unreliableConns) DialContext(ctx context.Context, netw, addr string) (net.Conn, error) { + u.Lock() + defer u.Unlock() + + dialer := &net.Dialer{} + conn, err := dialer.DialContext(ctx, netw, addr) + if err != nil { + return nil, err + } + + u.conns = append(u.conns, conn) + + if u.dialCallback != nil { + u.dialCallback() + } + + return conn, nil +} + +// Kill these with a deadline, just to make sure we don't end up with any EOFs +// that get ignored. +func (u *unreliableConns) Kill() { + u.Lock() + defer u.Unlock() + + for _, conn := range u.conns { + conn.(*net.TCPConn).SetDeadline(time.Now()) + } + u.conns = nil +}