diff --git a/backend/remote-state/consul/backend.go b/backend/remote-state/consul/backend.go index d0cfa6ca4..271a60b63 100644 --- a/backend/remote-state/consul/backend.go +++ b/backend/remote-state/consul/backend.go @@ -120,11 +120,7 @@ func (b *Backend) configure(ctx context.Context) error { config := consulapi.DefaultConfig() // replace the default Transport Dialer to reduce the KeepAlive - - config.Transport.DialContext = (&net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 17 * time.Second, - }).DialContext + config.Transport.DialContext = dialContext if v, ok := data.GetOk("access_token"); ok && v.(string) != "" { config.Token = v.(string) @@ -175,3 +171,10 @@ func (b *Backend) configure(ctx context.Context) error { b.client = client return nil } + +// dialContext is the DialContext function for the consul client transport. +// This is stored in a package var to inject a different dialer for tests. +var dialContext = (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 17 * time.Second, +}).DialContext diff --git a/backend/remote-state/consul/backend_state.go b/backend/remote-state/consul/backend_state.go index 8002d9830..95010aa0e 100644 --- a/backend/remote-state/consul/backend_state.go +++ b/backend/remote-state/consul/backend_state.go @@ -85,6 +85,11 @@ func (b *Backend) State(name string) (state.State, error) { stateMgr = &state.LockDisabled{Inner: stateMgr} } + // the default state always exists + if name == backend.DefaultStateName { + return stateMgr, nil + } + // Grab a lock, we use this to write an empty state if one doesn't // exist already. We have to write an empty state as a sentinel value // so States() knows it exists. diff --git a/backend/remote-state/consul/client.go b/backend/remote-state/consul/client.go index e18021091..bd37712f3 100644 --- a/backend/remote-state/consul/client.go +++ b/backend/remote-state/consul/client.go @@ -32,6 +32,8 @@ const ( lockReacquireInterval = 2 * time.Second ) +var lostLockErr = errors.New("consul lock was lost") + // RemoteClient is a remote client that stores data in Consul. type RemoteClient struct { Client *consulapi.Client @@ -409,7 +411,7 @@ func (c *RemoteClient) unlock(id string) error { select { case <-c.lockCh: - return errors.New("consul lock was lost") + return lostLockErr default: } diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go index 910c5e9eb..09e1a7d9a 100644 --- a/backend/remote-state/consul/client_test.go +++ b/backend/remote-state/consul/client_test.go @@ -1,7 +1,10 @@ package consul import ( + "context" "fmt" + "net" + "sync" "testing" "time" @@ -188,3 +191,100 @@ func TestConsul_lostLock(t *testing.T) { t.Fatal(err) } } + +func TestConsul_lostLockConnection(t *testing.T) { + srv := newConsulTestServer(t) + defer srv.Stop() + + // create an "unreliable" network by closing all the consul client's + // network connections + conns := &unreliableConns{} + origDialFn := dialContext + defer func() { + dialContext = origDialFn + }() + dialContext = conns.DialContext + + path := fmt.Sprintf("tf-unit/%s", time.Now().String()) + + b := backend.TestBackendConfig(t, New(), map[string]interface{}{ + "address": srv.HTTPAddr, + "path": path, + }) + + s, err := b.State(backend.DefaultStateName) + if err != nil { + t.Fatal(err) + } + + info := state.NewLockInfo() + info.Operation = "test-lost-lock-connection" + id, err := s.Lock(info) + if err != nil { + t.Fatal(err) + } + + // set a callback to know when the monitor loop re-connects + dialed := make(chan struct{}) + conns.dialCallback = func() { + close(dialed) + conns.dialCallback = nil + } + + // kill any open connections + // once the consul client is fixed, we should loop over this a few time to + // be sure, since we can't hook into the client's internal lock monitor + // loop. + conns.Kill() + // wait for a new connection to be dialed, and kill it again + <-dialed + conns.Kill() + + // since the lock monitor loop is hidden in the consul api client, we can + // only wait a bit to make sure we were notified of the failure + time.Sleep(time.Second) + + // once the consul client can reconnect properly, there will no longer be + // an error here + //if err := s.Unlock(id); err != nil { + if err := s.Unlock(id); err != lostLockErr { + t.Fatalf("expected lost lock error, got %v", err) + } +} + +type unreliableConns struct { + sync.Mutex + conns []net.Conn + dialCallback func() +} + +func (u *unreliableConns) DialContext(ctx context.Context, netw, addr string) (net.Conn, error) { + u.Lock() + defer u.Unlock() + + dialer := &net.Dialer{} + conn, err := dialer.DialContext(ctx, netw, addr) + if err != nil { + return nil, err + } + + u.conns = append(u.conns, conn) + + if u.dialCallback != nil { + u.dialCallback() + } + + return conn, nil +} + +// Kill these with a deadline, just to make sure we don't end up with any EOFs +// that get ignored. +func (u *unreliableConns) Kill() { + u.Lock() + defer u.Unlock() + + for _, conn := range u.conns { + conn.(*net.TCPConn).SetDeadline(time.Now()) + } + u.conns = nil +}