From d0ecb232ae96c8a59fb6e3a0fc9a4d92d1f14fe4 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Sun, 8 Oct 2017 11:24:43 -0400 Subject: [PATCH 1/4] record consul session ID in lock info This can help correlate TF and consul logs --- backend/remote-state/consul/client.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/remote-state/consul/client.go b/backend/remote-state/consul/client.go index fe14e2c0c..e18021091 100644 --- a/backend/remote-state/consul/client.go +++ b/backend/remote-state/consul/client.go @@ -228,6 +228,9 @@ func (c *RemoteClient) lock() (string, error) { return "", err } + // store the session ID for correlation with consul logs + c.info.Info = "consul session: " + lockSession + opts := &consulapi.LockOptions{ Key: c.Path + lockSuffix, Session: lockSession, From fd9adcdb36603461318d81b808a400b6d35489a8 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Sun, 8 Oct 2017 11:26:05 -0400 Subject: [PATCH 2/4] only init one consul client, and lower keepalive The consul Client is analogous to an http.Client, and we really don't need more than 1. Configure a single client and store it in the backend. Replace the default Transport's Dialer to reduce the KeepAlive setting from 30s to 17s. This avoids racing with the common network timeout value of 30s, and is also coprime to other common intervals. --- backend/remote-state/consul/backend.go | 25 ++++++++++++++------ backend/remote-state/consul/backend_state.go | 24 +++---------------- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/backend/remote-state/consul/backend.go b/backend/remote-state/consul/backend.go index ce7c2ee1f..d0cfa6ca4 100644 --- a/backend/remote-state/consul/backend.go +++ b/backend/remote-state/consul/backend.go @@ -2,7 +2,9 @@ package consul import ( "context" + "net" "strings" + "time" consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/terraform/backend" @@ -100,6 +102,7 @@ type Backend struct { *schema.Backend // The fields below are set from configure + client *consulapi.Client configData *schema.ResourceData lock bool } @@ -111,16 +114,18 @@ func (b *Backend) configure(ctx context.Context) error { // Store the lock information b.lock = b.configData.Get("lock").(bool) - // Initialize a client to test config - _, err := b.clientRaw() - return err -} - -func (b *Backend) clientRaw() (*consulapi.Client, error) { data := b.configData // Configure the client config := consulapi.DefaultConfig() + + // replace the default Transport Dialer to reduce the KeepAlive + + config.Transport.DialContext = (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 17 * time.Second, + }).DialContext + if v, ok := data.GetOk("access_token"); ok && v.(string) != "" { config.Token = v.(string) } @@ -162,5 +167,11 @@ func (b *Backend) clientRaw() (*consulapi.Client, error) { } } - return consulapi.NewClient(config) + client, err := consulapi.NewClient(config) + if err != nil { + return err + } + + b.client = client + return nil } diff --git a/backend/remote-state/consul/backend_state.go b/backend/remote-state/consul/backend_state.go index e77729446..8002d9830 100644 --- a/backend/remote-state/consul/backend_state.go +++ b/backend/remote-state/consul/backend_state.go @@ -15,15 +15,9 @@ const ( ) func (b *Backend) States() ([]string, error) { - // Get the Consul client - client, err := b.clientRaw() - if err != nil { - return nil, err - } - // List our raw path prefix := b.configData.Get("path").(string) + keyEnvPrefix - keys, _, err := client.KV().Keys(prefix, "/", nil) + keys, _, err := b.client.KV().Keys(prefix, "/", nil) if err != nil { return nil, err } @@ -60,28 +54,16 @@ func (b *Backend) DeleteState(name string) error { return fmt.Errorf("can't delete default state") } - // Get the Consul API client - client, err := b.clientRaw() - if err != nil { - return err - } - // Determine the path of the data path := b.path(name) // Delete it. We just delete it without any locking since // the DeleteState API is documented as such. - _, err = client.KV().Delete(path, nil) + _, err := b.client.KV().Delete(path, nil) return err } func (b *Backend) State(name string) (state.State, error) { - // Get the Consul API client - client, err := b.clientRaw() - if err != nil { - return nil, err - } - // Determine the path of the data path := b.path(name) @@ -91,7 +73,7 @@ func (b *Backend) State(name string) (state.State, error) { // Build the state client var stateMgr state.State = &remote.State{ Client: &RemoteClient{ - Client: client, + Client: b.client, Path: path, GZip: gzip, lockState: b.lock, From 25a8227291715a499ce4ed957d6d9f79d041b46a Mon Sep 17 00:00:00 2001 From: James Bardin Date: Sun, 8 Oct 2017 12:57:11 -0400 Subject: [PATCH 3/4] add broken test for lock lost on connection error Add a way to inject network errors by setting an immediate deadline on open consul connections. The consul client currently doesn't retry on some errors, and will force us to lose our lock. Once the consul api client is fixed, this test will fail. --- backend/remote-state/consul/backend.go | 13 ++- backend/remote-state/consul/backend_state.go | 5 + backend/remote-state/consul/client.go | 4 +- backend/remote-state/consul/client_test.go | 100 +++++++++++++++++++ 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/backend/remote-state/consul/backend.go b/backend/remote-state/consul/backend.go index d0cfa6ca4..271a60b63 100644 --- a/backend/remote-state/consul/backend.go +++ b/backend/remote-state/consul/backend.go @@ -120,11 +120,7 @@ func (b *Backend) configure(ctx context.Context) error { config := consulapi.DefaultConfig() // replace the default Transport Dialer to reduce the KeepAlive - - config.Transport.DialContext = (&net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 17 * time.Second, - }).DialContext + config.Transport.DialContext = dialContext if v, ok := data.GetOk("access_token"); ok && v.(string) != "" { config.Token = v.(string) @@ -175,3 +171,10 @@ func (b *Backend) configure(ctx context.Context) error { b.client = client return nil } + +// dialContext is the DialContext function for the consul client transport. +// This is stored in a package var to inject a different dialer for tests. +var dialContext = (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 17 * time.Second, +}).DialContext diff --git a/backend/remote-state/consul/backend_state.go b/backend/remote-state/consul/backend_state.go index 8002d9830..95010aa0e 100644 --- a/backend/remote-state/consul/backend_state.go +++ b/backend/remote-state/consul/backend_state.go @@ -85,6 +85,11 @@ func (b *Backend) State(name string) (state.State, error) { stateMgr = &state.LockDisabled{Inner: stateMgr} } + // the default state always exists + if name == backend.DefaultStateName { + return stateMgr, nil + } + // Grab a lock, we use this to write an empty state if one doesn't // exist already. We have to write an empty state as a sentinel value // so States() knows it exists. diff --git a/backend/remote-state/consul/client.go b/backend/remote-state/consul/client.go index e18021091..bd37712f3 100644 --- a/backend/remote-state/consul/client.go +++ b/backend/remote-state/consul/client.go @@ -32,6 +32,8 @@ const ( lockReacquireInterval = 2 * time.Second ) +var lostLockErr = errors.New("consul lock was lost") + // RemoteClient is a remote client that stores data in Consul. type RemoteClient struct { Client *consulapi.Client @@ -409,7 +411,7 @@ func (c *RemoteClient) unlock(id string) error { select { case <-c.lockCh: - return errors.New("consul lock was lost") + return lostLockErr default: } diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go index 910c5e9eb..09e1a7d9a 100644 --- a/backend/remote-state/consul/client_test.go +++ b/backend/remote-state/consul/client_test.go @@ -1,7 +1,10 @@ package consul import ( + "context" "fmt" + "net" + "sync" "testing" "time" @@ -188,3 +191,100 @@ func TestConsul_lostLock(t *testing.T) { t.Fatal(err) } } + +func TestConsul_lostLockConnection(t *testing.T) { + srv := newConsulTestServer(t) + defer srv.Stop() + + // create an "unreliable" network by closing all the consul client's + // network connections + conns := &unreliableConns{} + origDialFn := dialContext + defer func() { + dialContext = origDialFn + }() + dialContext = conns.DialContext + + path := fmt.Sprintf("tf-unit/%s", time.Now().String()) + + b := backend.TestBackendConfig(t, New(), map[string]interface{}{ + "address": srv.HTTPAddr, + "path": path, + }) + + s, err := b.State(backend.DefaultStateName) + if err != nil { + t.Fatal(err) + } + + info := state.NewLockInfo() + info.Operation = "test-lost-lock-connection" + id, err := s.Lock(info) + if err != nil { + t.Fatal(err) + } + + // set a callback to know when the monitor loop re-connects + dialed := make(chan struct{}) + conns.dialCallback = func() { + close(dialed) + conns.dialCallback = nil + } + + // kill any open connections + // once the consul client is fixed, we should loop over this a few time to + // be sure, since we can't hook into the client's internal lock monitor + // loop. + conns.Kill() + // wait for a new connection to be dialed, and kill it again + <-dialed + conns.Kill() + + // since the lock monitor loop is hidden in the consul api client, we can + // only wait a bit to make sure we were notified of the failure + time.Sleep(time.Second) + + // once the consul client can reconnect properly, there will no longer be + // an error here + //if err := s.Unlock(id); err != nil { + if err := s.Unlock(id); err != lostLockErr { + t.Fatalf("expected lost lock error, got %v", err) + } +} + +type unreliableConns struct { + sync.Mutex + conns []net.Conn + dialCallback func() +} + +func (u *unreliableConns) DialContext(ctx context.Context, netw, addr string) (net.Conn, error) { + u.Lock() + defer u.Unlock() + + dialer := &net.Dialer{} + conn, err := dialer.DialContext(ctx, netw, addr) + if err != nil { + return nil, err + } + + u.conns = append(u.conns, conn) + + if u.dialCallback != nil { + u.dialCallback() + } + + return conn, nil +} + +// Kill these with a deadline, just to make sure we don't end up with any EOFs +// that get ignored. +func (u *unreliableConns) Kill() { + u.Lock() + defer u.Unlock() + + for _, conn := range u.conns { + conn.(*net.TCPConn).SetDeadline(time.Now()) + } + u.conns = nil +} From f5e9a20c660e0dbd5bc5c7a01571d3caccec597e Mon Sep 17 00:00:00 2001 From: James Bardin Date: Sun, 8 Oct 2017 16:24:45 -0400 Subject: [PATCH 4/4] reset testLockHook --- backend/remote-state/consul/client_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go index 09e1a7d9a..ef3c5b187 100644 --- a/backend/remote-state/consul/client_test.go +++ b/backend/remote-state/consul/client_test.go @@ -176,6 +176,7 @@ func TestConsul_lostLock(t *testing.T) { reLocked := make(chan struct{}) testLockHook = func() { close(reLocked) + testLockHook = nil } // now we use the second client to break the lock