add broken test for lock lost on connection error

Add a way to inject network errors by setting an immediate deadline on
open consul connections. The consul client currently doesn't retry on
some errors, and will force us to lose our lock.

Once the consul api client is fixed, this test will fail.
This commit is contained in:
James Bardin 2017-10-08 12:57:11 -04:00
parent fd9adcdb36
commit 25a8227291
4 changed files with 116 additions and 6 deletions

View File

@ -120,11 +120,7 @@ func (b *Backend) configure(ctx context.Context) error {
config := consulapi.DefaultConfig()
// replace the default Transport Dialer to reduce the KeepAlive
config.Transport.DialContext = (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 17 * time.Second,
}).DialContext
config.Transport.DialContext = dialContext
if v, ok := data.GetOk("access_token"); ok && v.(string) != "" {
config.Token = v.(string)
@ -175,3 +171,10 @@ func (b *Backend) configure(ctx context.Context) error {
b.client = client
return nil
}
// dialContext is the DialContext function for the consul client transport.
// This is stored in a package var to inject a different dialer for tests.
var dialContext = (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 17 * time.Second,
}).DialContext

View File

@ -85,6 +85,11 @@ func (b *Backend) State(name string) (state.State, error) {
stateMgr = &state.LockDisabled{Inner: stateMgr}
}
// the default state always exists
if name == backend.DefaultStateName {
return stateMgr, nil
}
// Grab a lock, we use this to write an empty state if one doesn't
// exist already. We have to write an empty state as a sentinel value
// so States() knows it exists.

View File

@ -32,6 +32,8 @@ const (
lockReacquireInterval = 2 * time.Second
)
var lostLockErr = errors.New("consul lock was lost")
// RemoteClient is a remote client that stores data in Consul.
type RemoteClient struct {
Client *consulapi.Client
@ -409,7 +411,7 @@ func (c *RemoteClient) unlock(id string) error {
select {
case <-c.lockCh:
return errors.New("consul lock was lost")
return lostLockErr
default:
}

View File

@ -1,7 +1,10 @@
package consul
import (
"context"
"fmt"
"net"
"sync"
"testing"
"time"
@ -188,3 +191,100 @@ func TestConsul_lostLock(t *testing.T) {
t.Fatal(err)
}
}
func TestConsul_lostLockConnection(t *testing.T) {
srv := newConsulTestServer(t)
defer srv.Stop()
// create an "unreliable" network by closing all the consul client's
// network connections
conns := &unreliableConns{}
origDialFn := dialContext
defer func() {
dialContext = origDialFn
}()
dialContext = conns.DialContext
path := fmt.Sprintf("tf-unit/%s", time.Now().String())
b := backend.TestBackendConfig(t, New(), map[string]interface{}{
"address": srv.HTTPAddr,
"path": path,
})
s, err := b.State(backend.DefaultStateName)
if err != nil {
t.Fatal(err)
}
info := state.NewLockInfo()
info.Operation = "test-lost-lock-connection"
id, err := s.Lock(info)
if err != nil {
t.Fatal(err)
}
// set a callback to know when the monitor loop re-connects
dialed := make(chan struct{})
conns.dialCallback = func() {
close(dialed)
conns.dialCallback = nil
}
// kill any open connections
// once the consul client is fixed, we should loop over this a few time to
// be sure, since we can't hook into the client's internal lock monitor
// loop.
conns.Kill()
// wait for a new connection to be dialed, and kill it again
<-dialed
conns.Kill()
// since the lock monitor loop is hidden in the consul api client, we can
// only wait a bit to make sure we were notified of the failure
time.Sleep(time.Second)
// once the consul client can reconnect properly, there will no longer be
// an error here
//if err := s.Unlock(id); err != nil {
if err := s.Unlock(id); err != lostLockErr {
t.Fatalf("expected lost lock error, got %v", err)
}
}
type unreliableConns struct {
sync.Mutex
conns []net.Conn
dialCallback func()
}
func (u *unreliableConns) DialContext(ctx context.Context, netw, addr string) (net.Conn, error) {
u.Lock()
defer u.Unlock()
dialer := &net.Dialer{}
conn, err := dialer.DialContext(ctx, netw, addr)
if err != nil {
return nil, err
}
u.conns = append(u.conns, conn)
if u.dialCallback != nil {
u.dialCallback()
}
return conn, nil
}
// Kill these with a deadline, just to make sure we don't end up with any EOFs
// that get ignored.
func (u *unreliableConns) Kill() {
u.Lock()
defer u.Unlock()
for _, conn := range u.conns {
conn.(*net.TCPConn).SetDeadline(time.Now())
}
u.conns = nil
}