From 54224eb41efdc3bf654084ace3d3e05cc8ccf981 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Mon, 13 Mar 2017 18:07:10 -0400 Subject: [PATCH 1/7] fetch consul binary for tests --- .travis.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index dcb8c11d4..a548b6b77 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,18 @@ sudo: false language: go go: - 1.8 + +env: + - CONSUL_VERSION=0.7.5 + +# Fetch consul for the backend and provider tests +before_install: + - curl -sLo consul.zip https://releases.hashicorp.com/consul/${CONSUL_VERSION}/consul_${CONSUL_VERSION}_linux_amd64.zip + - unzip consul.zip + - mkdir ~/bin + - mv consul ~/bin + - export PATH="~/bin:$PATH" + install: # This script is used by the Travis build to install a cookie for # go.googlesource.com so rate limits are higher when using `go get` to fetch From 90055c6ae2dd36358270888b92d945de9434bcfb Mon Sep 17 00:00:00 2001 From: James Bardin Date: Mon, 13 Mar 2017 18:25:58 -0400 Subject: [PATCH 2/7] convert the consul backend to use consul/testutil Start up our own consul server for unit tests. --- backend/remote-state/consul/backend_test.go | 11 ++++------- backend/remote-state/consul/client_test.go | 22 ++++++++------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/backend/remote-state/consul/backend_test.go b/backend/remote-state/consul/backend_test.go index fb2c0e0f6..2fcb1c6a7 100644 --- a/backend/remote-state/consul/backend_test.go +++ b/backend/remote-state/consul/backend_test.go @@ -2,10 +2,10 @@ package consul import ( "fmt" - "os" "testing" "time" + "github.com/hashicorp/consul/testutil" "github.com/hashicorp/terraform/backend" ) @@ -14,15 +14,12 @@ func TestBackend_impl(t *testing.T) { } func TestBackend(t *testing.T) { - addr := os.Getenv("CONSUL_HTTP_ADDR") - if addr == "" { - t.Log("consul tests require CONSUL_HTTP_ADDR") - t.Skip() - } + srv := testutil.NewTestServer(t) + defer srv.Stop() // Get the backend b := backend.TestBackendConfig(t, New(), map[string]interface{}{ - "address": addr, + "address": srv.HTTPAddr, "path": fmt.Sprintf("tf-unit/%s", time.Now().String()), }) diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go index d123e39c7..6109c4c0f 100644 --- a/backend/remote-state/consul/client_test.go +++ b/backend/remote-state/consul/client_test.go @@ -2,10 +2,10 @@ package consul import ( "fmt" - "os" "testing" "time" + "github.com/hashicorp/consul/testutil" "github.com/hashicorp/terraform/backend" "github.com/hashicorp/terraform/state/remote" ) @@ -16,15 +16,12 @@ func TestRemoteClient_impl(t *testing.T) { } func TestRemoteClient(t *testing.T) { - addr := os.Getenv("CONSUL_HTTP_ADDR") - if addr == "" { - t.Log("consul tests require CONSUL_HTTP_ADDR") - t.Skip() - } + srv := testutil.NewTestServer(t) + defer srv.Stop() // Get the backend b := backend.TestBackendConfig(t, New(), map[string]interface{}{ - "address": addr, + "address": srv.HTTPAddr, "path": fmt.Sprintf("tf-unit/%s", time.Now().String()), }) @@ -39,17 +36,14 @@ func TestRemoteClient(t *testing.T) { } func TestConsul_stateLock(t *testing.T) { - addr := os.Getenv("CONSUL_HTTP_ADDR") - if addr == "" { - t.Log("consul lock tests require CONSUL_HTTP_ADDR") - t.Skip() - } + srv := testutil.NewTestServer(t) + defer srv.Stop() path := fmt.Sprintf("tf-unit/%s", time.Now().String()) // create 2 instances to get 2 remote.Clients sA, err := backend.TestBackendConfig(t, New(), map[string]interface{}{ - "address": addr, + "address": srv.HTTPAddr, "path": path, }).State(backend.DefaultStateName) if err != nil { @@ -57,7 +51,7 @@ func TestConsul_stateLock(t *testing.T) { } sB, err := backend.TestBackendConfig(t, New(), map[string]interface{}{ - "address": addr, + "address": srv.HTTPAddr, "path": path, }).State(backend.DefaultStateName) if err != nil { From 9529bd3bf040e5c7d62468aa560f98f145dabc45 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Tue, 14 Mar 2017 10:36:14 -0400 Subject: [PATCH 3/7] Have the consul provider use a local test server We're providing a local test server for the consul backends, so we might as well use it to make the provider acceptance tests more relible as well. TODO: the TLS test doesn't actualy test anything other than the Config. The tests have been modified to make it apparent that they aren't connecting to the server at all. --- .../data_source_consul_agent_self_test.go | 4 +- .../data_source_consul_catalog_nodes_test.go | 1 - ...data_source_consul_catalog_service_test.go | 1 - ...ata_source_consul_catalog_services_test.go | 1 - .../consul/data_source_consul_keys_test.go | 1 - .../consul/resource_consul_key_prefix_test.go | 1 - .../consul/resource_consul_keys_test.go | 1 - .../resource_consul_prepared_query_test.go | 1 - .../consul/resource_provider_test.go | 55 +++++++++++++++---- 9 files changed, 45 insertions(+), 21 deletions(-) diff --git a/builtin/providers/consul/data_source_consul_agent_self_test.go b/builtin/providers/consul/data_source_consul_agent_self_test.go index 341a89c9c..16d6a9ead 100644 --- a/builtin/providers/consul/data_source_consul_agent_self_test.go +++ b/builtin/providers/consul/data_source_consul_agent_self_test.go @@ -10,7 +10,6 @@ import ( func TestAccDataConsulAgentSelf_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, Steps: []resource.TestStep{ resource.TestStep{ @@ -25,7 +24,8 @@ func TestAccDataConsulAgentSelf_basic(t *testing.T) { testAccCheckDataSourceValue("data.consul_agent_self.read", "advertise_addr", ""), testAccCheckDataSourceValue("data.consul_agent_self.read", "bind_addr", ""), testAccCheckDataSourceValue("data.consul_agent_self.read", "bootstrap_expect", ""), - testAccCheckDataSourceValue("data.consul_agent_self.read", "bootstrap_mode", "false"), + // the local test server is bootstrapped + testAccCheckDataSourceValue("data.consul_agent_self.read", "bootstrap_mode", "true"), testAccCheckDataSourceValue("data.consul_agent_self.read", "client_addr", ""), testAccCheckDataSourceValue("data.consul_agent_self.read", "datacenter", ""), testAccCheckDataSourceValue("data.consul_agent_self.read", "dev_mode", ""), diff --git a/builtin/providers/consul/data_source_consul_catalog_nodes_test.go b/builtin/providers/consul/data_source_consul_catalog_nodes_test.go index 94cea160d..891295044 100644 --- a/builtin/providers/consul/data_source_consul_catalog_nodes_test.go +++ b/builtin/providers/consul/data_source_consul_catalog_nodes_test.go @@ -8,7 +8,6 @@ import ( func TestAccDataConsulCatalogNodes_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, Steps: []resource.TestStep{ resource.TestStep{ diff --git a/builtin/providers/consul/data_source_consul_catalog_service_test.go b/builtin/providers/consul/data_source_consul_catalog_service_test.go index 0ef7d1ab0..b5901c4c5 100644 --- a/builtin/providers/consul/data_source_consul_catalog_service_test.go +++ b/builtin/providers/consul/data_source_consul_catalog_service_test.go @@ -8,7 +8,6 @@ import ( func TestAccDataConsulCatalogService_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, Steps: []resource.TestStep{ resource.TestStep{ diff --git a/builtin/providers/consul/data_source_consul_catalog_services_test.go b/builtin/providers/consul/data_source_consul_catalog_services_test.go index 1087073f7..8617fc205 100644 --- a/builtin/providers/consul/data_source_consul_catalog_services_test.go +++ b/builtin/providers/consul/data_source_consul_catalog_services_test.go @@ -8,7 +8,6 @@ import ( func TestAccDataConsulCatalogServices_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, Steps: []resource.TestStep{ resource.TestStep{ diff --git a/builtin/providers/consul/data_source_consul_keys_test.go b/builtin/providers/consul/data_source_consul_keys_test.go index 09f62a927..ef9baafd6 100644 --- a/builtin/providers/consul/data_source_consul_keys_test.go +++ b/builtin/providers/consul/data_source_consul_keys_test.go @@ -8,7 +8,6 @@ import ( func TestAccDataConsulKeys_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, Steps: []resource.TestStep{ resource.TestStep{ diff --git a/builtin/providers/consul/resource_consul_key_prefix_test.go b/builtin/providers/consul/resource_consul_key_prefix_test.go index e33fb13da..141ddf446 100644 --- a/builtin/providers/consul/resource_consul_key_prefix_test.go +++ b/builtin/providers/consul/resource_consul_key_prefix_test.go @@ -11,7 +11,6 @@ import ( func TestAccConsulKeyPrefix_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, CheckDestroy: resource.ComposeTestCheckFunc( testAccCheckConsulKeyPrefixKeyAbsent("species"), diff --git a/builtin/providers/consul/resource_consul_keys_test.go b/builtin/providers/consul/resource_consul_keys_test.go index f6045658e..f24c294ed 100644 --- a/builtin/providers/consul/resource_consul_keys_test.go +++ b/builtin/providers/consul/resource_consul_keys_test.go @@ -11,7 +11,6 @@ import ( func TestAccConsulKeys_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, CheckDestroy: testAccCheckConsulKeysDestroy, Steps: []resource.TestStep{ diff --git a/builtin/providers/consul/resource_consul_prepared_query_test.go b/builtin/providers/consul/resource_consul_prepared_query_test.go index 6b08adaa8..12501c235 100644 --- a/builtin/providers/consul/resource_consul_prepared_query_test.go +++ b/builtin/providers/consul/resource_consul_prepared_query_test.go @@ -11,7 +11,6 @@ import ( func TestAccConsulPreparedQuery_basic(t *testing.T) { resource.Test(t, resource.TestCase{ - PreCheck: func() { testAccPreCheck(t) }, Providers: testAccProviders, CheckDestroy: testAccCheckConsulPreparedQueryDestroy, Steps: []resource.TestStep{ diff --git a/builtin/providers/consul/resource_provider_test.go b/builtin/providers/consul/resource_provider_test.go index df8fe1b85..9c964adb8 100644 --- a/builtin/providers/consul/resource_provider_test.go +++ b/builtin/providers/consul/resource_provider_test.go @@ -1,24 +1,64 @@ package consul import ( + "io/ioutil" + "log" "os" "testing" + "github.com/hashicorp/consul/testutil" "github.com/hashicorp/terraform/config" "github.com/hashicorp/terraform/helper/schema" "github.com/hashicorp/terraform/terraform" + "github.com/mitchellh/mapstructure" ) var testAccProviders map[string]terraform.ResourceProvider var testAccProvider *schema.Provider +var testConsulHTTPAddr string func init() { testAccProvider = Provider().(*schema.Provider) + testAccProvider.ConfigureFunc = testProviderConfigure + testAccProviders = map[string]terraform.ResourceProvider{ "consul": testAccProvider, } } +// we need to overrride the configured address for the tests +func testProviderConfigure(d *schema.ResourceData) (interface{}, error) { + var config Config + configRaw := d.Get("").(map[string]interface{}) + if err := mapstructure.Decode(configRaw, &config); err != nil { + return nil, err + } + config.Address = testConsulHTTPAddr + + log.Printf("[INFO] Initializing Consul test client") + return config.Client() +} + +func TestMain(m *testing.M) { + t := struct { + testutil.TestingT + }{} + + // start and stop the test consul server once for all tests + srv := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { + c.LogLevel = "warn" + c.Stdout = ioutil.Discard + c.Stderr = ioutil.Discard + }) + + testConsulHTTPAddr = srv.HTTPAddr + + ret := m.Run() + + srv.Stop() + os.Exit(ret) +} + func TestResourceProvider(t *testing.T) { if err := Provider().(*schema.Provider).InternalValidate(); err != nil { t.Fatalf("err: %s", err) @@ -32,8 +72,9 @@ func TestResourceProvider_impl(t *testing.T) { func TestResourceProvider_Configure(t *testing.T) { rp := Provider() + // these configuration tests don't require an running server raw := map[string]interface{}{ - "address": "demo.consul.io:80", + "address": "example.com:8500", "datacenter": "nyc3", "scheme": "https", } @@ -53,7 +94,7 @@ func TestResourceProvider_ConfigureTLS(t *testing.T) { rp := Provider() raw := map[string]interface{}{ - "address": "demo.consul.io:80", + "address": "example.com:8943", "ca_file": "test-fixtures/cacert.pem", "cert_file": "test-fixtures/usercert.pem", "datacenter": "nyc3", @@ -71,13 +112,3 @@ func TestResourceProvider_ConfigureTLS(t *testing.T) { t.Fatalf("err: %s", err) } } - -func testAccPreCheck(t *testing.T) { - if v := os.Getenv("CONSUL_HTTP_ADDR"); v != "" { - return - } - if v := os.Getenv("CONSUL_ADDRESS"); v != "" { - return - } - t.Fatal("Either CONSUL_ADDRESS or CONSUL_HTTP_ADDR must be set for acceptance tests") -} From fa7743b627ff8a3985589cbee6debf73bfbfb92a Mon Sep 17 00:00:00 2001 From: James Bardin Date: Tue, 14 Mar 2017 10:49:38 -0400 Subject: [PATCH 4/7] quiet the consul server during backend tests Don't display logs unless using `-v` --- backend/remote-state/consul/backend_test.go | 16 +++++++++++++++- backend/remote-state/consul/client_test.go | 5 ++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/backend/remote-state/consul/backend_test.go b/backend/remote-state/consul/backend_test.go index 2fcb1c6a7..819e790d7 100644 --- a/backend/remote-state/consul/backend_test.go +++ b/backend/remote-state/consul/backend_test.go @@ -2,6 +2,7 @@ package consul import ( "fmt" + "io/ioutil" "testing" "time" @@ -13,8 +14,21 @@ func TestBackend_impl(t *testing.T) { var _ backend.Backend = new(Backend) } +func newConsulTestServer(t *testing.T) *testutil.TestServer { + srv := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { + c.LogLevel = "warn" + + if !testing.Verbose() { + c.Stdout = ioutil.Discard + c.Stderr = ioutil.Discard + } + }) + + return srv +} + func TestBackend(t *testing.T) { - srv := testutil.NewTestServer(t) + srv := newConsulTestServer(t) defer srv.Stop() // Get the backend diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go index 6109c4c0f..8978755ec 100644 --- a/backend/remote-state/consul/client_test.go +++ b/backend/remote-state/consul/client_test.go @@ -5,7 +5,6 @@ import ( "testing" "time" - "github.com/hashicorp/consul/testutil" "github.com/hashicorp/terraform/backend" "github.com/hashicorp/terraform/state/remote" ) @@ -16,7 +15,7 @@ func TestRemoteClient_impl(t *testing.T) { } func TestRemoteClient(t *testing.T) { - srv := testutil.NewTestServer(t) + srv := newConsulTestServer(t) defer srv.Stop() // Get the backend @@ -36,7 +35,7 @@ func TestRemoteClient(t *testing.T) { } func TestConsul_stateLock(t *testing.T) { - srv := testutil.NewTestServer(t) + srv := newConsulTestServer(t) defer srv.Stop() path := fmt.Sprintf("tf-unit/%s", time.Now().String()) From d46d93c85f5d51a1dcb531e239e556ecaed5f04b Mon Sep 17 00:00:00 2001 From: James Bardin Date: Tue, 14 Mar 2017 13:12:07 -0400 Subject: [PATCH 5/7] try trusty --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index a548b6b77..c8bcec473 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,4 @@ +dist: trusty sudo: false language: go go: From 7233cc9cae9e9b2e95eb075bcd2ad65b843d6bc8 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Tue, 14 Mar 2017 13:23:28 -0400 Subject: [PATCH 6/7] vendor consul/testutil try to weed out extras later --- vendor/github.com/armon/go-metrics/README.md | 74 + .../github.com/armon/go-metrics/const_unix.go | 12 + .../armon/go-metrics/const_windows.go | 13 + vendor/github.com/armon/go-metrics/inmem.go | 247 +++ .../armon/go-metrics/inmem_signal.go | 100 + vendor/github.com/armon/go-metrics/metrics.go | 115 ++ vendor/github.com/armon/go-metrics/sink.go | 52 + vendor/github.com/armon/go-metrics/start.go | 95 + vendor/github.com/armon/go-metrics/statsd.go | 154 ++ .../github.com/armon/go-metrics/statsite.go | 142 ++ vendor/github.com/hashicorp/consul/acl/acl.go | 672 +++++++ .../github.com/hashicorp/consul/acl/cache.go | 177 ++ .../github.com/hashicorp/consul/acl/policy.go | 191 ++ .../consul/consul/structs/operator.go | 57 + .../consul/consul/structs/prepared_query.go | 257 +++ .../consul/consul/structs/snapshot.go | 40 + .../consul/consul/structs/structs.go | 1041 ++++++++++ .../hashicorp/consul/consul/structs/txn.go | 85 + .../hashicorp/consul/testutil/README.md | 65 + .../hashicorp/consul/testutil/server.go | 528 +++++ .../hashicorp/consul/testutil/wait.go | 62 + .../hashicorp/consul/types/README.md | 39 + .../hashicorp/consul/types/checks.go | 5 + .../hashicorp/consul/types/node_id.go | 4 + .../github.com/hashicorp/go-sockaddr/LICENSE | 373 ++++ .../github.com/hashicorp/go-sockaddr/Makefile | 63 + .../hashicorp/go-sockaddr/README.md | 118 ++ .../github.com/hashicorp/go-sockaddr/doc.go | 5 + .../hashicorp/go-sockaddr/ifaddr.go | 126 ++ .../hashicorp/go-sockaddr/ifaddrs.go | 969 +++++++++ .../hashicorp/go-sockaddr/ifattr.go | 65 + .../hashicorp/go-sockaddr/ipaddr.go | 169 ++ .../hashicorp/go-sockaddr/ipaddrs.go | 98 + .../hashicorp/go-sockaddr/ipv4addr.go | 515 +++++ .../hashicorp/go-sockaddr/ipv6addr.go | 591 ++++++ .../github.com/hashicorp/go-sockaddr/rfc.go | 947 +++++++++ .../hashicorp/go-sockaddr/route_info.go | 19 + .../hashicorp/go-sockaddr/route_info_bsd.go | 36 + .../go-sockaddr/route_info_default.go | 10 + .../hashicorp/go-sockaddr/route_info_linux.go | 37 + .../go-sockaddr/route_info_solaris.go | 37 + .../go-sockaddr/route_info_windows.go | 41 + .../hashicorp/go-sockaddr/sockaddr.go | 178 ++ .../hashicorp/go-sockaddr/sockaddrs.go | 193 ++ .../hashicorp/go-sockaddr/unixsock.go | 135 ++ vendor/github.com/hashicorp/golang-lru/2q.go | 212 ++ .../github.com/hashicorp/golang-lru/README.md | 25 + vendor/github.com/hashicorp/golang-lru/arc.go | 257 +++ vendor/github.com/hashicorp/golang-lru/lru.go | 114 ++ .../hashicorp/golang-lru/simplelru/lru.go | 160 ++ .../github.com/hashicorp/memberlist/LICENSE | 354 ++++ .../github.com/hashicorp/memberlist/Makefile | 14 + .../github.com/hashicorp/memberlist/README.md | 144 ++ .../hashicorp/memberlist/alive_delegate.go | 14 + .../hashicorp/memberlist/awareness.go | 69 + .../hashicorp/memberlist/broadcast.go | 100 + .../github.com/hashicorp/memberlist/config.go | 277 +++ .../hashicorp/memberlist/conflict_delegate.go | 10 + .../hashicorp/memberlist/delegate.go | 37 + .../hashicorp/memberlist/event_delegate.go | 61 + .../hashicorp/memberlist/keyring.go | 160 ++ .../hashicorp/memberlist/logging.go | 22 + .../hashicorp/memberlist/memberlist.go | 660 +++++++ .../hashicorp/memberlist/merge_delegate.go | 14 + vendor/github.com/hashicorp/memberlist/net.go | 1123 +++++++++++ .../hashicorp/memberlist/ping_delegate.go | 14 + .../github.com/hashicorp/memberlist/queue.go | 167 ++ .../hashicorp/memberlist/security.go | 198 ++ .../github.com/hashicorp/memberlist/state.go | 1151 +++++++++++ .../hashicorp/memberlist/suspicion.go | 130 ++ .../github.com/hashicorp/memberlist/todo.md | 6 + .../github.com/hashicorp/memberlist/util.go | 288 +++ vendor/github.com/hashicorp/raft/LICENSE | 354 ++++ vendor/github.com/hashicorp/raft/Makefile | 17 + vendor/github.com/hashicorp/raft/README.md | 89 + vendor/github.com/hashicorp/raft/api.go | 1007 ++++++++++ vendor/github.com/hashicorp/raft/commands.go | 151 ++ .../github.com/hashicorp/raft/commitment.go | 101 + vendor/github.com/hashicorp/raft/config.go | 258 +++ .../hashicorp/raft/configuration.go | 343 ++++ .../hashicorp/raft/discard_snapshot.go | 49 + .../hashicorp/raft/file_snapshot.go | 494 +++++ vendor/github.com/hashicorp/raft/fsm.go | 136 ++ vendor/github.com/hashicorp/raft/future.go | 289 +++ .../hashicorp/raft/inmem_snapshot.go | 106 + .../github.com/hashicorp/raft/inmem_store.go | 125 ++ .../hashicorp/raft/inmem_transport.go | 322 +++ vendor/github.com/hashicorp/raft/log.go | 72 + vendor/github.com/hashicorp/raft/log_cache.go | 79 + .../github.com/hashicorp/raft/membership.md | 83 + .../hashicorp/raft/net_transport.go | 622 ++++++ vendor/github.com/hashicorp/raft/observer.go | 115 ++ vendor/github.com/hashicorp/raft/peersjson.go | 46 + vendor/github.com/hashicorp/raft/raft.go | 1456 ++++++++++++++ .../github.com/hashicorp/raft/replication.go | 561 ++++++ vendor/github.com/hashicorp/raft/snapshot.go | 239 +++ vendor/github.com/hashicorp/raft/stable.go | 15 + vendor/github.com/hashicorp/raft/state.go | 167 ++ .../hashicorp/raft/tcp_transport.go | 105 + vendor/github.com/hashicorp/raft/transport.go | 124 ++ vendor/github.com/hashicorp/raft/util.go | 133 ++ .../hashicorp/serf/serf/broadcast.go | 27 + .../hashicorp/serf/serf/coalesce.go | 80 + .../hashicorp/serf/serf/coalesce_member.go | 68 + .../hashicorp/serf/serf/coalesce_user.go | 52 + .../github.com/hashicorp/serf/serf/config.go | 267 +++ .../hashicorp/serf/serf/conflict_delegate.go | 13 + .../hashicorp/serf/serf/delegate.go | 275 +++ .../github.com/hashicorp/serf/serf/event.go | 174 ++ .../hashicorp/serf/serf/event_delegate.go | 21 + .../hashicorp/serf/serf/internal_query.go | 312 +++ .../hashicorp/serf/serf/keymanager.go | 192 ++ .../github.com/hashicorp/serf/serf/lamport.go | 45 + .../hashicorp/serf/serf/merge_delegate.go | 44 + .../hashicorp/serf/serf/messages.go | 173 ++ .../hashicorp/serf/serf/ping_delegate.go | 89 + .../github.com/hashicorp/serf/serf/query.go | 294 +++ vendor/github.com/hashicorp/serf/serf/serf.go | 1739 +++++++++++++++++ .../hashicorp/serf/serf/snapshot.go | 560 ++++++ vendor/github.com/sean-/seed/LICENSE | 54 + vendor/github.com/sean-/seed/README.md | 44 + vendor/github.com/sean-/seed/init.go | 84 + vendor/vendor.json | 72 + 123 files changed, 27070 insertions(+) create mode 100644 vendor/github.com/armon/go-metrics/README.md create mode 100644 vendor/github.com/armon/go-metrics/const_unix.go create mode 100644 vendor/github.com/armon/go-metrics/const_windows.go create mode 100644 vendor/github.com/armon/go-metrics/inmem.go create mode 100644 vendor/github.com/armon/go-metrics/inmem_signal.go create mode 100755 vendor/github.com/armon/go-metrics/metrics.go create mode 100755 vendor/github.com/armon/go-metrics/sink.go create mode 100755 vendor/github.com/armon/go-metrics/start.go create mode 100644 vendor/github.com/armon/go-metrics/statsd.go create mode 100755 vendor/github.com/armon/go-metrics/statsite.go create mode 100644 vendor/github.com/hashicorp/consul/acl/acl.go create mode 100644 vendor/github.com/hashicorp/consul/acl/cache.go create mode 100644 vendor/github.com/hashicorp/consul/acl/policy.go create mode 100644 vendor/github.com/hashicorp/consul/consul/structs/operator.go create mode 100644 vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go create mode 100644 vendor/github.com/hashicorp/consul/consul/structs/snapshot.go create mode 100644 vendor/github.com/hashicorp/consul/consul/structs/structs.go create mode 100644 vendor/github.com/hashicorp/consul/consul/structs/txn.go create mode 100644 vendor/github.com/hashicorp/consul/testutil/README.md create mode 100644 vendor/github.com/hashicorp/consul/testutil/server.go create mode 100644 vendor/github.com/hashicorp/consul/testutil/wait.go create mode 100644 vendor/github.com/hashicorp/consul/types/README.md create mode 100644 vendor/github.com/hashicorp/consul/types/checks.go create mode 100644 vendor/github.com/hashicorp/consul/types/node_id.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/LICENSE create mode 100644 vendor/github.com/hashicorp/go-sockaddr/Makefile create mode 100644 vendor/github.com/hashicorp/go-sockaddr/README.md create mode 100644 vendor/github.com/hashicorp/go-sockaddr/doc.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ifaddr.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ifaddrs.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ifattr.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ipaddr.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ipaddrs.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ipv4addr.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/ipv6addr.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/rfc.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/route_info.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/route_info_bsd.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/route_info_default.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/route_info_linux.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/route_info_solaris.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/route_info_windows.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/sockaddr.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/sockaddrs.go create mode 100644 vendor/github.com/hashicorp/go-sockaddr/unixsock.go create mode 100644 vendor/github.com/hashicorp/golang-lru/2q.go create mode 100644 vendor/github.com/hashicorp/golang-lru/README.md create mode 100644 vendor/github.com/hashicorp/golang-lru/arc.go create mode 100644 vendor/github.com/hashicorp/golang-lru/lru.go create mode 100644 vendor/github.com/hashicorp/golang-lru/simplelru/lru.go create mode 100644 vendor/github.com/hashicorp/memberlist/LICENSE create mode 100644 vendor/github.com/hashicorp/memberlist/Makefile create mode 100644 vendor/github.com/hashicorp/memberlist/README.md create mode 100644 vendor/github.com/hashicorp/memberlist/alive_delegate.go create mode 100644 vendor/github.com/hashicorp/memberlist/awareness.go create mode 100644 vendor/github.com/hashicorp/memberlist/broadcast.go create mode 100644 vendor/github.com/hashicorp/memberlist/config.go create mode 100644 vendor/github.com/hashicorp/memberlist/conflict_delegate.go create mode 100644 vendor/github.com/hashicorp/memberlist/delegate.go create mode 100644 vendor/github.com/hashicorp/memberlist/event_delegate.go create mode 100644 vendor/github.com/hashicorp/memberlist/keyring.go create mode 100644 vendor/github.com/hashicorp/memberlist/logging.go create mode 100644 vendor/github.com/hashicorp/memberlist/memberlist.go create mode 100644 vendor/github.com/hashicorp/memberlist/merge_delegate.go create mode 100644 vendor/github.com/hashicorp/memberlist/net.go create mode 100644 vendor/github.com/hashicorp/memberlist/ping_delegate.go create mode 100644 vendor/github.com/hashicorp/memberlist/queue.go create mode 100644 vendor/github.com/hashicorp/memberlist/security.go create mode 100644 vendor/github.com/hashicorp/memberlist/state.go create mode 100644 vendor/github.com/hashicorp/memberlist/suspicion.go create mode 100644 vendor/github.com/hashicorp/memberlist/todo.md create mode 100644 vendor/github.com/hashicorp/memberlist/util.go create mode 100644 vendor/github.com/hashicorp/raft/LICENSE create mode 100644 vendor/github.com/hashicorp/raft/Makefile create mode 100644 vendor/github.com/hashicorp/raft/README.md create mode 100644 vendor/github.com/hashicorp/raft/api.go create mode 100644 vendor/github.com/hashicorp/raft/commands.go create mode 100644 vendor/github.com/hashicorp/raft/commitment.go create mode 100644 vendor/github.com/hashicorp/raft/config.go create mode 100644 vendor/github.com/hashicorp/raft/configuration.go create mode 100644 vendor/github.com/hashicorp/raft/discard_snapshot.go create mode 100644 vendor/github.com/hashicorp/raft/file_snapshot.go create mode 100644 vendor/github.com/hashicorp/raft/fsm.go create mode 100644 vendor/github.com/hashicorp/raft/future.go create mode 100644 vendor/github.com/hashicorp/raft/inmem_snapshot.go create mode 100644 vendor/github.com/hashicorp/raft/inmem_store.go create mode 100644 vendor/github.com/hashicorp/raft/inmem_transport.go create mode 100644 vendor/github.com/hashicorp/raft/log.go create mode 100644 vendor/github.com/hashicorp/raft/log_cache.go create mode 100644 vendor/github.com/hashicorp/raft/membership.md create mode 100644 vendor/github.com/hashicorp/raft/net_transport.go create mode 100644 vendor/github.com/hashicorp/raft/observer.go create mode 100644 vendor/github.com/hashicorp/raft/peersjson.go create mode 100644 vendor/github.com/hashicorp/raft/raft.go create mode 100644 vendor/github.com/hashicorp/raft/replication.go create mode 100644 vendor/github.com/hashicorp/raft/snapshot.go create mode 100644 vendor/github.com/hashicorp/raft/stable.go create mode 100644 vendor/github.com/hashicorp/raft/state.go create mode 100644 vendor/github.com/hashicorp/raft/tcp_transport.go create mode 100644 vendor/github.com/hashicorp/raft/transport.go create mode 100644 vendor/github.com/hashicorp/raft/util.go create mode 100644 vendor/github.com/hashicorp/serf/serf/broadcast.go create mode 100644 vendor/github.com/hashicorp/serf/serf/coalesce.go create mode 100644 vendor/github.com/hashicorp/serf/serf/coalesce_member.go create mode 100644 vendor/github.com/hashicorp/serf/serf/coalesce_user.go create mode 100644 vendor/github.com/hashicorp/serf/serf/config.go create mode 100644 vendor/github.com/hashicorp/serf/serf/conflict_delegate.go create mode 100644 vendor/github.com/hashicorp/serf/serf/delegate.go create mode 100644 vendor/github.com/hashicorp/serf/serf/event.go create mode 100644 vendor/github.com/hashicorp/serf/serf/event_delegate.go create mode 100644 vendor/github.com/hashicorp/serf/serf/internal_query.go create mode 100644 vendor/github.com/hashicorp/serf/serf/keymanager.go create mode 100644 vendor/github.com/hashicorp/serf/serf/lamport.go create mode 100644 vendor/github.com/hashicorp/serf/serf/merge_delegate.go create mode 100644 vendor/github.com/hashicorp/serf/serf/messages.go create mode 100644 vendor/github.com/hashicorp/serf/serf/ping_delegate.go create mode 100644 vendor/github.com/hashicorp/serf/serf/query.go create mode 100644 vendor/github.com/hashicorp/serf/serf/serf.go create mode 100644 vendor/github.com/hashicorp/serf/serf/snapshot.go create mode 100644 vendor/github.com/sean-/seed/LICENSE create mode 100644 vendor/github.com/sean-/seed/README.md create mode 100644 vendor/github.com/sean-/seed/init.go diff --git a/vendor/github.com/armon/go-metrics/README.md b/vendor/github.com/armon/go-metrics/README.md new file mode 100644 index 000000000..a7399cddf --- /dev/null +++ b/vendor/github.com/armon/go-metrics/README.md @@ -0,0 +1,74 @@ +go-metrics +========== + +This library provides a `metrics` package which can be used to instrument code, +expose application metrics, and profile runtime performance in a flexible manner. + +Current API: [![GoDoc](https://godoc.org/github.com/armon/go-metrics?status.svg)](https://godoc.org/github.com/armon/go-metrics) + +Sinks +===== + +The `metrics` package makes use of a `MetricSink` interface to support delivery +to any type of backend. Currently the following sinks are provided: + +* StatsiteSink : Sinks to a [statsite](https://github.com/armon/statsite/) instance (TCP) +* StatsdSink: Sinks to a [StatsD](https://github.com/etsy/statsd/) / statsite instance (UDP) +* PrometheusSink: Sinks to a [Prometheus](http://prometheus.io/) metrics endpoint (exposed via HTTP for scrapes) +* InmemSink : Provides in-memory aggregation, can be used to export stats +* FanoutSink : Sinks to multiple sinks. Enables writing to multiple statsite instances for example. +* BlackholeSink : Sinks to nowhere + +In addition to the sinks, the `InmemSignal` can be used to catch a signal, +and dump a formatted output of recent metrics. For example, when a process gets +a SIGUSR1, it can dump to stderr recent performance metrics for debugging. + +Examples +======== + +Here is an example of using the package: + +```go +func SlowMethod() { + // Profiling the runtime of a method + defer metrics.MeasureSince([]string{"SlowMethod"}, time.Now()) +} + +// Configure a statsite sink as the global metrics sink +sink, _ := metrics.NewStatsiteSink("statsite:8125") +metrics.NewGlobal(metrics.DefaultConfig("service-name"), sink) + +// Emit a Key/Value pair +metrics.EmitKey([]string{"questions", "meaning of life"}, 42) +``` + +Here is an example of setting up a signal handler: + +```go +// Setup the inmem sink and signal handler +inm := metrics.NewInmemSink(10*time.Second, time.Minute) +sig := metrics.DefaultInmemSignal(inm) +metrics.NewGlobal(metrics.DefaultConfig("service-name"), inm) + +// Run some code +inm.SetGauge([]string{"foo"}, 42) +inm.EmitKey([]string{"bar"}, 30) + +inm.IncrCounter([]string{"baz"}, 42) +inm.IncrCounter([]string{"baz"}, 1) +inm.IncrCounter([]string{"baz"}, 80) + +inm.AddSample([]string{"method", "wow"}, 42) +inm.AddSample([]string{"method", "wow"}, 100) +inm.AddSample([]string{"method", "wow"}, 22) + +.... +``` + +When a signal comes in, output like the following will be dumped to stderr: + + [2014-01-28 14:57:33.04 -0800 PST][G] 'foo': 42.000 + [2014-01-28 14:57:33.04 -0800 PST][P] 'bar': 30.000 + [2014-01-28 14:57:33.04 -0800 PST][C] 'baz': Count: 3 Min: 1.000 Mean: 41.000 Max: 80.000 Stddev: 39.509 + [2014-01-28 14:57:33.04 -0800 PST][S] 'method.wow': Count: 3 Min: 22.000 Mean: 54.667 Max: 100.000 Stddev: 40.513 + diff --git a/vendor/github.com/armon/go-metrics/const_unix.go b/vendor/github.com/armon/go-metrics/const_unix.go new file mode 100644 index 000000000..31098dd57 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/const_unix.go @@ -0,0 +1,12 @@ +// +build !windows + +package metrics + +import ( + "syscall" +) + +const ( + // DefaultSignal is used with DefaultInmemSignal + DefaultSignal = syscall.SIGUSR1 +) diff --git a/vendor/github.com/armon/go-metrics/const_windows.go b/vendor/github.com/armon/go-metrics/const_windows.go new file mode 100644 index 000000000..38136af3e --- /dev/null +++ b/vendor/github.com/armon/go-metrics/const_windows.go @@ -0,0 +1,13 @@ +// +build windows + +package metrics + +import ( + "syscall" +) + +const ( + // DefaultSignal is used with DefaultInmemSignal + // Windows has no SIGUSR1, use SIGBREAK + DefaultSignal = syscall.Signal(21) +) diff --git a/vendor/github.com/armon/go-metrics/inmem.go b/vendor/github.com/armon/go-metrics/inmem.go new file mode 100644 index 000000000..83fb6bba0 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/inmem.go @@ -0,0 +1,247 @@ +package metrics + +import ( + "fmt" + "math" + "strings" + "sync" + "time" +) + +// InmemSink provides a MetricSink that does in-memory aggregation +// without sending metrics over a network. It can be embedded within +// an application to provide profiling information. +type InmemSink struct { + // How long is each aggregation interval + interval time.Duration + + // Retain controls how many metrics interval we keep + retain time.Duration + + // maxIntervals is the maximum length of intervals. + // It is retain / interval. + maxIntervals int + + // intervals is a slice of the retained intervals + intervals []*IntervalMetrics + intervalLock sync.RWMutex + + rateDenom float64 +} + +// IntervalMetrics stores the aggregated metrics +// for a specific interval +type IntervalMetrics struct { + sync.RWMutex + + // The start time of the interval + Interval time.Time + + // Gauges maps the key to the last set value + Gauges map[string]float32 + + // Points maps the string to the list of emitted values + // from EmitKey + Points map[string][]float32 + + // Counters maps the string key to a sum of the counter + // values + Counters map[string]*AggregateSample + + // Samples maps the key to an AggregateSample, + // which has the rolled up view of a sample + Samples map[string]*AggregateSample +} + +// NewIntervalMetrics creates a new IntervalMetrics for a given interval +func NewIntervalMetrics(intv time.Time) *IntervalMetrics { + return &IntervalMetrics{ + Interval: intv, + Gauges: make(map[string]float32), + Points: make(map[string][]float32), + Counters: make(map[string]*AggregateSample), + Samples: make(map[string]*AggregateSample), + } +} + +// AggregateSample is used to hold aggregate metrics +// about a sample +type AggregateSample struct { + Count int // The count of emitted pairs + Rate float64 // The count of emitted pairs per time unit (usually 1 second) + Sum float64 // The sum of values + SumSq float64 // The sum of squared values + Min float64 // Minimum value + Max float64 // Maximum value + LastUpdated time.Time // When value was last updated +} + +// Computes a Stddev of the values +func (a *AggregateSample) Stddev() float64 { + num := (float64(a.Count) * a.SumSq) - math.Pow(a.Sum, 2) + div := float64(a.Count * (a.Count - 1)) + if div == 0 { + return 0 + } + return math.Sqrt(num / div) +} + +// Computes a mean of the values +func (a *AggregateSample) Mean() float64 { + if a.Count == 0 { + return 0 + } + return a.Sum / float64(a.Count) +} + +// Ingest is used to update a sample +func (a *AggregateSample) Ingest(v float64, rateDenom float64) { + a.Count++ + a.Sum += v + a.SumSq += (v * v) + if v < a.Min || a.Count == 1 { + a.Min = v + } + if v > a.Max || a.Count == 1 { + a.Max = v + } + a.Rate = float64(a.Count)/rateDenom + a.LastUpdated = time.Now() +} + +func (a *AggregateSample) String() string { + if a.Count == 0 { + return "Count: 0" + } else if a.Stddev() == 0 { + return fmt.Sprintf("Count: %d Sum: %0.3f LastUpdated: %s", a.Count, a.Sum, a.LastUpdated) + } else { + return fmt.Sprintf("Count: %d Min: %0.3f Mean: %0.3f Max: %0.3f Stddev: %0.3f Sum: %0.3f LastUpdated: %s", + a.Count, a.Min, a.Mean(), a.Max, a.Stddev(), a.Sum, a.LastUpdated) + } +} + +// NewInmemSink is used to construct a new in-memory sink. +// Uses an aggregation interval and maximum retention period. +func NewInmemSink(interval, retain time.Duration) *InmemSink { + rateTimeUnit := time.Second + i := &InmemSink{ + interval: interval, + retain: retain, + maxIntervals: int(retain / interval), + rateDenom: float64(interval.Nanoseconds()) / float64(rateTimeUnit.Nanoseconds()), + } + i.intervals = make([]*IntervalMetrics, 0, i.maxIntervals) + return i +} + +func (i *InmemSink) SetGauge(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + intv.Gauges[k] = val +} + +func (i *InmemSink) EmitKey(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + vals := intv.Points[k] + intv.Points[k] = append(vals, val) +} + +func (i *InmemSink) IncrCounter(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + + agg := intv.Counters[k] + if agg == nil { + agg = &AggregateSample{} + intv.Counters[k] = agg + } + agg.Ingest(float64(val), i.rateDenom) +} + +func (i *InmemSink) AddSample(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + + agg := intv.Samples[k] + if agg == nil { + agg = &AggregateSample{} + intv.Samples[k] = agg + } + agg.Ingest(float64(val), i.rateDenom) +} + +// Data is used to retrieve all the aggregated metrics +// Intervals may be in use, and a read lock should be acquired +func (i *InmemSink) Data() []*IntervalMetrics { + // Get the current interval, forces creation + i.getInterval() + + i.intervalLock.RLock() + defer i.intervalLock.RUnlock() + + intervals := make([]*IntervalMetrics, len(i.intervals)) + copy(intervals, i.intervals) + return intervals +} + +func (i *InmemSink) getExistingInterval(intv time.Time) *IntervalMetrics { + i.intervalLock.RLock() + defer i.intervalLock.RUnlock() + + n := len(i.intervals) + if n > 0 && i.intervals[n-1].Interval == intv { + return i.intervals[n-1] + } + return nil +} + +func (i *InmemSink) createInterval(intv time.Time) *IntervalMetrics { + i.intervalLock.Lock() + defer i.intervalLock.Unlock() + + // Check for an existing interval + n := len(i.intervals) + if n > 0 && i.intervals[n-1].Interval == intv { + return i.intervals[n-1] + } + + // Add the current interval + current := NewIntervalMetrics(intv) + i.intervals = append(i.intervals, current) + n++ + + // Truncate the intervals if they are too long + if n >= i.maxIntervals { + copy(i.intervals[0:], i.intervals[n-i.maxIntervals:]) + i.intervals = i.intervals[:i.maxIntervals] + } + return current +} + +// getInterval returns the current interval to write to +func (i *InmemSink) getInterval() *IntervalMetrics { + intv := time.Now().Truncate(i.interval) + if m := i.getExistingInterval(intv); m != nil { + return m + } + return i.createInterval(intv) +} + +// Flattens the key for formatting, removes spaces +func (i *InmemSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Replace(joined, " ", "_", -1) +} diff --git a/vendor/github.com/armon/go-metrics/inmem_signal.go b/vendor/github.com/armon/go-metrics/inmem_signal.go new file mode 100644 index 000000000..95d08ee10 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/inmem_signal.go @@ -0,0 +1,100 @@ +package metrics + +import ( + "bytes" + "fmt" + "io" + "os" + "os/signal" + "sync" + "syscall" +) + +// InmemSignal is used to listen for a given signal, and when received, +// to dump the current metrics from the InmemSink to an io.Writer +type InmemSignal struct { + signal syscall.Signal + inm *InmemSink + w io.Writer + sigCh chan os.Signal + + stop bool + stopCh chan struct{} + stopLock sync.Mutex +} + +// NewInmemSignal creates a new InmemSignal which listens for a given signal, +// and dumps the current metrics out to a writer +func NewInmemSignal(inmem *InmemSink, sig syscall.Signal, w io.Writer) *InmemSignal { + i := &InmemSignal{ + signal: sig, + inm: inmem, + w: w, + sigCh: make(chan os.Signal, 1), + stopCh: make(chan struct{}), + } + signal.Notify(i.sigCh, sig) + go i.run() + return i +} + +// DefaultInmemSignal returns a new InmemSignal that responds to SIGUSR1 +// and writes output to stderr. Windows uses SIGBREAK +func DefaultInmemSignal(inmem *InmemSink) *InmemSignal { + return NewInmemSignal(inmem, DefaultSignal, os.Stderr) +} + +// Stop is used to stop the InmemSignal from listening +func (i *InmemSignal) Stop() { + i.stopLock.Lock() + defer i.stopLock.Unlock() + + if i.stop { + return + } + i.stop = true + close(i.stopCh) + signal.Stop(i.sigCh) +} + +// run is a long running routine that handles signals +func (i *InmemSignal) run() { + for { + select { + case <-i.sigCh: + i.dumpStats() + case <-i.stopCh: + return + } + } +} + +// dumpStats is used to dump the data to output writer +func (i *InmemSignal) dumpStats() { + buf := bytes.NewBuffer(nil) + + data := i.inm.Data() + // Skip the last period which is still being aggregated + for i := 0; i < len(data)-1; i++ { + intv := data[i] + intv.RLock() + for name, val := range intv.Gauges { + fmt.Fprintf(buf, "[%v][G] '%s': %0.3f\n", intv.Interval, name, val) + } + for name, vals := range intv.Points { + for _, val := range vals { + fmt.Fprintf(buf, "[%v][P] '%s': %0.3f\n", intv.Interval, name, val) + } + } + for name, agg := range intv.Counters { + fmt.Fprintf(buf, "[%v][C] '%s': %s\n", intv.Interval, name, agg) + } + for name, agg := range intv.Samples { + fmt.Fprintf(buf, "[%v][S] '%s': %s\n", intv.Interval, name, agg) + } + intv.RUnlock() + } + + // Write out the bytes + i.w.Write(buf.Bytes()) +} diff --git a/vendor/github.com/armon/go-metrics/metrics.go b/vendor/github.com/armon/go-metrics/metrics.go new file mode 100755 index 000000000..b818e4182 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/metrics.go @@ -0,0 +1,115 @@ +package metrics + +import ( + "runtime" + "time" +) + +func (m *Metrics) SetGauge(key []string, val float32) { + if m.HostName != "" && m.EnableHostname { + key = insert(0, m.HostName, key) + } + if m.EnableTypePrefix { + key = insert(0, "gauge", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.SetGauge(key, val) +} + +func (m *Metrics) EmitKey(key []string, val float32) { + if m.EnableTypePrefix { + key = insert(0, "kv", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.EmitKey(key, val) +} + +func (m *Metrics) IncrCounter(key []string, val float32) { + if m.EnableTypePrefix { + key = insert(0, "counter", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.IncrCounter(key, val) +} + +func (m *Metrics) AddSample(key []string, val float32) { + if m.EnableTypePrefix { + key = insert(0, "sample", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.AddSample(key, val) +} + +func (m *Metrics) MeasureSince(key []string, start time.Time) { + if m.EnableTypePrefix { + key = insert(0, "timer", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + now := time.Now() + elapsed := now.Sub(start) + msec := float32(elapsed.Nanoseconds()) / float32(m.TimerGranularity) + m.sink.AddSample(key, msec) +} + +// Periodically collects runtime stats to publish +func (m *Metrics) collectStats() { + for { + time.Sleep(m.ProfileInterval) + m.emitRuntimeStats() + } +} + +// Emits various runtime statsitics +func (m *Metrics) emitRuntimeStats() { + // Export number of Goroutines + numRoutines := runtime.NumGoroutine() + m.SetGauge([]string{"runtime", "num_goroutines"}, float32(numRoutines)) + + // Export memory stats + var stats runtime.MemStats + runtime.ReadMemStats(&stats) + m.SetGauge([]string{"runtime", "alloc_bytes"}, float32(stats.Alloc)) + m.SetGauge([]string{"runtime", "sys_bytes"}, float32(stats.Sys)) + m.SetGauge([]string{"runtime", "malloc_count"}, float32(stats.Mallocs)) + m.SetGauge([]string{"runtime", "free_count"}, float32(stats.Frees)) + m.SetGauge([]string{"runtime", "heap_objects"}, float32(stats.HeapObjects)) + m.SetGauge([]string{"runtime", "total_gc_pause_ns"}, float32(stats.PauseTotalNs)) + m.SetGauge([]string{"runtime", "total_gc_runs"}, float32(stats.NumGC)) + + // Export info about the last few GC runs + num := stats.NumGC + + // Handle wrap around + if num < m.lastNumGC { + m.lastNumGC = 0 + } + + // Ensure we don't scan more than 256 + if num-m.lastNumGC >= 256 { + m.lastNumGC = num - 255 + } + + for i := m.lastNumGC; i < num; i++ { + pause := stats.PauseNs[i%256] + m.AddSample([]string{"runtime", "gc_pause_ns"}, float32(pause)) + } + m.lastNumGC = num +} + +// Inserts a string value at an index into the slice +func insert(i int, v string, s []string) []string { + s = append(s, "") + copy(s[i+1:], s[i:]) + s[i] = v + return s +} diff --git a/vendor/github.com/armon/go-metrics/sink.go b/vendor/github.com/armon/go-metrics/sink.go new file mode 100755 index 000000000..0c240c2c4 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/sink.go @@ -0,0 +1,52 @@ +package metrics + +// The MetricSink interface is used to transmit metrics information +// to an external system +type MetricSink interface { + // A Gauge should retain the last value it is set to + SetGauge(key []string, val float32) + + // Should emit a Key/Value pair for each call + EmitKey(key []string, val float32) + + // Counters should accumulate values + IncrCounter(key []string, val float32) + + // Samples are for timing information, where quantiles are used + AddSample(key []string, val float32) +} + +// BlackholeSink is used to just blackhole messages +type BlackholeSink struct{} + +func (*BlackholeSink) SetGauge(key []string, val float32) {} +func (*BlackholeSink) EmitKey(key []string, val float32) {} +func (*BlackholeSink) IncrCounter(key []string, val float32) {} +func (*BlackholeSink) AddSample(key []string, val float32) {} + +// FanoutSink is used to sink to fanout values to multiple sinks +type FanoutSink []MetricSink + +func (fh FanoutSink) SetGauge(key []string, val float32) { + for _, s := range fh { + s.SetGauge(key, val) + } +} + +func (fh FanoutSink) EmitKey(key []string, val float32) { + for _, s := range fh { + s.EmitKey(key, val) + } +} + +func (fh FanoutSink) IncrCounter(key []string, val float32) { + for _, s := range fh { + s.IncrCounter(key, val) + } +} + +func (fh FanoutSink) AddSample(key []string, val float32) { + for _, s := range fh { + s.AddSample(key, val) + } +} diff --git a/vendor/github.com/armon/go-metrics/start.go b/vendor/github.com/armon/go-metrics/start.go new file mode 100755 index 000000000..44113f100 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/start.go @@ -0,0 +1,95 @@ +package metrics + +import ( + "os" + "time" +) + +// Config is used to configure metrics settings +type Config struct { + ServiceName string // Prefixed with keys to seperate services + HostName string // Hostname to use. If not provided and EnableHostname, it will be os.Hostname + EnableHostname bool // Enable prefixing gauge values with hostname + EnableRuntimeMetrics bool // Enables profiling of runtime metrics (GC, Goroutines, Memory) + EnableTypePrefix bool // Prefixes key with a type ("counter", "gauge", "timer") + TimerGranularity time.Duration // Granularity of timers. + ProfileInterval time.Duration // Interval to profile runtime metrics +} + +// Metrics represents an instance of a metrics sink that can +// be used to emit +type Metrics struct { + Config + lastNumGC uint32 + sink MetricSink +} + +// Shared global metrics instance +var globalMetrics *Metrics + +func init() { + // Initialize to a blackhole sink to avoid errors + globalMetrics = &Metrics{sink: &BlackholeSink{}} +} + +// DefaultConfig provides a sane default configuration +func DefaultConfig(serviceName string) *Config { + c := &Config{ + ServiceName: serviceName, // Use client provided service + HostName: "", + EnableHostname: true, // Enable hostname prefix + EnableRuntimeMetrics: true, // Enable runtime profiling + EnableTypePrefix: false, // Disable type prefix + TimerGranularity: time.Millisecond, // Timers are in milliseconds + ProfileInterval: time.Second, // Poll runtime every second + } + + // Try to get the hostname + name, _ := os.Hostname() + c.HostName = name + return c +} + +// New is used to create a new instance of Metrics +func New(conf *Config, sink MetricSink) (*Metrics, error) { + met := &Metrics{} + met.Config = *conf + met.sink = sink + + // Start the runtime collector + if conf.EnableRuntimeMetrics { + go met.collectStats() + } + return met, nil +} + +// NewGlobal is the same as New, but it assigns the metrics object to be +// used globally as well as returning it. +func NewGlobal(conf *Config, sink MetricSink) (*Metrics, error) { + metrics, err := New(conf, sink) + if err == nil { + globalMetrics = metrics + } + return metrics, err +} + +// Proxy all the methods to the globalMetrics instance +func SetGauge(key []string, val float32) { + globalMetrics.SetGauge(key, val) +} + +func EmitKey(key []string, val float32) { + globalMetrics.EmitKey(key, val) +} + +func IncrCounter(key []string, val float32) { + globalMetrics.IncrCounter(key, val) +} + +func AddSample(key []string, val float32) { + globalMetrics.AddSample(key, val) +} + +func MeasureSince(key []string, start time.Time) { + globalMetrics.MeasureSince(key, start) +} diff --git a/vendor/github.com/armon/go-metrics/statsd.go b/vendor/github.com/armon/go-metrics/statsd.go new file mode 100644 index 000000000..65a5021a0 --- /dev/null +++ b/vendor/github.com/armon/go-metrics/statsd.go @@ -0,0 +1,154 @@ +package metrics + +import ( + "bytes" + "fmt" + "log" + "net" + "strings" + "time" +) + +const ( + // statsdMaxLen is the maximum size of a packet + // to send to statsd + statsdMaxLen = 1400 +) + +// StatsdSink provides a MetricSink that can be used +// with a statsite or statsd metrics server. It uses +// only UDP packets, while StatsiteSink uses TCP. +type StatsdSink struct { + addr string + metricQueue chan string +} + +// NewStatsdSink is used to create a new StatsdSink +func NewStatsdSink(addr string) (*StatsdSink, error) { + s := &StatsdSink{ + addr: addr, + metricQueue: make(chan string, 4096), + } + go s.flushMetrics() + return s, nil +} + +// Close is used to stop flushing to statsd +func (s *StatsdSink) Shutdown() { + close(s.metricQueue) +} + +func (s *StatsdSink) SetGauge(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) +} + +func (s *StatsdSink) EmitKey(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|kv\n", flatKey, val)) +} + +func (s *StatsdSink) IncrCounter(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) +} + +func (s *StatsdSink) AddSample(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) +} + +// Flattens the key for formatting, removes spaces +func (s *StatsdSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Map(func(r rune) rune { + switch r { + case ':': + fallthrough + case ' ': + return '_' + default: + return r + } + }, joined) +} + +// Does a non-blocking push to the metrics queue +func (s *StatsdSink) pushMetric(m string) { + select { + case s.metricQueue <- m: + default: + } +} + +// Flushes metrics +func (s *StatsdSink) flushMetrics() { + var sock net.Conn + var err error + var wait <-chan time.Time + ticker := time.NewTicker(flushInterval) + defer ticker.Stop() + +CONNECT: + // Create a buffer + buf := bytes.NewBuffer(nil) + + // Attempt to connect + sock, err = net.Dial("udp", s.addr) + if err != nil { + log.Printf("[ERR] Error connecting to statsd! Err: %s", err) + goto WAIT + } + + for { + select { + case metric, ok := <-s.metricQueue: + // Get a metric from the queue + if !ok { + goto QUIT + } + + // Check if this would overflow the packet size + if len(metric)+buf.Len() > statsdMaxLen { + _, err := sock.Write(buf.Bytes()) + buf.Reset() + if err != nil { + log.Printf("[ERR] Error writing to statsd! Err: %s", err) + goto WAIT + } + } + + // Append to the buffer + buf.WriteString(metric) + + case <-ticker.C: + if buf.Len() == 0 { + continue + } + + _, err := sock.Write(buf.Bytes()) + buf.Reset() + if err != nil { + log.Printf("[ERR] Error flushing to statsd! Err: %s", err) + goto WAIT + } + } + } + +WAIT: + // Wait for a while + wait = time.After(time.Duration(5) * time.Second) + for { + select { + // Dequeue the messages to avoid backlog + case _, ok := <-s.metricQueue: + if !ok { + goto QUIT + } + case <-wait: + goto CONNECT + } + } +QUIT: + s.metricQueue = nil +} diff --git a/vendor/github.com/armon/go-metrics/statsite.go b/vendor/github.com/armon/go-metrics/statsite.go new file mode 100755 index 000000000..68730139a --- /dev/null +++ b/vendor/github.com/armon/go-metrics/statsite.go @@ -0,0 +1,142 @@ +package metrics + +import ( + "bufio" + "fmt" + "log" + "net" + "strings" + "time" +) + +const ( + // We force flush the statsite metrics after this period of + // inactivity. Prevents stats from getting stuck in a buffer + // forever. + flushInterval = 100 * time.Millisecond +) + +// StatsiteSink provides a MetricSink that can be used with a +// statsite metrics server +type StatsiteSink struct { + addr string + metricQueue chan string +} + +// NewStatsiteSink is used to create a new StatsiteSink +func NewStatsiteSink(addr string) (*StatsiteSink, error) { + s := &StatsiteSink{ + addr: addr, + metricQueue: make(chan string, 4096), + } + go s.flushMetrics() + return s, nil +} + +// Close is used to stop flushing to statsite +func (s *StatsiteSink) Shutdown() { + close(s.metricQueue) +} + +func (s *StatsiteSink) SetGauge(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) +} + +func (s *StatsiteSink) EmitKey(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|kv\n", flatKey, val)) +} + +func (s *StatsiteSink) IncrCounter(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) +} + +func (s *StatsiteSink) AddSample(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) +} + +// Flattens the key for formatting, removes spaces +func (s *StatsiteSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Map(func(r rune) rune { + switch r { + case ':': + fallthrough + case ' ': + return '_' + default: + return r + } + }, joined) +} + +// Does a non-blocking push to the metrics queue +func (s *StatsiteSink) pushMetric(m string) { + select { + case s.metricQueue <- m: + default: + } +} + +// Flushes metrics +func (s *StatsiteSink) flushMetrics() { + var sock net.Conn + var err error + var wait <-chan time.Time + var buffered *bufio.Writer + ticker := time.NewTicker(flushInterval) + defer ticker.Stop() + +CONNECT: + // Attempt to connect + sock, err = net.Dial("tcp", s.addr) + if err != nil { + log.Printf("[ERR] Error connecting to statsite! Err: %s", err) + goto WAIT + } + + // Create a buffered writer + buffered = bufio.NewWriter(sock) + + for { + select { + case metric, ok := <-s.metricQueue: + // Get a metric from the queue + if !ok { + goto QUIT + } + + // Try to send to statsite + _, err := buffered.Write([]byte(metric)) + if err != nil { + log.Printf("[ERR] Error writing to statsite! Err: %s", err) + goto WAIT + } + case <-ticker.C: + if err := buffered.Flush(); err != nil { + log.Printf("[ERR] Error flushing to statsite! Err: %s", err) + goto WAIT + } + } + } + +WAIT: + // Wait for a while + wait = time.After(time.Duration(5) * time.Second) + for { + select { + // Dequeue the messages to avoid backlog + case _, ok := <-s.metricQueue: + if !ok { + goto QUIT + } + case <-wait: + goto CONNECT + } + } +QUIT: + s.metricQueue = nil +} diff --git a/vendor/github.com/hashicorp/consul/acl/acl.go b/vendor/github.com/hashicorp/consul/acl/acl.go new file mode 100644 index 000000000..3ade9d405 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/acl/acl.go @@ -0,0 +1,672 @@ +package acl + +import ( + "github.com/armon/go-radix" +) + +var ( + // allowAll is a singleton policy which allows all + // non-management actions + allowAll ACL + + // denyAll is a singleton policy which denies all actions + denyAll ACL + + // manageAll is a singleton policy which allows all + // actions, including management + manageAll ACL +) + +func init() { + // Setup the singletons + allowAll = &StaticACL{ + allowManage: false, + defaultAllow: true, + } + denyAll = &StaticACL{ + allowManage: false, + defaultAllow: false, + } + manageAll = &StaticACL{ + allowManage: true, + defaultAllow: true, + } +} + +// ACL is the interface for policy enforcement. +type ACL interface { + // ACLList checks for permission to list all the ACLs + ACLList() bool + + // ACLModify checks for permission to manipulate ACLs + ACLModify() bool + + // AgentRead checks for permission to read from agent endpoints for a + // given node. + AgentRead(string) bool + + // AgentWrite checks for permission to make changes via agent endpoints + // for a given node. + AgentWrite(string) bool + + // EventRead determines if a specific event can be queried. + EventRead(string) bool + + // EventWrite determines if a specific event may be fired. + EventWrite(string) bool + + // KeyRead checks for permission to read a given key + KeyRead(string) bool + + // KeyWrite checks for permission to write a given key + KeyWrite(string) bool + + // KeyWritePrefix checks for permission to write to an + // entire key prefix. This means there must be no sub-policies + // that deny a write. + KeyWritePrefix(string) bool + + // KeyringRead determines if the encryption keyring used in + // the gossip layer can be read. + KeyringRead() bool + + // KeyringWrite determines if the keyring can be manipulated + KeyringWrite() bool + + // NodeRead checks for permission to read (discover) a given node. + NodeRead(string) bool + + // NodeWrite checks for permission to create or update (register) a + // given node. + NodeWrite(string) bool + + // OperatorRead determines if the read-only Consul operator functions + // can be used. + OperatorRead() bool + + // OperatorWrite determines if the state-changing Consul operator + // functions can be used. + OperatorWrite() bool + + // PrepardQueryRead determines if a specific prepared query can be read + // to show its contents (this is not used for execution). + PreparedQueryRead(string) bool + + // PreparedQueryWrite determines if a specific prepared query can be + // created, modified, or deleted. + PreparedQueryWrite(string) bool + + // ServiceRead checks for permission to read a given service + ServiceRead(string) bool + + // ServiceWrite checks for permission to create or update a given + // service + ServiceWrite(string) bool + + // SessionRead checks for permission to read sessions for a given node. + SessionRead(string) bool + + // SessionWrite checks for permission to create sessions for a given + // node. + SessionWrite(string) bool + + // Snapshot checks for permission to take and restore snapshots. + Snapshot() bool +} + +// StaticACL is used to implement a base ACL policy. It either +// allows or denies all requests. This can be used as a parent +// ACL to act in a blacklist or whitelist mode. +type StaticACL struct { + allowManage bool + defaultAllow bool +} + +func (s *StaticACL) ACLList() bool { + return s.allowManage +} + +func (s *StaticACL) ACLModify() bool { + return s.allowManage +} + +func (s *StaticACL) AgentRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) AgentWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) EventRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) EventWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyWritePrefix(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyringRead() bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyringWrite() bool { + return s.defaultAllow +} + +func (s *StaticACL) NodeRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) NodeWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) OperatorRead() bool { + return s.defaultAllow +} + +func (s *StaticACL) OperatorWrite() bool { + return s.defaultAllow +} + +func (s *StaticACL) PreparedQueryRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) PreparedQueryWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) ServiceRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) ServiceWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) SessionRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) SessionWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) Snapshot() bool { + return s.allowManage +} + +// AllowAll returns an ACL rule that allows all operations +func AllowAll() ACL { + return allowAll +} + +// DenyAll returns an ACL rule that denies all operations +func DenyAll() ACL { + return denyAll +} + +// ManageAll returns an ACL rule that can manage all resources +func ManageAll() ACL { + return manageAll +} + +// RootACL returns a possible ACL if the ID matches a root policy +func RootACL(id string) ACL { + switch id { + case "allow": + return allowAll + case "deny": + return denyAll + case "manage": + return manageAll + default: + return nil + } +} + +// PolicyACL is used to wrap a set of ACL policies to provide +// the ACL interface. +type PolicyACL struct { + // parent is used to resolve policy if we have + // no matching rule. + parent ACL + + // agentRules contains the agent policies + agentRules *radix.Tree + + // keyRules contains the key policies + keyRules *radix.Tree + + // nodeRules contains the node policies + nodeRules *radix.Tree + + // serviceRules contains the service policies + serviceRules *radix.Tree + + // sessionRules contains the session policies + sessionRules *radix.Tree + + // eventRules contains the user event policies + eventRules *radix.Tree + + // preparedQueryRules contains the prepared query policies + preparedQueryRules *radix.Tree + + // keyringRule contains the keyring policies. The keyring has + // a very simple yes/no without prefix matching, so here we + // don't need to use a radix tree. + keyringRule string + + // operatorRule contains the operator policies. + operatorRule string +} + +// New is used to construct a policy based ACL from a set of policies +// and a parent policy to resolve missing cases. +func New(parent ACL, policy *Policy) (*PolicyACL, error) { + p := &PolicyACL{ + parent: parent, + agentRules: radix.New(), + keyRules: radix.New(), + nodeRules: radix.New(), + serviceRules: radix.New(), + sessionRules: radix.New(), + eventRules: radix.New(), + preparedQueryRules: radix.New(), + } + + // Load the agent policy + for _, ap := range policy.Agents { + p.agentRules.Insert(ap.Node, ap.Policy) + } + + // Load the key policy + for _, kp := range policy.Keys { + p.keyRules.Insert(kp.Prefix, kp.Policy) + } + + // Load the node policy + for _, np := range policy.Nodes { + p.nodeRules.Insert(np.Name, np.Policy) + } + + // Load the service policy + for _, sp := range policy.Services { + p.serviceRules.Insert(sp.Name, sp.Policy) + } + + // Load the session policy + for _, sp := range policy.Sessions { + p.sessionRules.Insert(sp.Node, sp.Policy) + } + + // Load the event policy + for _, ep := range policy.Events { + p.eventRules.Insert(ep.Event, ep.Policy) + } + + // Load the prepared query policy + for _, pq := range policy.PreparedQueries { + p.preparedQueryRules.Insert(pq.Prefix, pq.Policy) + } + + // Load the keyring policy + p.keyringRule = policy.Keyring + + // Load the operator policy + p.operatorRule = policy.Operator + + return p, nil +} + +// ACLList checks if listing of ACLs is allowed +func (p *PolicyACL) ACLList() bool { + return p.parent.ACLList() +} + +// ACLModify checks if modification of ACLs is allowed +func (p *PolicyACL) ACLModify() bool { + return p.parent.ACLModify() +} + +// AgentRead checks for permission to read from agent endpoints for a given +// node. +func (p *PolicyACL) AgentRead(node string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.agentRules.LongestPrefix(node) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.AgentRead(node) +} + +// AgentWrite checks for permission to make changes via agent endpoints for a +// given node. +func (p *PolicyACL) AgentWrite(node string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.agentRules.LongestPrefix(node) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.AgentWrite(node) +} + +// Snapshot checks if taking and restoring snapshots is allowed. +func (p *PolicyACL) Snapshot() bool { + return p.parent.Snapshot() +} + +// EventRead is used to determine if the policy allows for a +// specific user event to be read. +func (p *PolicyACL) EventRead(name string) bool { + // Longest-prefix match on event names + if _, rule, ok := p.eventRules.LongestPrefix(name); ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // Nothing matched, use parent + return p.parent.EventRead(name) +} + +// EventWrite is used to determine if new events can be created +// (fired) by the policy. +func (p *PolicyACL) EventWrite(name string) bool { + // Longest-prefix match event names + if _, rule, ok := p.eventRules.LongestPrefix(name); ok { + return rule == PolicyWrite + } + + // No match, use parent + return p.parent.EventWrite(name) +} + +// KeyRead returns if a key is allowed to be read +func (p *PolicyACL) KeyRead(key string) bool { + // Look for a matching rule + _, rule, ok := p.keyRules.LongestPrefix(key) + if ok { + switch rule.(string) { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.KeyRead(key) +} + +// KeyWrite returns if a key is allowed to be written +func (p *PolicyACL) KeyWrite(key string) bool { + // Look for a matching rule + _, rule, ok := p.keyRules.LongestPrefix(key) + if ok { + switch rule.(string) { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.KeyWrite(key) +} + +// KeyWritePrefix returns if a prefix is allowed to be written +func (p *PolicyACL) KeyWritePrefix(prefix string) bool { + // Look for a matching rule that denies + _, rule, ok := p.keyRules.LongestPrefix(prefix) + if ok && rule.(string) != PolicyWrite { + return false + } + + // Look if any of our children have a deny policy + deny := false + p.keyRules.WalkPrefix(prefix, func(path string, rule interface{}) bool { + // We have a rule to prevent a write in a sub-directory! + if rule.(string) != PolicyWrite { + deny = true + return true + } + return false + }) + + // Deny the write if any sub-rules may be violated + if deny { + return false + } + + // If we had a matching rule, done + if ok { + return true + } + + // No matching rule, use the parent. + return p.parent.KeyWritePrefix(prefix) +} + +// KeyringRead is used to determine if the keyring can be +// read by the current ACL token. +func (p *PolicyACL) KeyringRead() bool { + switch p.keyringRule { + case PolicyRead, PolicyWrite: + return true + case PolicyDeny: + return false + default: + return p.parent.KeyringRead() + } +} + +// KeyringWrite determines if the keyring can be manipulated. +func (p *PolicyACL) KeyringWrite() bool { + if p.keyringRule == PolicyWrite { + return true + } + return p.parent.KeyringWrite() +} + +// OperatorRead determines if the read-only operator functions are allowed. +func (p *PolicyACL) OperatorRead() bool { + switch p.operatorRule { + case PolicyRead, PolicyWrite: + return true + case PolicyDeny: + return false + default: + return p.parent.OperatorRead() + } +} + +// NodeRead checks if reading (discovery) of a node is allowed +func (p *PolicyACL) NodeRead(name string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.nodeRules.LongestPrefix(name) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.NodeRead(name) +} + +// NodeWrite checks if writing (registering) a node is allowed +func (p *PolicyACL) NodeWrite(name string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.nodeRules.LongestPrefix(name) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.NodeWrite(name) +} + +// OperatorWrite determines if the state-changing operator functions are +// allowed. +func (p *PolicyACL) OperatorWrite() bool { + if p.operatorRule == PolicyWrite { + return true + } + return p.parent.OperatorWrite() +} + +// PreparedQueryRead checks if reading (listing) of a prepared query is +// allowed - this isn't execution, just listing its contents. +func (p *PolicyACL) PreparedQueryRead(prefix string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.preparedQueryRules.LongestPrefix(prefix) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.PreparedQueryRead(prefix) +} + +// PreparedQueryWrite checks if writing (creating, updating, or deleting) of a +// prepared query is allowed. +func (p *PolicyACL) PreparedQueryWrite(prefix string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.preparedQueryRules.LongestPrefix(prefix) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.PreparedQueryWrite(prefix) +} + +// ServiceRead checks if reading (discovery) of a service is allowed +func (p *PolicyACL) ServiceRead(name string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.serviceRules.LongestPrefix(name) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.ServiceRead(name) +} + +// ServiceWrite checks if writing (registering) a service is allowed +func (p *PolicyACL) ServiceWrite(name string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.serviceRules.LongestPrefix(name) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.ServiceWrite(name) +} + +// SessionRead checks for permission to read sessions for a given node. +func (p *PolicyACL) SessionRead(node string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.sessionRules.LongestPrefix(node) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.SessionRead(node) +} + +// SessionWrite checks for permission to create sessions for a given node. +func (p *PolicyACL) SessionWrite(node string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.sessionRules.LongestPrefix(node) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.SessionWrite(node) +} diff --git a/vendor/github.com/hashicorp/consul/acl/cache.go b/vendor/github.com/hashicorp/consul/acl/cache.go new file mode 100644 index 000000000..0387f9fbe --- /dev/null +++ b/vendor/github.com/hashicorp/consul/acl/cache.go @@ -0,0 +1,177 @@ +package acl + +import ( + "crypto/md5" + "fmt" + + "github.com/hashicorp/golang-lru" +) + +// FaultFunc is a function used to fault in the parent, +// rules for an ACL given its ID +type FaultFunc func(id string) (string, string, error) + +// aclEntry allows us to store the ACL with it's policy ID +type aclEntry struct { + ACL ACL + Parent string + RuleID string +} + +// Cache is used to implement policy and ACL caching +type Cache struct { + faultfn FaultFunc + aclCache *lru.TwoQueueCache // Cache id -> acl + policyCache *lru.TwoQueueCache // Cache policy -> acl + ruleCache *lru.TwoQueueCache // Cache rules -> policy +} + +// NewCache constructs a new policy and ACL cache of a given size +func NewCache(size int, faultfn FaultFunc) (*Cache, error) { + if size <= 0 { + return nil, fmt.Errorf("Must provide positive cache size") + } + + rc, err := lru.New2Q(size) + if err != nil { + return nil, err + } + + pc, err := lru.New2Q(size) + if err != nil { + return nil, err + } + + ac, err := lru.New2Q(size) + if err != nil { + return nil, err + } + + c := &Cache{ + faultfn: faultfn, + aclCache: ac, + policyCache: pc, + ruleCache: rc, + } + return c, nil +} + +// GetPolicy is used to get a potentially cached policy set. +// If not cached, it will be parsed, and then cached. +func (c *Cache) GetPolicy(rules string) (*Policy, error) { + return c.getPolicy(RuleID(rules), rules) +} + +// getPolicy is an internal method to get a cached policy, +// but it assumes a pre-computed ID +func (c *Cache) getPolicy(id, rules string) (*Policy, error) { + raw, ok := c.ruleCache.Get(id) + if ok { + return raw.(*Policy), nil + } + policy, err := Parse(rules) + if err != nil { + return nil, err + } + policy.ID = id + c.ruleCache.Add(id, policy) + return policy, nil + +} + +// RuleID is used to generate an ID for a rule +func RuleID(rules string) string { + return fmt.Sprintf("%x", md5.Sum([]byte(rules))) +} + +// policyID returns the cache ID for a policy +func (c *Cache) policyID(parent, ruleID string) string { + return parent + ":" + ruleID +} + +// GetACLPolicy is used to get the potentially cached ACL +// policy. If not cached, it will be generated and then cached. +func (c *Cache) GetACLPolicy(id string) (string, *Policy, error) { + // Check for a cached acl + if raw, ok := c.aclCache.Get(id); ok { + cached := raw.(aclEntry) + if raw, ok := c.ruleCache.Get(cached.RuleID); ok { + return cached.Parent, raw.(*Policy), nil + } + } + + // Fault in the rules + parent, rules, err := c.faultfn(id) + if err != nil { + return "", nil, err + } + + // Get cached + policy, err := c.GetPolicy(rules) + return parent, policy, err +} + +// GetACL is used to get a potentially cached ACL policy. +// If not cached, it will be generated and then cached. +func (c *Cache) GetACL(id string) (ACL, error) { + // Look for the ACL directly + raw, ok := c.aclCache.Get(id) + if ok { + return raw.(aclEntry).ACL, nil + } + + // Get the rules + parentID, rules, err := c.faultfn(id) + if err != nil { + return nil, err + } + ruleID := RuleID(rules) + + // Check for a compiled ACL + policyID := c.policyID(parentID, ruleID) + var compiled ACL + if raw, ok := c.policyCache.Get(policyID); ok { + compiled = raw.(ACL) + } else { + // Get the policy + policy, err := c.getPolicy(ruleID, rules) + if err != nil { + return nil, err + } + + // Get the parent ACL + parent := RootACL(parentID) + if parent == nil { + parent, err = c.GetACL(parentID) + if err != nil { + return nil, err + } + } + + // Compile the ACL + acl, err := New(parent, policy) + if err != nil { + return nil, err + } + + // Cache the compiled ACL + c.policyCache.Add(policyID, acl) + compiled = acl + } + + // Cache and return the ACL + c.aclCache.Add(id, aclEntry{compiled, parentID, ruleID}) + return compiled, nil +} + +// ClearACL is used to clear the ACL cache if any +func (c *Cache) ClearACL(id string) { + c.aclCache.Remove(id) +} + +// Purge is used to clear all the ACL caches. The +// rule and policy caches are not purged, since they +// are content-hashed anyways. +func (c *Cache) Purge() { + c.aclCache.Purge() +} diff --git a/vendor/github.com/hashicorp/consul/acl/policy.go b/vendor/github.com/hashicorp/consul/acl/policy.go new file mode 100644 index 000000000..f7781b81e --- /dev/null +++ b/vendor/github.com/hashicorp/consul/acl/policy.go @@ -0,0 +1,191 @@ +package acl + +import ( + "fmt" + + "github.com/hashicorp/hcl" +) + +const ( + PolicyDeny = "deny" + PolicyRead = "read" + PolicyWrite = "write" +) + +// Policy is used to represent the policy specified by +// an ACL configuration. +type Policy struct { + ID string `hcl:"-"` + Agents []*AgentPolicy `hcl:"agent,expand"` + Keys []*KeyPolicy `hcl:"key,expand"` + Nodes []*NodePolicy `hcl:"node,expand"` + Services []*ServicePolicy `hcl:"service,expand"` + Sessions []*SessionPolicy `hcl:"session,expand"` + Events []*EventPolicy `hcl:"event,expand"` + PreparedQueries []*PreparedQueryPolicy `hcl:"query,expand"` + Keyring string `hcl:"keyring"` + Operator string `hcl:"operator"` +} + +// AgentPolicy represents a policy for working with agent endpoints on nodes +// with specific name prefixes. +type AgentPolicy struct { + Node string `hcl:",key"` + Policy string +} + +func (a *AgentPolicy) GoString() string { + return fmt.Sprintf("%#v", *a) +} + +// KeyPolicy represents a policy for a key +type KeyPolicy struct { + Prefix string `hcl:",key"` + Policy string +} + +func (k *KeyPolicy) GoString() string { + return fmt.Sprintf("%#v", *k) +} + +// NodePolicy represents a policy for a node +type NodePolicy struct { + Name string `hcl:",key"` + Policy string +} + +func (n *NodePolicy) GoString() string { + return fmt.Sprintf("%#v", *n) +} + +// ServicePolicy represents a policy for a service +type ServicePolicy struct { + Name string `hcl:",key"` + Policy string +} + +func (s *ServicePolicy) GoString() string { + return fmt.Sprintf("%#v", *s) +} + +// SessionPolicy represents a policy for making sessions tied to specific node +// name prefixes. +type SessionPolicy struct { + Node string `hcl:",key"` + Policy string +} + +func (s *SessionPolicy) GoString() string { + return fmt.Sprintf("%#v", *s) +} + +// EventPolicy represents a user event policy. +type EventPolicy struct { + Event string `hcl:",key"` + Policy string +} + +func (e *EventPolicy) GoString() string { + return fmt.Sprintf("%#v", *e) +} + +// PreparedQueryPolicy represents a prepared query policy. +type PreparedQueryPolicy struct { + Prefix string `hcl:",key"` + Policy string +} + +func (p *PreparedQueryPolicy) GoString() string { + return fmt.Sprintf("%#v", *p) +} + +// isPolicyValid makes sure the given string matches one of the valid policies. +func isPolicyValid(policy string) bool { + switch policy { + case PolicyDeny: + return true + case PolicyRead: + return true + case PolicyWrite: + return true + default: + return false + } +} + +// Parse is used to parse the specified ACL rules into an +// intermediary set of policies, before being compiled into +// the ACL +func Parse(rules string) (*Policy, error) { + // Decode the rules + p := &Policy{} + if rules == "" { + // Hot path for empty rules + return p, nil + } + + if err := hcl.Decode(p, rules); err != nil { + return nil, fmt.Errorf("Failed to parse ACL rules: %v", err) + } + + // Validate the agent policy + for _, ap := range p.Agents { + if !isPolicyValid(ap.Policy) { + return nil, fmt.Errorf("Invalid agent policy: %#v", ap) + } + } + + // Validate the key policy + for _, kp := range p.Keys { + if !isPolicyValid(kp.Policy) { + return nil, fmt.Errorf("Invalid key policy: %#v", kp) + } + } + + // Validate the node policies + for _, np := range p.Nodes { + if !isPolicyValid(np.Policy) { + return nil, fmt.Errorf("Invalid node policy: %#v", np) + } + } + + // Validate the service policies + for _, sp := range p.Services { + if !isPolicyValid(sp.Policy) { + return nil, fmt.Errorf("Invalid service policy: %#v", sp) + } + } + + // Validate the session policies + for _, sp := range p.Sessions { + if !isPolicyValid(sp.Policy) { + return nil, fmt.Errorf("Invalid session policy: %#v", sp) + } + } + + // Validate the user event policies + for _, ep := range p.Events { + if !isPolicyValid(ep.Policy) { + return nil, fmt.Errorf("Invalid event policy: %#v", ep) + } + } + + // Validate the prepared query policies + for _, pq := range p.PreparedQueries { + if !isPolicyValid(pq.Policy) { + return nil, fmt.Errorf("Invalid query policy: %#v", pq) + } + } + + // Validate the keyring policy - this one is allowed to be empty + if p.Keyring != "" && !isPolicyValid(p.Keyring) { + return nil, fmt.Errorf("Invalid keyring policy: %#v", p.Keyring) + } + + // Validate the operator policy - this one is allowed to be empty + if p.Operator != "" && !isPolicyValid(p.Operator) { + return nil, fmt.Errorf("Invalid operator policy: %#v", p.Operator) + } + + return p, nil +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/operator.go b/vendor/github.com/hashicorp/consul/consul/structs/operator.go new file mode 100644 index 000000000..d564400bf --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/operator.go @@ -0,0 +1,57 @@ +package structs + +import ( + "github.com/hashicorp/raft" +) + +// RaftServer has information about a server in the Raft configuration. +type RaftServer struct { + // ID is the unique ID for the server. These are currently the same + // as the address, but they will be changed to a real GUID in a future + // release of Consul. + ID raft.ServerID + + // Node is the node name of the server, as known by Consul, or this + // will be set to "(unknown)" otherwise. + Node string + + // Address is the IP:port of the server, used for Raft communications. + Address raft.ServerAddress + + // Leader is true if this server is the current cluster leader. + Leader bool + + // Voter is true if this server has a vote in the cluster. This might + // be false if the server is staging and still coming online, or if + // it's a non-voting server, which will be added in a future release of + // Consul. + Voter bool +} + +// RaftConfigrationResponse is returned when querying for the current Raft +// configuration. +type RaftConfigurationResponse struct { + // Servers has the list of servers in the Raft configuration. + Servers []*RaftServer + + // Index has the Raft index of this configuration. + Index uint64 +} + +// RaftPeerByAddressRequest is used by the Operator endpoint to apply a Raft +// operation on a specific Raft peer by address in the form of "IP:port". +type RaftPeerByAddressRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // Address is the peer to remove, in the form "IP:port". + Address raft.ServerAddress + + // WriteRequest holds the ACL token to go along with this request. + WriteRequest +} + +// RequestDatacenter returns the datacenter for a given request. +func (op *RaftPeerByAddressRequest) RequestDatacenter() string { + return op.Datacenter +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go b/vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go new file mode 100644 index 000000000..af535f010 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go @@ -0,0 +1,257 @@ +package structs + +// QueryDatacenterOptions sets options about how we fail over if there are no +// healthy nodes in the local datacenter. +type QueryDatacenterOptions struct { + // NearestN is set to the number of remote datacenters to try, based on + // network coordinates. + NearestN int + + // Datacenters is a fixed list of datacenters to try after NearestN. We + // never try a datacenter multiple times, so those are subtracted from + // this list before proceeding. + Datacenters []string +} + +// QueryDNSOptions controls settings when query results are served over DNS. +type QueryDNSOptions struct { + // TTL is the time to live for the served DNS results. + TTL string +} + +// ServiceQuery is used to query for a set of healthy nodes offering a specific +// service. +type ServiceQuery struct { + // Service is the service to query. + Service string + + // Failover controls what we do if there are no healthy nodes in the + // local datacenter. + Failover QueryDatacenterOptions + + // If OnlyPassing is true then we will only include nodes with passing + // health checks (critical AND warning checks will cause a node to be + // discarded) + OnlyPassing bool + + // Near allows the query to always prefer the node nearest the given + // node. If the node does not exist, results are returned in their + // normal randomly-shuffled order. Supplying the magic "_agent" value + // is supported to sort near the agent which initiated the request. + Near string + + // Tags are a set of required and/or disallowed tags. If a tag is in + // this list it must be present. If the tag is preceded with "!" then + // it is disallowed. + Tags []string + + // NodeMeta is a map of required node metadata fields. If a key/value + // pair is in this map it must be present on the node in order for the + // service entry to be returned. + NodeMeta map[string]string +} + +const ( + // QueryTemplateTypeNamePrefixMatch uses the Name field of the query as + // a prefix to select the template. + QueryTemplateTypeNamePrefixMatch = "name_prefix_match" +) + +// QueryTemplateOptions controls settings if this query is a template. +type QueryTemplateOptions struct { + // Type, if non-empty, means that this query is a template. This is + // set to one of the QueryTemplateType* constants above. + Type string + + // Regexp is an optional regular expression to use to parse the full + // name, once the prefix match has selected a template. This can be + // used to extract parts of the name and choose a service name, set + // tags, etc. + Regexp string +} + +// PreparedQuery defines a complete prepared query, and is the structure we +// maintain in the state store. +type PreparedQuery struct { + // ID is this UUID-based ID for the query, always generated by Consul. + ID string + + // Name is an optional friendly name for the query supplied by the + // user. NOTE - if this feature is used then it will reduce the security + // of any read ACL associated with this query/service since this name + // can be used to locate nodes with supplying any ACL. + Name string + + // Session is an optional session to tie this query's lifetime to. If + // this is omitted then the query will not expire. + Session string + + // Token is the ACL token used when the query was created, and it is + // used when a query is subsequently executed. This token, or a token + // with management privileges, must be used to change the query later. + Token string + + // Template is used to configure this query as a template, which will + // respond to queries based on the Name, and then will be rendered + // before it is executed. + Template QueryTemplateOptions + + // Service defines a service query (leaving things open for other types + // later). + Service ServiceQuery + + // DNS has options that control how the results of this query are + // served over DNS. + DNS QueryDNSOptions + + RaftIndex +} + +// GetACLPrefix returns the prefix to look up the prepared_query ACL policy for +// this query, and whether the prefix applies to this query. You always need to +// check the ok value before using the prefix. +func (pq *PreparedQuery) GetACLPrefix() (string, bool) { + if pq.Name != "" || pq.Template.Type != "" { + return pq.Name, true + } + + return "", false +} + +type PreparedQueries []*PreparedQuery + +type IndexedPreparedQueries struct { + Queries PreparedQueries + QueryMeta +} + +type PreparedQueryOp string + +const ( + PreparedQueryCreate PreparedQueryOp = "create" + PreparedQueryUpdate PreparedQueryOp = "update" + PreparedQueryDelete PreparedQueryOp = "delete" +) + +// QueryRequest is used to create or change prepared queries. +type PreparedQueryRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // Op is the operation to apply. + Op PreparedQueryOp + + // Query is the query itself. + Query *PreparedQuery + + // WriteRequest holds the ACL token to go along with this request. + WriteRequest +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQueryRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQuerySpecificRequest is used to get information about a prepared +// query. +type PreparedQuerySpecificRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // QueryID is the ID of a query. + QueryID string + + // QueryOptions (unfortunately named here) controls the consistency + // settings for the query lookup itself, as well as the service lookups. + QueryOptions +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQuerySpecificRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQueryExecuteRequest is used to execute a prepared query. +type PreparedQueryExecuteRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // QueryIDOrName is the ID of a query _or_ the name of one, either can + // be provided. + QueryIDOrName string + + // Limit will trim the resulting list down to the given limit. + Limit int + + // Source is used to sort the results relative to a given node using + // network coordinates. + Source QuerySource + + // Agent is used to carry around a reference to the agent which initiated + // the execute request. Used to distance-sort relative to the local node. + Agent QuerySource + + // QueryOptions (unfortunately named here) controls the consistency + // settings for the query lookup itself, as well as the service lookups. + QueryOptions +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQueryExecuteRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQueryExecuteRemoteRequest is used when running a local query in a +// remote datacenter. +type PreparedQueryExecuteRemoteRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // Query is a copy of the query to execute. We have to ship the entire + // query over since it won't be present in the remote state store. + Query PreparedQuery + + // Limit will trim the resulting list down to the given limit. + Limit int + + // QueryOptions (unfortunately named here) controls the consistency + // settings for the the service lookups. + QueryOptions +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQueryExecuteRemoteRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQueryExecuteResponse has the results of executing a query. +type PreparedQueryExecuteResponse struct { + // Service is the service that was queried. + Service string + + // Nodes has the nodes that were output by the query. + Nodes CheckServiceNodes + + // DNS has the options for serving these results over DNS. + DNS QueryDNSOptions + + // Datacenter is the datacenter that these results came from. + Datacenter string + + // Failovers is a count of how many times we had to query a remote + // datacenter. + Failovers int + + // QueryMeta has freshness information about the query. + QueryMeta +} + +// PreparedQueryExplainResponse has the results when explaining a query/ +type PreparedQueryExplainResponse struct { + // Query has the fully-rendered query. + Query PreparedQuery + + // QueryMeta has freshness information about the query. + QueryMeta +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/snapshot.go b/vendor/github.com/hashicorp/consul/consul/structs/snapshot.go new file mode 100644 index 000000000..3d65e317f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/snapshot.go @@ -0,0 +1,40 @@ +package structs + +type SnapshotOp int + +const ( + SnapshotSave SnapshotOp = iota + SnapshotRestore +) + +// SnapshotRequest is used as a header for a snapshot RPC request. This will +// precede any streaming data that's part of the request and is JSON-encoded on +// the wire. +type SnapshotRequest struct { + // Datacenter is the target datacenter for this request. The request + // will be forwarded if necessary. + Datacenter string + + // Token is the ACL token to use for the operation. If ACLs are enabled + // then all operations require a management token. + Token string + + // If set, any follower can service the request. Results may be + // arbitrarily stale. Only applies to SnapshotSave. + AllowStale bool + + // Op is the operation code for the RPC. + Op SnapshotOp +} + +// SnapshotResponse is used header for a snapshot RPC response. This will +// precede any streaming data that's part of the request and is JSON-encoded on +// the wire. +type SnapshotResponse struct { + // Error is the overall error status of the RPC request. + Error string + + // QueryMeta has freshness information about the server that handled the + // request. It is only filled in for a SnapshotSave. + QueryMeta +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/structs.go b/vendor/github.com/hashicorp/consul/consul/structs/structs.go new file mode 100644 index 000000000..13c67b3d5 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/structs.go @@ -0,0 +1,1041 @@ +package structs + +import ( + "bytes" + "fmt" + "math/rand" + "reflect" + "time" + + "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/types" + "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/serf/coordinate" + "regexp" + "strings" +) + +var ( + ErrNoLeader = fmt.Errorf("No cluster leader") + ErrNoDCPath = fmt.Errorf("No path to datacenter") + ErrNoServers = fmt.Errorf("No known Consul servers") +) + +type MessageType uint8 + +// RaftIndex is used to track the index used while creating +// or modifying a given struct type. +type RaftIndex struct { + CreateIndex uint64 + ModifyIndex uint64 +} + +const ( + RegisterRequestType MessageType = iota + DeregisterRequestType + KVSRequestType + SessionRequestType + ACLRequestType + TombstoneRequestType + CoordinateBatchUpdateType + PreparedQueryRequestType + TxnRequestType +) + +const ( + // IgnoreUnknownTypeFlag is set along with a MessageType + // to indicate that the message type can be safely ignored + // if it is not recognized. This is for future proofing, so + // that new commands can be added in a way that won't cause + // old servers to crash when the FSM attempts to process them. + IgnoreUnknownTypeFlag MessageType = 128 +) + +const ( + // HealthAny is special, and is used as a wild card, + // not as a specific state. + HealthAny = "any" + HealthPassing = "passing" + HealthWarning = "warning" + HealthCritical = "critical" + HealthMaint = "maintenance" +) + +const ( + // NodeMaint is the special key set by a node in maintenance mode. + NodeMaint = "_node_maintenance" + + // ServiceMaintPrefix is the prefix for a service in maintenance mode. + ServiceMaintPrefix = "_service_maintenance:" +) + +const ( + // The meta key prefix reserved for Consul's internal use + metaKeyReservedPrefix = "consul-" + + // The maximum number of metadata key pairs allowed to be registered + metaMaxKeyPairs = 64 + + // The maximum allowed length of a metadata key + metaKeyMaxLength = 128 + + // The maximum allowed length of a metadata value + metaValueMaxLength = 512 +) + +var ( + // metaKeyFormat checks if a metadata key string is valid + metaKeyFormat = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`).MatchString +) + +func ValidStatus(s string) bool { + return s == HealthPassing || + s == HealthWarning || + s == HealthCritical +} + +const ( + // Client tokens have rules applied + ACLTypeClient = "client" + + // Management tokens have an always allow policy. + // They are used for token management. + ACLTypeManagement = "management" +) + +const ( + // MaxLockDelay provides a maximum LockDelay value for + // a session. Any value above this will not be respected. + MaxLockDelay = 60 * time.Second +) + +// RPCInfo is used to describe common information about query +type RPCInfo interface { + RequestDatacenter() string + IsRead() bool + AllowStaleRead() bool + ACLToken() string +} + +// QueryOptions is used to specify various flags for read queries +type QueryOptions struct { + // Token is the ACL token ID. If not provided, the 'anonymous' + // token is assumed for backwards compatibility. + Token string + + // If set, wait until query exceeds given index. Must be provided + // with MaxQueryTime. + MinQueryIndex uint64 + + // Provided with MinQueryIndex to wait for change. + MaxQueryTime time.Duration + + // If set, any follower can service the request. Results + // may be arbitrarily stale. + AllowStale bool + + // If set, the leader must verify leadership prior to + // servicing the request. Prevents a stale read. + RequireConsistent bool +} + +// QueryOption only applies to reads, so always true +func (q QueryOptions) IsRead() bool { + return true +} + +func (q QueryOptions) AllowStaleRead() bool { + return q.AllowStale +} + +func (q QueryOptions) ACLToken() string { + return q.Token +} + +type WriteRequest struct { + // Token is the ACL token ID. If not provided, the 'anonymous' + // token is assumed for backwards compatibility. + Token string +} + +// WriteRequest only applies to writes, always false +func (w WriteRequest) IsRead() bool { + return false +} + +func (w WriteRequest) AllowStaleRead() bool { + return false +} + +func (w WriteRequest) ACLToken() string { + return w.Token +} + +// QueryMeta allows a query response to include potentially +// useful metadata about a query +type QueryMeta struct { + // This is the index associated with the read + Index uint64 + + // If AllowStale is used, this is time elapsed since + // last contact between the follower and leader. This + // can be used to gauge staleness. + LastContact time.Duration + + // Used to indicate if there is a known leader node + KnownLeader bool +} + +// RegisterRequest is used for the Catalog.Register endpoint +// to register a node as providing a service. If no service +// is provided, the node is registered. +type RegisterRequest struct { + Datacenter string + ID types.NodeID + Node string + Address string + TaggedAddresses map[string]string + NodeMeta map[string]string + Service *NodeService + Check *HealthCheck + Checks HealthChecks + WriteRequest +} + +func (r *RegisterRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ChangesNode returns true if the given register request changes the given +// node, which can be nil. This only looks for changes to the node record itself, +// not any of the health checks. +func (r *RegisterRequest) ChangesNode(node *Node) bool { + // This means it's creating the node. + if node == nil { + return true + } + + // Check if any of the node-level fields are being changed. + if r.ID != node.ID || + r.Node != node.Node || + r.Address != node.Address || + !reflect.DeepEqual(r.TaggedAddresses, node.TaggedAddresses) || + !reflect.DeepEqual(r.NodeMeta, node.Meta) { + return true + } + + return false +} + +// DeregisterRequest is used for the Catalog.Deregister endpoint +// to deregister a node as providing a service. If no service is +// provided the entire node is deregistered. +type DeregisterRequest struct { + Datacenter string + Node string + ServiceID string + CheckID types.CheckID + WriteRequest +} + +func (r *DeregisterRequest) RequestDatacenter() string { + return r.Datacenter +} + +// QuerySource is used to pass along information about the source node +// in queries so that we can adjust the response based on its network +// coordinates. +type QuerySource struct { + Datacenter string + Node string +} + +// DCSpecificRequest is used to query about a specific DC +type DCSpecificRequest struct { + Datacenter string + NodeMetaFilters map[string]string + Source QuerySource + QueryOptions +} + +func (r *DCSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ServiceSpecificRequest is used to query about a specific service +type ServiceSpecificRequest struct { + Datacenter string + NodeMetaFilters map[string]string + ServiceName string + ServiceTag string + TagFilter bool // Controls tag filtering + Source QuerySource + QueryOptions +} + +func (r *ServiceSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// NodeSpecificRequest is used to request the information about a single node +type NodeSpecificRequest struct { + Datacenter string + Node string + QueryOptions +} + +func (r *NodeSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ChecksInStateRequest is used to query for nodes in a state +type ChecksInStateRequest struct { + Datacenter string + NodeMetaFilters map[string]string + State string + Source QuerySource + QueryOptions +} + +func (r *ChecksInStateRequest) RequestDatacenter() string { + return r.Datacenter +} + +// Used to return information about a node +type Node struct { + ID types.NodeID + Node string + Address string + TaggedAddresses map[string]string + Meta map[string]string + + RaftIndex +} +type Nodes []*Node + +// ValidateMeta validates a set of key/value pairs from the agent config +func ValidateMetadata(meta map[string]string) error { + if len(meta) > metaMaxKeyPairs { + return fmt.Errorf("Node metadata cannot contain more than %d key/value pairs", metaMaxKeyPairs) + } + + for key, value := range meta { + if err := validateMetaPair(key, value); err != nil { + return fmt.Errorf("Couldn't load metadata pair ('%s', '%s'): %s", key, value, err) + } + } + + return nil +} + +// validateMetaPair checks that the given key/value pair is in a valid format +func validateMetaPair(key, value string) error { + if key == "" { + return fmt.Errorf("Key cannot be blank") + } + if !metaKeyFormat(key) { + return fmt.Errorf("Key contains invalid characters") + } + if len(key) > metaKeyMaxLength { + return fmt.Errorf("Key is too long (limit: %d characters)", metaKeyMaxLength) + } + if strings.HasPrefix(key, metaKeyReservedPrefix) { + return fmt.Errorf("Key prefix '%s' is reserved for internal use", metaKeyReservedPrefix) + } + if len(value) > metaValueMaxLength { + return fmt.Errorf("Value is too long (limit: %d characters)", metaValueMaxLength) + } + return nil +} + +// SatisfiesMetaFilters returns true if the metadata map contains the given filters +func SatisfiesMetaFilters(meta map[string]string, filters map[string]string) bool { + for key, value := range filters { + if v, ok := meta[key]; !ok || v != value { + return false + } + } + return true +} + +// Used to return information about a provided services. +// Maps service name to available tags +type Services map[string][]string + +// ServiceNode represents a node that is part of a service. ID, Address, +// TaggedAddresses, and NodeMeta are node-related fields that are always empty +// in the state store and are filled in on the way out by parseServiceNodes(). +// This is also why PartialClone() skips them, because we know they are blank +// already so it would be a waste of time to copy them. +type ServiceNode struct { + ID types.NodeID + Node string + Address string + TaggedAddresses map[string]string + NodeMeta map[string]string + ServiceID string + ServiceName string + ServiceTags []string + ServiceAddress string + ServicePort int + ServiceEnableTagOverride bool + + RaftIndex +} + +// PartialClone() returns a clone of the given service node, minus the node- +// related fields that get filled in later, Address and TaggedAddresses. +func (s *ServiceNode) PartialClone() *ServiceNode { + tags := make([]string, len(s.ServiceTags)) + copy(tags, s.ServiceTags) + + return &ServiceNode{ + // Skip ID, see above. + Node: s.Node, + // Skip Address, see above. + // Skip TaggedAddresses, see above. + ServiceID: s.ServiceID, + ServiceName: s.ServiceName, + ServiceTags: tags, + ServiceAddress: s.ServiceAddress, + ServicePort: s.ServicePort, + ServiceEnableTagOverride: s.ServiceEnableTagOverride, + RaftIndex: RaftIndex{ + CreateIndex: s.CreateIndex, + ModifyIndex: s.ModifyIndex, + }, + } +} + +// ToNodeService converts the given service node to a node service. +func (s *ServiceNode) ToNodeService() *NodeService { + return &NodeService{ + ID: s.ServiceID, + Service: s.ServiceName, + Tags: s.ServiceTags, + Address: s.ServiceAddress, + Port: s.ServicePort, + EnableTagOverride: s.ServiceEnableTagOverride, + RaftIndex: RaftIndex{ + CreateIndex: s.CreateIndex, + ModifyIndex: s.ModifyIndex, + }, + } +} + +type ServiceNodes []*ServiceNode + +// NodeService is a service provided by a node +type NodeService struct { + ID string + Service string + Tags []string + Address string + Port int + EnableTagOverride bool + + RaftIndex +} + +// IsSame checks if one NodeService is the same as another, without looking +// at the Raft information (that's why we didn't call it IsEqual). This is +// useful for seeing if an update would be idempotent for all the functional +// parts of the structure. +func (s *NodeService) IsSame(other *NodeService) bool { + if s.ID != other.ID || + s.Service != other.Service || + !reflect.DeepEqual(s.Tags, other.Tags) || + s.Address != other.Address || + s.Port != other.Port || + s.EnableTagOverride != other.EnableTagOverride { + return false + } + + return true +} + +// ToServiceNode converts the given node service to a service node. +func (s *NodeService) ToServiceNode(node string) *ServiceNode { + return &ServiceNode{ + // Skip ID, see ServiceNode definition. + Node: node, + // Skip Address, see ServiceNode definition. + // Skip TaggedAddresses, see ServiceNode definition. + ServiceID: s.ID, + ServiceName: s.Service, + ServiceTags: s.Tags, + ServiceAddress: s.Address, + ServicePort: s.Port, + ServiceEnableTagOverride: s.EnableTagOverride, + RaftIndex: RaftIndex{ + CreateIndex: s.CreateIndex, + ModifyIndex: s.ModifyIndex, + }, + } +} + +type NodeServices struct { + Node *Node + Services map[string]*NodeService +} + +// HealthCheck represents a single check on a given node +type HealthCheck struct { + Node string + CheckID types.CheckID // Unique per-node ID + Name string // Check name + Status string // The current check status + Notes string // Additional notes with the status + Output string // Holds output of script runs + ServiceID string // optional associated service + ServiceName string // optional service name + + RaftIndex +} + +// IsSame checks if one HealthCheck is the same as another, without looking +// at the Raft information (that's why we didn't call it IsEqual). This is +// useful for seeing if an update would be idempotent for all the functional +// parts of the structure. +func (c *HealthCheck) IsSame(other *HealthCheck) bool { + if c.Node != other.Node || + c.CheckID != other.CheckID || + c.Name != other.Name || + c.Status != other.Status || + c.Notes != other.Notes || + c.Output != other.Output || + c.ServiceID != other.ServiceID || + c.ServiceName != other.ServiceName { + return false + } + + return true +} + +// Clone returns a distinct clone of the HealthCheck. +func (c *HealthCheck) Clone() *HealthCheck { + clone := new(HealthCheck) + *clone = *c + return clone +} + +// HealthChecks is a collection of HealthCheck structs. +type HealthChecks []*HealthCheck + +// CheckServiceNode is used to provide the node, its service +// definition, as well as a HealthCheck that is associated. +type CheckServiceNode struct { + Node *Node + Service *NodeService + Checks HealthChecks +} +type CheckServiceNodes []CheckServiceNode + +// Shuffle does an in-place random shuffle using the Fisher-Yates algorithm. +func (nodes CheckServiceNodes) Shuffle() { + for i := len(nodes) - 1; i > 0; i-- { + j := rand.Int31n(int32(i + 1)) + nodes[i], nodes[j] = nodes[j], nodes[i] + } +} + +// Filter removes nodes that are failing health checks (and any non-passing +// check if that option is selected). Note that this returns the filtered +// results AND modifies the receiver for performance. +func (nodes CheckServiceNodes) Filter(onlyPassing bool) CheckServiceNodes { + n := len(nodes) +OUTER: + for i := 0; i < n; i++ { + node := nodes[i] + for _, check := range node.Checks { + if check.Status == HealthCritical || + (onlyPassing && check.Status != HealthPassing) { + nodes[i], nodes[n-1] = nodes[n-1], CheckServiceNode{} + n-- + i-- + continue OUTER + } + } + } + return nodes[:n] +} + +// NodeInfo is used to dump all associated information about +// a node. This is currently used for the UI only, as it is +// rather expensive to generate. +type NodeInfo struct { + ID types.NodeID + Node string + Address string + TaggedAddresses map[string]string + Meta map[string]string + Services []*NodeService + Checks HealthChecks +} + +// NodeDump is used to dump all the nodes with all their +// associated data. This is currently used for the UI only, +// as it is rather expensive to generate. +type NodeDump []*NodeInfo + +type IndexedNodes struct { + Nodes Nodes + QueryMeta +} + +type IndexedServices struct { + Services Services + QueryMeta +} + +type IndexedServiceNodes struct { + ServiceNodes ServiceNodes + QueryMeta +} + +type IndexedNodeServices struct { + NodeServices *NodeServices + QueryMeta +} + +type IndexedHealthChecks struct { + HealthChecks HealthChecks + QueryMeta +} + +type IndexedCheckServiceNodes struct { + Nodes CheckServiceNodes + QueryMeta +} + +type IndexedNodeDump struct { + Dump NodeDump + QueryMeta +} + +// DirEntry is used to represent a directory entry. This is +// used for values in our Key-Value store. +type DirEntry struct { + LockIndex uint64 + Key string + Flags uint64 + Value []byte + Session string `json:",omitempty"` + + RaftIndex +} + +// Returns a clone of the given directory entry. +func (d *DirEntry) Clone() *DirEntry { + return &DirEntry{ + LockIndex: d.LockIndex, + Key: d.Key, + Flags: d.Flags, + Value: d.Value, + Session: d.Session, + RaftIndex: RaftIndex{ + CreateIndex: d.CreateIndex, + ModifyIndex: d.ModifyIndex, + }, + } +} + +type DirEntries []*DirEntry + +type KVSOp string + +const ( + KVSSet KVSOp = "set" + KVSDelete = "delete" + KVSDeleteCAS = "delete-cas" // Delete with check-and-set + KVSDeleteTree = "delete-tree" + KVSCAS = "cas" // Check-and-set + KVSLock = "lock" // Lock a key + KVSUnlock = "unlock" // Unlock a key + + // The following operations are only available inside of atomic + // transactions via the Txn request. + KVSGet = "get" // Read the key during the transaction. + KVSGetTree = "get-tree" // Read all keys with the given prefix during the transaction. + KVSCheckSession = "check-session" // Check the session holds the key. + KVSCheckIndex = "check-index" // Check the modify index of the key. +) + +// IsWrite returns true if the given operation alters the state store. +func (op KVSOp) IsWrite() bool { + switch op { + case KVSGet, KVSGetTree, KVSCheckSession, KVSCheckIndex: + return false + + default: + return true + } +} + +// KVSRequest is used to operate on the Key-Value store +type KVSRequest struct { + Datacenter string + Op KVSOp // Which operation are we performing + DirEnt DirEntry // Which directory entry + WriteRequest +} + +func (r *KVSRequest) RequestDatacenter() string { + return r.Datacenter +} + +// KeyRequest is used to request a key, or key prefix +type KeyRequest struct { + Datacenter string + Key string + QueryOptions +} + +func (r *KeyRequest) RequestDatacenter() string { + return r.Datacenter +} + +// KeyListRequest is used to list keys +type KeyListRequest struct { + Datacenter string + Prefix string + Seperator string + QueryOptions +} + +func (r *KeyListRequest) RequestDatacenter() string { + return r.Datacenter +} + +type IndexedDirEntries struct { + Entries DirEntries + QueryMeta +} + +type IndexedKeyList struct { + Keys []string + QueryMeta +} + +type SessionBehavior string + +const ( + SessionKeysRelease SessionBehavior = "release" + SessionKeysDelete = "delete" +) + +const ( + SessionTTLMax = 24 * time.Hour + SessionTTLMultiplier = 2 +) + +// Session is used to represent an open session in the KV store. +// This issued to associate node checks with acquired locks. +type Session struct { + ID string + Name string + Node string + Checks []types.CheckID + LockDelay time.Duration + Behavior SessionBehavior // What to do when session is invalidated + TTL string + + RaftIndex +} +type Sessions []*Session + +type SessionOp string + +const ( + SessionCreate SessionOp = "create" + SessionDestroy = "destroy" +) + +// SessionRequest is used to operate on sessions +type SessionRequest struct { + Datacenter string + Op SessionOp // Which operation are we performing + Session Session // Which session + WriteRequest +} + +func (r *SessionRequest) RequestDatacenter() string { + return r.Datacenter +} + +// SessionSpecificRequest is used to request a session by ID +type SessionSpecificRequest struct { + Datacenter string + Session string + QueryOptions +} + +func (r *SessionSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +type IndexedSessions struct { + Sessions Sessions + QueryMeta +} + +// ACL is used to represent a token and its rules +type ACL struct { + ID string + Name string + Type string + Rules string + + RaftIndex +} +type ACLs []*ACL + +type ACLOp string + +const ( + ACLSet ACLOp = "set" + ACLForceSet = "force-set" // Deprecated, left to backwards compatibility + ACLDelete = "delete" +) + +// IsSame checks if one ACL is the same as another, without looking +// at the Raft information (that's why we didn't call it IsEqual). This is +// useful for seeing if an update would be idempotent for all the functional +// parts of the structure. +func (a *ACL) IsSame(other *ACL) bool { + if a.ID != other.ID || + a.Name != other.Name || + a.Type != other.Type || + a.Rules != other.Rules { + return false + } + + return true +} + +// ACLRequest is used to create, update or delete an ACL +type ACLRequest struct { + Datacenter string + Op ACLOp + ACL ACL + WriteRequest +} + +func (r *ACLRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ACLRequests is a list of ACL change requests. +type ACLRequests []*ACLRequest + +// ACLSpecificRequest is used to request an ACL by ID +type ACLSpecificRequest struct { + Datacenter string + ACL string + QueryOptions +} + +func (r *ACLSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ACLPolicyRequest is used to request an ACL by ID, conditionally +// filtering on an ID +type ACLPolicyRequest struct { + Datacenter string + ACL string + ETag string + QueryOptions +} + +func (r *ACLPolicyRequest) RequestDatacenter() string { + return r.Datacenter +} + +type IndexedACLs struct { + ACLs ACLs + QueryMeta +} + +type ACLPolicy struct { + ETag string + Parent string + Policy *acl.Policy + TTL time.Duration + QueryMeta +} + +// ACLReplicationStatus provides information about the health of the ACL +// replication system. +type ACLReplicationStatus struct { + Enabled bool + Running bool + SourceDatacenter string + ReplicatedIndex uint64 + LastSuccess time.Time + LastError time.Time +} + +// Coordinate stores a node name with its associated network coordinate. +type Coordinate struct { + Node string + Coord *coordinate.Coordinate +} + +type Coordinates []*Coordinate + +// IndexedCoordinate is used to represent a single node's coordinate from the state +// store. +type IndexedCoordinate struct { + Coord *coordinate.Coordinate + QueryMeta +} + +// IndexedCoordinates is used to represent a list of nodes and their +// corresponding raw coordinates. +type IndexedCoordinates struct { + Coordinates Coordinates + QueryMeta +} + +// DatacenterMap is used to represent a list of nodes with their raw coordinates, +// associated with a datacenter. +type DatacenterMap struct { + Datacenter string + Coordinates Coordinates +} + +// CoordinateUpdateRequest is used to update the network coordinate of a given +// node. +type CoordinateUpdateRequest struct { + Datacenter string + Node string + Coord *coordinate.Coordinate + WriteRequest +} + +// RequestDatacenter returns the datacenter for a given update request. +func (c *CoordinateUpdateRequest) RequestDatacenter() string { + return c.Datacenter +} + +// EventFireRequest is used to ask a server to fire +// a Serf event. It is a bit odd, since it doesn't depend on +// the catalog or leader. Any node can respond, so it's not quite +// like a standard write request. This is used only internally. +type EventFireRequest struct { + Datacenter string + Name string + Payload []byte + + // Not using WriteRequest so that any server can process + // the request. It is a bit unusual... + QueryOptions +} + +func (r *EventFireRequest) RequestDatacenter() string { + return r.Datacenter +} + +// EventFireResponse is used to respond to a fire request. +type EventFireResponse struct { + QueryMeta +} + +type TombstoneOp string + +const ( + TombstoneReap TombstoneOp = "reap" +) + +// TombstoneRequest is used to trigger a reaping of the tombstones +type TombstoneRequest struct { + Datacenter string + Op TombstoneOp + ReapIndex uint64 + WriteRequest +} + +func (r *TombstoneRequest) RequestDatacenter() string { + return r.Datacenter +} + +// msgpackHandle is a shared handle for encoding/decoding of structs +var msgpackHandle = &codec.MsgpackHandle{} + +// Decode is used to decode a MsgPack encoded object +func Decode(buf []byte, out interface{}) error { + return codec.NewDecoder(bytes.NewReader(buf), msgpackHandle).Decode(out) +} + +// Encode is used to encode a MsgPack object with type prefix +func Encode(t MessageType, msg interface{}) ([]byte, error) { + var buf bytes.Buffer + buf.WriteByte(uint8(t)) + err := codec.NewEncoder(&buf, msgpackHandle).Encode(msg) + return buf.Bytes(), err +} + +// CompoundResponse is an interface for gathering multiple responses. It is +// used in cross-datacenter RPC calls where more than 1 datacenter is +// expected to reply. +type CompoundResponse interface { + // Add adds a new response to the compound response + Add(interface{}) + + // New returns an empty response object which can be passed around by + // reference, and then passed to Add() later on. + New() interface{} +} + +type KeyringOp string + +const ( + KeyringList KeyringOp = "list" + KeyringInstall = "install" + KeyringUse = "use" + KeyringRemove = "remove" +) + +// KeyringRequest encapsulates a request to modify an encryption keyring. +// It can be used for install, remove, or use key type operations. +type KeyringRequest struct { + Operation KeyringOp + Key string + Datacenter string + Forwarded bool + RelayFactor uint8 + QueryOptions +} + +func (r *KeyringRequest) RequestDatacenter() string { + return r.Datacenter +} + +// KeyringResponse is a unified key response and can be used for install, +// remove, use, as well as listing key queries. +type KeyringResponse struct { + WAN bool + Datacenter string + Messages map[string]string `json:",omitempty"` + Keys map[string]int + NumNodes int + Error string `json:",omitempty"` +} + +// KeyringResponses holds multiple responses to keyring queries. Each +// datacenter replies independently, and KeyringResponses is used as a +// container for the set of all responses. +type KeyringResponses struct { + Responses []*KeyringResponse + QueryMeta +} + +func (r *KeyringResponses) Add(v interface{}) { + val := v.(*KeyringResponses) + r.Responses = append(r.Responses, val.Responses...) +} + +func (r *KeyringResponses) New() interface{} { + return new(KeyringResponses) +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/txn.go b/vendor/github.com/hashicorp/consul/consul/structs/txn.go new file mode 100644 index 000000000..3f8035b97 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/txn.go @@ -0,0 +1,85 @@ +package structs + +import ( + "fmt" +) + +// TxnKVOp is used to define a single operation on the KVS inside a +// transaction +type TxnKVOp struct { + Verb KVSOp + DirEnt DirEntry +} + +// TxnKVResult is used to define the result of a single operation on the KVS +// inside a transaction. +type TxnKVResult *DirEntry + +// TxnOp is used to define a single operation inside a transaction. Only one +// of the types should be filled out per entry. +type TxnOp struct { + KV *TxnKVOp +} + +// TxnOps is a list of operations within a transaction. +type TxnOps []*TxnOp + +// TxnRequest is used to apply multiple operations to the state store in a +// single transaction +type TxnRequest struct { + Datacenter string + Ops TxnOps + WriteRequest +} + +func (r *TxnRequest) RequestDatacenter() string { + return r.Datacenter +} + +// TxnReadRequest is used as a fast path for read-only transactions that don't +// modify the state store. +type TxnReadRequest struct { + Datacenter string + Ops TxnOps + QueryOptions +} + +func (r *TxnReadRequest) RequestDatacenter() string { + return r.Datacenter +} + +// TxnError is used to return information about an error for a specific +// operation. +type TxnError struct { + OpIndex int + What string +} + +// Error returns the string representation of an atomic error. +func (e TxnError) Error() string { + return fmt.Sprintf("op %d: %s", e.OpIndex, e.What) +} + +// TxnErrors is a list of TxnError entries. +type TxnErrors []*TxnError + +// TxnResult is used to define the result of a given operation inside a +// transaction. Only one of the types should be filled out per entry. +type TxnResult struct { + KV TxnKVResult +} + +// TxnResults is a list of TxnResult entries. +type TxnResults []*TxnResult + +// TxnResponse is the structure returned by a TxnRequest. +type TxnResponse struct { + Results TxnResults + Errors TxnErrors +} + +// TxnReadResponse is the structure returned by a TxnReadRequest. +type TxnReadResponse struct { + TxnResponse + QueryMeta +} diff --git a/vendor/github.com/hashicorp/consul/testutil/README.md b/vendor/github.com/hashicorp/consul/testutil/README.md new file mode 100644 index 000000000..21eb01d2a --- /dev/null +++ b/vendor/github.com/hashicorp/consul/testutil/README.md @@ -0,0 +1,65 @@ +Consul Testing Utilities +======================== + +This package provides some generic helpers to facilitate testing in Consul. + +TestServer +========== + +TestServer is a harness for managing Consul agents and initializing them with +test data. Using it, you can form test clusters, create services, add health +checks, manipulate the K/V store, etc. This test harness is completely decoupled +from Consul's core and API client, meaning it can be easily imported and used in +external unit tests for various applications. It works by invoking the Consul +CLI, which means it is a requirement to have Consul installed in the `$PATH`. + +Following is an example usage: + +```go +package my_program + +import ( + "testing" + + "github.com/hashicorp/consul/consul/structs" + "github.com/hashicorp/consul/testutil" +) + +func TestMain(t *testing.T) { + // Create a test Consul server + srv1 := testutil.NewTestServer(t) + defer srv1.Stop() + + // Create a secondary server, passing in configuration + // to avoid bootstrapping as we are forming a cluster. + srv2 := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { + c.Bootstrap = false + }) + defer srv2.Stop() + + // Join the servers together + srv1.JoinLAN(srv2.LANAddr) + + // Create a test key/value pair + srv1.SetKV("foo", []byte("bar")) + + // Create lots of test key/value pairs + srv1.PopulateKV(map[string][]byte{ + "bar": []byte("123"), + "baz": []byte("456"), + }) + + // Create a service + srv1.AddService("redis", structs.HealthPassing, []string{"master"}) + + // Create a service check + srv1.AddCheck("service:redis", "redis", structs.HealthPassing) + + // Create a node check + srv1.AddCheck("mem", "", structs.HealthCritical) + + // The HTTPAddr field contains the address of the Consul + // API on the new test server instance. + println(srv1.HTTPAddr) +} +``` diff --git a/vendor/github.com/hashicorp/consul/testutil/server.go b/vendor/github.com/hashicorp/consul/testutil/server.go new file mode 100644 index 000000000..7daa21ed6 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/testutil/server.go @@ -0,0 +1,528 @@ +package testutil + +// TestServer is a test helper. It uses a fork/exec model to create +// a test Consul server instance in the background and initialize it +// with some data and/or services. The test server can then be used +// to run a unit test, and offers an easy API to tear itself down +// when the test has completed. The only prerequisite is to have a consul +// binary available on the $PATH. +// +// This package does not use Consul's official API client. This is +// because we use TestServer to test the API client, which would +// otherwise cause an import cycle. + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "net/http" + "os" + "os/exec" + "strconv" + "strings" + + "github.com/hashicorp/consul/consul/structs" + "github.com/hashicorp/go-cleanhttp" +) + +// TestPerformanceConfig configures the performance parameters. +type TestPerformanceConfig struct { + RaftMultiplier uint `json:"raft_multiplier,omitempty"` +} + +// TestPortConfig configures the various ports used for services +// provided by the Consul server. +type TestPortConfig struct { + DNS int `json:"dns,omitempty"` + HTTP int `json:"http,omitempty"` + RPC int `json:"rpc,omitempty"` + SerfLan int `json:"serf_lan,omitempty"` + SerfWan int `json:"serf_wan,omitempty"` + Server int `json:"server,omitempty"` +} + +// TestAddressConfig contains the bind addresses for various +// components of the Consul server. +type TestAddressConfig struct { + HTTP string `json:"http,omitempty"` +} + +// TestServerConfig is the main server configuration struct. +type TestServerConfig struct { + NodeName string `json:"node_name"` + NodeMeta map[string]string `json:"node_meta,omitempty"` + Performance *TestPerformanceConfig `json:"performance,omitempty"` + Bootstrap bool `json:"bootstrap,omitempty"` + Server bool `json:"server,omitempty"` + DataDir string `json:"data_dir,omitempty"` + Datacenter string `json:"datacenter,omitempty"` + DisableCheckpoint bool `json:"disable_update_check"` + LogLevel string `json:"log_level,omitempty"` + Bind string `json:"bind_addr,omitempty"` + Addresses *TestAddressConfig `json:"addresses,omitempty"` + Ports *TestPortConfig `json:"ports,omitempty"` + ACLMasterToken string `json:"acl_master_token,omitempty"` + ACLDatacenter string `json:"acl_datacenter,omitempty"` + ACLDefaultPolicy string `json:"acl_default_policy,omitempty"` + Encrypt string `json:"encrypt,omitempty"` + Stdout, Stderr io.Writer `json:"-"` + Args []string `json:"-"` +} + +// ServerConfigCallback is a function interface which can be +// passed to NewTestServerConfig to modify the server config. +type ServerConfigCallback func(c *TestServerConfig) + +// defaultServerConfig returns a new TestServerConfig struct +// with all of the listen ports incremented by one. +func defaultServerConfig() *TestServerConfig { + return &TestServerConfig{ + NodeName: fmt.Sprintf("node%d", randomPort()), + DisableCheckpoint: true, + Performance: &TestPerformanceConfig{ + RaftMultiplier: 1, + }, + Bootstrap: true, + Server: true, + LogLevel: "debug", + Bind: "127.0.0.1", + Addresses: &TestAddressConfig{}, + Ports: &TestPortConfig{ + DNS: randomPort(), + HTTP: randomPort(), + RPC: randomPort(), + SerfLan: randomPort(), + SerfWan: randomPort(), + Server: randomPort(), + }, + } +} + +// randomPort asks the kernel for a random port to use. +func randomPort() int { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + panic(err) + } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +} + +// TestService is used to serialize a service definition. +type TestService struct { + ID string `json:",omitempty"` + Name string `json:",omitempty"` + Tags []string `json:",omitempty"` + Address string `json:",omitempty"` + Port int `json:",omitempty"` +} + +// TestCheck is used to serialize a check definition. +type TestCheck struct { + ID string `json:",omitempty"` + Name string `json:",omitempty"` + ServiceID string `json:",omitempty"` + TTL string `json:",omitempty"` +} + +// TestingT is an interface wrapper around TestingT +type TestingT interface { + Logf(format string, args ...interface{}) + Errorf(format string, args ...interface{}) + Fatalf(format string, args ...interface{}) + Fatal(args ...interface{}) + Skip(args ...interface{}) +} + +// TestKVResponse is what we use to decode KV data. +type TestKVResponse struct { + Value string +} + +// TestServer is the main server wrapper struct. +type TestServer struct { + cmd *exec.Cmd + Config *TestServerConfig + t TestingT + + HTTPAddr string + LANAddr string + WANAddr string + + HttpClient *http.Client +} + +// NewTestServer is an easy helper method to create a new Consul +// test server with the most basic configuration. +func NewTestServer(t TestingT) *TestServer { + return NewTestServerConfig(t, nil) +} + +// NewTestServerConfig creates a new TestServer, and makes a call to +// an optional callback function to modify the configuration. +func NewTestServerConfig(t TestingT, cb ServerConfigCallback) *TestServer { + if path, err := exec.LookPath("consul"); err != nil || path == "" { + t.Fatal("consul not found on $PATH - download and install " + + "consul or skip this test") + } + + dataDir, err := ioutil.TempDir("", "consul") + if err != nil { + t.Fatalf("err: %s", err) + } + + configFile, err := ioutil.TempFile(dataDir, "config") + if err != nil { + defer os.RemoveAll(dataDir) + t.Fatalf("err: %s", err) + } + + consulConfig := defaultServerConfig() + consulConfig.DataDir = dataDir + + if cb != nil { + cb(consulConfig) + } + + configContent, err := json.Marshal(consulConfig) + if err != nil { + t.Fatalf("err: %s", err) + } + + if _, err := configFile.Write(configContent); err != nil { + t.Fatalf("err: %s", err) + } + configFile.Close() + + stdout := io.Writer(os.Stdout) + if consulConfig.Stdout != nil { + stdout = consulConfig.Stdout + } + + stderr := io.Writer(os.Stderr) + if consulConfig.Stderr != nil { + stderr = consulConfig.Stderr + } + + // Start the server + args := []string{"agent", "-config-file", configFile.Name()} + args = append(args, consulConfig.Args...) + cmd := exec.Command("consul", args...) + cmd.Stdout = stdout + cmd.Stderr = stderr + if err := cmd.Start(); err != nil { + t.Fatalf("err: %s", err) + } + + var httpAddr string + var client *http.Client + if strings.HasPrefix(consulConfig.Addresses.HTTP, "unix://") { + httpAddr = consulConfig.Addresses.HTTP + trans := cleanhttp.DefaultTransport() + trans.Dial = func(_, _ string) (net.Conn, error) { + return net.Dial("unix", httpAddr[7:]) + } + client = &http.Client{ + Transport: trans, + } + } else { + httpAddr = fmt.Sprintf("127.0.0.1:%d", consulConfig.Ports.HTTP) + client = cleanhttp.DefaultClient() + } + + server := &TestServer{ + Config: consulConfig, + cmd: cmd, + t: t, + + HTTPAddr: httpAddr, + LANAddr: fmt.Sprintf("127.0.0.1:%d", consulConfig.Ports.SerfLan), + WANAddr: fmt.Sprintf("127.0.0.1:%d", consulConfig.Ports.SerfWan), + + HttpClient: client, + } + + // Wait for the server to be ready + if consulConfig.Bootstrap { + server.waitForLeader() + } else { + server.waitForAPI() + } + + return server +} + +// Stop stops the test Consul server, and removes the Consul data +// directory once we are done. +func (s *TestServer) Stop() { + defer os.RemoveAll(s.Config.DataDir) + + if err := s.cmd.Process.Kill(); err != nil { + s.t.Errorf("err: %s", err) + } + + // wait for the process to exit to be sure that the data dir can be + // deleted on all platforms. + s.cmd.Wait() +} + +// waitForAPI waits for only the agent HTTP endpoint to start +// responding. This is an indication that the agent has started, +// but will likely return before a leader is elected. +func (s *TestServer) waitForAPI() { + WaitForResult(func() (bool, error) { + resp, err := s.HttpClient.Get(s.url("/v1/agent/self")) + if err != nil { + return false, err + } + defer resp.Body.Close() + if err := s.requireOK(resp); err != nil { + return false, err + } + return true, nil + }, func(err error) { + defer s.Stop() + s.t.Fatalf("err: %s", err) + }) +} + +// waitForLeader waits for the Consul server's HTTP API to become +// available, and then waits for a known leader and an index of +// 1 or more to be observed to confirm leader election is done. +// It then waits to ensure the anti-entropy sync has completed. +func (s *TestServer) waitForLeader() { + var index int64 + WaitForResult(func() (bool, error) { + // Query the API and check the status code. + url := s.url(fmt.Sprintf("/v1/catalog/nodes?index=%d&wait=2s", index)) + resp, err := s.HttpClient.Get(url) + if err != nil { + return false, err + } + defer resp.Body.Close() + if err := s.requireOK(resp); err != nil { + return false, err + } + + // Ensure we have a leader and a node registration. + if leader := resp.Header.Get("X-Consul-KnownLeader"); leader != "true" { + return false, fmt.Errorf("Consul leader status: %#v", leader) + } + index, err = strconv.ParseInt(resp.Header.Get("X-Consul-Index"), 10, 64) + if err != nil { + return false, fmt.Errorf("Consul index was bad: %v", err) + } + if index == 0 { + return false, fmt.Errorf("Consul index is 0") + } + + // Watch for the anti-entropy sync to finish. + var parsed []map[string]interface{} + dec := json.NewDecoder(resp.Body) + if err := dec.Decode(&parsed); err != nil { + return false, err + } + if len(parsed) < 1 { + return false, fmt.Errorf("No nodes") + } + taggedAddresses, ok := parsed[0]["TaggedAddresses"].(map[string]interface{}) + if !ok { + return false, fmt.Errorf("Missing tagged addresses") + } + if _, ok := taggedAddresses["lan"]; !ok { + return false, fmt.Errorf("No lan tagged addresses") + } + return true, nil + }, func(err error) { + defer s.Stop() + s.t.Fatalf("err: %s", err) + }) +} + +// url is a helper function which takes a relative URL and +// makes it into a proper URL against the local Consul server. +func (s *TestServer) url(path string) string { + return fmt.Sprintf("http://127.0.0.1:%d%s", s.Config.Ports.HTTP, path) +} + +// requireOK checks the HTTP response code and ensures it is acceptable. +func (s *TestServer) requireOK(resp *http.Response) error { + if resp.StatusCode != 200 { + return fmt.Errorf("Bad status code: %d", resp.StatusCode) + } + return nil +} + +// put performs a new HTTP PUT request. +func (s *TestServer) put(path string, body io.Reader) *http.Response { + req, err := http.NewRequest("PUT", s.url(path), body) + if err != nil { + s.t.Fatalf("err: %s", err) + } + resp, err := s.HttpClient.Do(req) + if err != nil { + s.t.Fatalf("err: %s", err) + } + if err := s.requireOK(resp); err != nil { + defer resp.Body.Close() + s.t.Fatal(err) + } + return resp +} + +// get performs a new HTTP GET request. +func (s *TestServer) get(path string) *http.Response { + resp, err := s.HttpClient.Get(s.url(path)) + if err != nil { + s.t.Fatalf("err: %s", err) + } + if err := s.requireOK(resp); err != nil { + defer resp.Body.Close() + s.t.Fatal(err) + } + return resp +} + +// encodePayload returns a new io.Reader wrapping the encoded contents +// of the payload, suitable for passing directly to a new request. +func (s *TestServer) encodePayload(payload interface{}) io.Reader { + var encoded bytes.Buffer + enc := json.NewEncoder(&encoded) + if err := enc.Encode(payload); err != nil { + s.t.Fatalf("err: %s", err) + } + return &encoded +} + +// JoinLAN is used to join nodes within the same datacenter. +func (s *TestServer) JoinLAN(addr string) { + resp := s.get("/v1/agent/join/" + addr) + resp.Body.Close() +} + +// JoinWAN is used to join remote datacenters together. +func (s *TestServer) JoinWAN(addr string) { + resp := s.get("/v1/agent/join/" + addr + "?wan=1") + resp.Body.Close() +} + +// SetKV sets an individual key in the K/V store. +func (s *TestServer) SetKV(key string, val []byte) { + resp := s.put("/v1/kv/"+key, bytes.NewBuffer(val)) + resp.Body.Close() +} + +// GetKV retrieves a single key and returns its value +func (s *TestServer) GetKV(key string) []byte { + resp := s.get("/v1/kv/" + key) + defer resp.Body.Close() + + raw, err := ioutil.ReadAll(resp.Body) + if err != nil { + s.t.Fatalf("err: %s", err) + } + + var result []*TestKVResponse + if err := json.Unmarshal(raw, &result); err != nil { + s.t.Fatalf("err: %s", err) + } + if len(result) < 1 { + s.t.Fatalf("key does not exist: %s", key) + } + + v, err := base64.StdEncoding.DecodeString(result[0].Value) + if err != nil { + s.t.Fatalf("err: %s", err) + } + + return v +} + +// PopulateKV fills the Consul KV with data from a generic map. +func (s *TestServer) PopulateKV(data map[string][]byte) { + for k, v := range data { + s.SetKV(k, v) + } +} + +// ListKV returns a list of keys present in the KV store. This will list all +// keys under the given prefix recursively and return them as a slice. +func (s *TestServer) ListKV(prefix string) []string { + resp := s.get("/v1/kv/" + prefix + "?keys") + defer resp.Body.Close() + + raw, err := ioutil.ReadAll(resp.Body) + if err != nil { + s.t.Fatalf("err: %s", err) + } + + var result []string + if err := json.Unmarshal(raw, &result); err != nil { + s.t.Fatalf("err: %s", err) + } + return result +} + +// AddService adds a new service to the Consul instance. It also +// automatically adds a health check with the given status, which +// can be one of "passing", "warning", or "critical". +func (s *TestServer) AddService(name, status string, tags []string) { + svc := &TestService{ + Name: name, + Tags: tags, + } + payload := s.encodePayload(svc) + s.put("/v1/agent/service/register", payload) + + chkName := "service:" + name + chk := &TestCheck{ + Name: chkName, + ServiceID: name, + TTL: "10m", + } + payload = s.encodePayload(chk) + s.put("/v1/agent/check/register", payload) + + switch status { + case structs.HealthPassing: + s.put("/v1/agent/check/pass/"+chkName, nil) + case structs.HealthWarning: + s.put("/v1/agent/check/warn/"+chkName, nil) + case structs.HealthCritical: + s.put("/v1/agent/check/fail/"+chkName, nil) + default: + s.t.Fatalf("Unrecognized status: %s", status) + } +} + +// AddCheck adds a check to the Consul instance. If the serviceID is +// left empty (""), then the check will be associated with the node. +// The check status may be "passing", "warning", or "critical". +func (s *TestServer) AddCheck(name, serviceID, status string) { + chk := &TestCheck{ + ID: name, + Name: name, + TTL: "10m", + } + if serviceID != "" { + chk.ServiceID = serviceID + } + + payload := s.encodePayload(chk) + s.put("/v1/agent/check/register", payload) + + switch status { + case structs.HealthPassing: + s.put("/v1/agent/check/pass/"+name, nil) + case structs.HealthWarning: + s.put("/v1/agent/check/warn/"+name, nil) + case structs.HealthCritical: + s.put("/v1/agent/check/fail/"+name, nil) + default: + s.t.Fatalf("Unrecognized status: %s", status) + } +} diff --git a/vendor/github.com/hashicorp/consul/testutil/wait.go b/vendor/github.com/hashicorp/consul/testutil/wait.go new file mode 100644 index 000000000..bd240796f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/testutil/wait.go @@ -0,0 +1,62 @@ +package testutil + +import ( + "fmt" + "testing" + "time" + + "github.com/hashicorp/consul/consul/structs" +) + +type testFn func() (bool, error) +type errorFn func(error) + +const ( + baseWait = 1 * time.Millisecond + maxWait = 100 * time.Millisecond +) + +func WaitForResult(try testFn, fail errorFn) { + var err error + wait := baseWait + for retries := 100; retries > 0; retries-- { + var success bool + success, err = try() + if success { + time.Sleep(25 * time.Millisecond) + return + } + + time.Sleep(wait) + wait *= 2 + if wait > maxWait { + wait = maxWait + } + } + fail(err) +} + +type rpcFn func(string, interface{}, interface{}) error + +func WaitForLeader(t *testing.T, rpc rpcFn, dc string) structs.IndexedNodes { + var out structs.IndexedNodes + WaitForResult(func() (bool, error) { + // Ensure we have a leader and a node registration. + args := &structs.DCSpecificRequest{ + Datacenter: dc, + } + if err := rpc("Catalog.ListNodes", args, &out); err != nil { + return false, fmt.Errorf("Catalog.ListNodes failed: %v", err) + } + if !out.QueryMeta.KnownLeader { + return false, fmt.Errorf("No leader") + } + if out.Index == 0 { + return false, fmt.Errorf("Consul index is 0") + } + return true, nil + }, func(err error) { + t.Fatalf("failed to find leader: %v", err) + }) + return out +} diff --git a/vendor/github.com/hashicorp/consul/types/README.md b/vendor/github.com/hashicorp/consul/types/README.md new file mode 100644 index 000000000..da662f4a1 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/types/README.md @@ -0,0 +1,39 @@ +# Consul `types` Package + +The Go language has a strong type system built into the language. The +`types` package corrals named types into a single package that is terminal in +`go`'s import graph. The `types` package should not have any downstream +dependencies. Each subsystem that defines its own set of types exists in its +own file, but all types are defined in the same package. + +# Why + +> Everything should be made as simple as possible, but not simpler. + +`string` is a useful container and underlying type for identifiers, however +the `string` type is effectively opaque to the compiler in terms of how a +given string is intended to be used. For instance, there is nothing +preventing the following from happening: + +```go +// `map` of Widgets, looked up by ID +var widgetLookup map[string]*Widget +// ... +var widgetID string = "widgetID" +w, found := widgetLookup[widgetID] + +// Bad! +var widgetName string = "name of widget" +w, found := widgetLookup[widgetName] +``` + +but this class of problem is entirely preventable: + +```go +type WidgetID string +var widgetLookup map[WidgetID]*Widget +var widgetName +``` + +TL;DR: intentions and idioms aren't statically checked by compilers. The +`types` package uses Go's strong type system to prevent this class of bug. diff --git a/vendor/github.com/hashicorp/consul/types/checks.go b/vendor/github.com/hashicorp/consul/types/checks.go new file mode 100644 index 000000000..25a136b4f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/types/checks.go @@ -0,0 +1,5 @@ +package types + +// CheckID is a strongly typed string used to uniquely represent a Consul +// Check on an Agent (a CheckID is not globally unique). +type CheckID string diff --git a/vendor/github.com/hashicorp/consul/types/node_id.go b/vendor/github.com/hashicorp/consul/types/node_id.go new file mode 100644 index 000000000..c0588ed42 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/types/node_id.go @@ -0,0 +1,4 @@ +package types + +// NodeID is a unique identifier for a node across space and time. +type NodeID string diff --git a/vendor/github.com/hashicorp/go-sockaddr/LICENSE b/vendor/github.com/hashicorp/go-sockaddr/LICENSE new file mode 100644 index 000000000..a612ad981 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/vendor/github.com/hashicorp/go-sockaddr/Makefile b/vendor/github.com/hashicorp/go-sockaddr/Makefile new file mode 100644 index 000000000..224135dc1 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/Makefile @@ -0,0 +1,63 @@ +TOOLS= golang.org/x/tools/cover +GOCOVER_TMPFILE?= $(GOCOVER_FILE).tmp +GOCOVER_FILE?= .cover.out +GOCOVERHTML?= coverage.html + +test:: $(GOCOVER_FILE) + @$(MAKE) -C cmd/sockaddr test + +cover:: coverage_report + +$(GOCOVER_FILE):: + @find . -type d ! -path '*cmd*' ! -path '*.git*' -print0 | xargs -0 -I % sh -ec "cd % && rm -f $(GOCOVER_TMPFILE) && go test -coverprofile=$(GOCOVER_TMPFILE)" + + @echo 'mode: set' > $(GOCOVER_FILE) + @find . -type f ! -path '*cmd*' ! -path '*.git*' -name "$(GOCOVER_TMPFILE)" -print0 | xargs -0 -n1 cat $(GOCOVER_TMPFILE) | grep -v '^mode: ' >> ${PWD}/$(GOCOVER_FILE) + +$(GOCOVERHTML): $(GOCOVER_FILE) + go tool cover -html=$(GOCOVER_FILE) -o $(GOCOVERHTML) + +coverage_report:: $(GOCOVER_FILE) + go tool cover -html=$(GOCOVER_FILE) + +audit_tools:: + @go get -u github.com/golang/lint/golint && echo "Installed golint:" + @go get -u github.com/fzipp/gocyclo && echo "Installed gocyclo:" + @go get -u github.com/remyoudompheng/go-misc/deadcode && echo "Installed deadcode:" + @go get -u github.com/client9/misspell/cmd/misspell && echo "Installed misspell:" + @go get -u github.com/gordonklaus/ineffassign && echo "Installed ineffassign:" + +audit:: + deadcode + go tool vet -all *.go + go tool vet -shadow=true *.go + golint *.go + ineffassign . + gocyclo -over 65 *.go + misspell *.go + +clean:: + rm -f $(GOCOVER_FILE) $(GOCOVERHTML) + +dev:: + @go build + @make -B -C cmd/sockaddr sockaddr + +install:: + @go install + @make -C cmd/sockaddr install + +doc:: + echo Visit: http://127.0.0.1:6060/pkg/github.com/hashicorp/go-sockaddr/ + godoc -http=:6060 -goroot $GOROOT + +world:: + @set -e; \ + for os in solaris darwin freebsd linux windows; do \ + for arch in amd64; do \ + printf "Building on %s-%s\n" "$${os}" "$${arch}" ; \ + env GOOS="$${os}" GOARCH="$${arch}" go build -o /dev/null; \ + done; \ + done + + make -C cmd/sockaddr world diff --git a/vendor/github.com/hashicorp/go-sockaddr/README.md b/vendor/github.com/hashicorp/go-sockaddr/README.md new file mode 100644 index 000000000..5273ee899 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/README.md @@ -0,0 +1,118 @@ +# go-sockaddr + +## `sockaddr` Library + +Socket address convenience functions for Go. `go-sockaddr` is a convenience +library that makes doing the right thing with IP addresses easy. `go-sockaddr` +is loosely modeled after the UNIX `sockaddr_t` and creates a union of the family +of `sockaddr_t` types (see below for an ascii diagram). Library documentation +is available +at +[https://godoc.org/github.com/hashicorp/go-sockaddr](https://godoc.org/github.com/hashicorp/go-sockaddr). +The primary intent of the library was to make it possible to define heuristics +for selecting the correct IP addresses when a configuration is evaluated at +runtime. See +the +[docs](https://godoc.org/github.com/hashicorp/go-sockaddr), +[`template` package](https://godoc.org/github.com/hashicorp/go-sockaddr/template), +tests, +and +[CLI utility](https://github.com/hashicorp/go-sockaddr/tree/master/cmd/sockaddr) +for details and hints as to how to use this library. + +For example, with this library it is possible to find an IP address that: + +* is attached to a default route + ([`GetDefaultInterfaces()`](https://godoc.org/github.com/hashicorp/go-sockaddr#GetDefaultInterfaces)) +* is contained within a CIDR block (['IfByNetwork()'](https://godoc.org/github.com/hashicorp/go-sockaddr#IfByNetwork)) +* is an RFC1918 address + ([`IfByRFC("1918")`](https://godoc.org/github.com/hashicorp/go-sockaddr#IfByRFC)) +* is ordered + ([`OrderedIfAddrBy(args)`](https://godoc.org/github.com/hashicorp/go-sockaddr#OrderedIfAddrBy) where + `args` includes, but is not limited + to, + [`AscIfType`](https://godoc.org/github.com/hashicorp/go-sockaddr#AscIfType), + [`AscNetworkSize`](https://godoc.org/github.com/hashicorp/go-sockaddr#AscNetworkSize)) +* excludes all IPv6 addresses + ([`IfByType("^(IPv4)$")`](https://godoc.org/github.com/hashicorp/go-sockaddr#IfByType)) +* is larger than a `/32` + ([`IfByMaskSize(32)`](https://godoc.org/github.com/hashicorp/go-sockaddr#IfByMaskSize)) +* is not on a `down` interface + ([`ExcludeIfs("flags", "down")`](https://godoc.org/github.com/hashicorp/go-sockaddr#ExcludeIfs)) +* preferences an IPv6 address over an IPv4 address + ([`SortIfByType()`](https://godoc.org/github.com/hashicorp/go-sockaddr#SortIfByType) + + [`ReverseIfAddrs()`](https://godoc.org/github.com/hashicorp/go-sockaddr#ReverseIfAddrs)); and +* excludes any IP in RFC6890 address + ([`IfByRFC("6890")`](https://godoc.org/github.com/hashicorp/go-sockaddr#IfByRFC)) + +Or any combination or variation therein. + +There are also a few simple helper functions such as `GetPublicIP` and +`GetPrivateIP` which both return strings and select the first public or private +IP address on the default interface, respectively. Similarly, there is also a +helper function called `GetInterfaceIP` which returns the first usable IP +address on the named interface. + +## `sockaddr` CLI + +Given the possible complexity of the `sockaddr` library, there is a CLI utility +that accompanies the library, also +called +[`sockaddr`](https://github.com/hashicorp/go-sockaddr/tree/master/cmd/sockaddr). +The +[`sockaddr`](https://github.com/hashicorp/go-sockaddr/tree/master/cmd/sockaddr) +utility exposes nearly all of the functionality of the library and can be used +either as an administrative tool or testing tool. To install +the +[`sockaddr`](https://github.com/hashicorp/go-sockaddr/tree/master/cmd/sockaddr), +run: + +```text +$ go get -u github.com/hashicorp/go-sockaddr/cmd/sockaddr +``` + +If you're familiar with UNIX's `sockaddr` struct's, the following diagram +mapping the C `sockaddr` (top) to `go-sockaddr` structs (bottom) and +interfaces will be helpful: + +``` ++-------------------------------------------------------+ +| | +| sockaddr | +| SockAddr | +| | +| +--------------+ +----------------------------------+ | +| | sockaddr_un | | | | +| | SockAddrUnix | | sockaddr_in{,6} | | +| +--------------+ | IPAddr | | +| | | | +| | +-------------+ +--------------+ | | +| | | sockaddr_in | | sockaddr_in6 | | | +| | | IPv4Addr | | IPv6Addr | | | +| | +-------------+ +--------------+ | | +| | | | +| +----------------------------------+ | +| | ++-------------------------------------------------------+ +``` + +## Inspiration and Design + +There were many subtle inspirations that led to this design, but the most direct +inspiration for the filtering syntax was +OpenBSD's +[`pf.conf(5)`](https://www.freebsd.org/cgi/man.cgi?query=pf.conf&apropos=0&sektion=0&arch=default&format=html#PARAMETERS) firewall +syntax that lets you select the first IP address on a given named interface. +The original problem stemmed from: + +* needing to create immutable images using [Packer](https://www.packer.io) that + ran the [Consul](https://www.consul.io) process (Consul can only use one IP + address at a time); +* images that may or may not have multiple interfaces or IP addresses at + runtime; and +* we didn't want to rely on configuration management to render out the correct + IP address if the VM image was being used in an auto-scaling group. + +Instead we needed some way to codify a heuristic that would correctly select the +right IP address but the input parameters were not known when the image was +created. diff --git a/vendor/github.com/hashicorp/go-sockaddr/doc.go b/vendor/github.com/hashicorp/go-sockaddr/doc.go new file mode 100644 index 000000000..90671deb5 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/doc.go @@ -0,0 +1,5 @@ +/* +Package sockaddr is a Go implementation of the UNIX socket family data types and +related helper functions. +*/ +package sockaddr diff --git a/vendor/github.com/hashicorp/go-sockaddr/ifaddr.go b/vendor/github.com/hashicorp/go-sockaddr/ifaddr.go new file mode 100644 index 000000000..3e4ff9fca --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/ifaddr.go @@ -0,0 +1,126 @@ +package sockaddr + +// ifAddrAttrMap is a map of the IfAddr type-specific attributes. +var ifAddrAttrMap map[AttrName]func(IfAddr) string +var ifAddrAttrs []AttrName + +func init() { + ifAddrAttrInit() +} + +// GetPrivateIP returns a string with a single IP address that is part of RFC +// 6890 and has a default route. If the system can't determine its IP address +// or find an RFC 6890 IP address, an empty string will be returned instead. +// This function is the `eval` equivalent of: +// +// ``` +// $ sockaddr eval -r '{{GetPrivateInterfaces | attr "address"}}' +/// ``` +func GetPrivateIP() (string, error) { + privateIfs, err := GetPrivateInterfaces() + if err != nil { + return "", err + } + if len(privateIfs) < 1 { + return "", nil + } + + ifAddr := privateIfs[0] + ip := *ToIPAddr(ifAddr.SockAddr) + return ip.NetIP().String(), nil +} + +// GetPublicIP returns a string with a single IP address that is NOT part of RFC +// 6890 and has a default route. If the system can't determine its IP address +// or find a non RFC 6890 IP address, an empty string will be returned instead. +// This function is the `eval` equivalent of: +// +// ``` +// $ sockaddr eval -r '{{GetPublicInterfaces | attr "address"}}' +/// ``` +func GetPublicIP() (string, error) { + publicIfs, err := GetPublicInterfaces() + if err != nil { + return "", err + } else if len(publicIfs) < 1 { + return "", nil + } + + ifAddr := publicIfs[0] + ip := *ToIPAddr(ifAddr.SockAddr) + return ip.NetIP().String(), nil +} + +// GetInterfaceIP returns a string with a single IP address sorted by the size +// of the network (i.e. IP addresses with a smaller netmask, larger network +// size, are sorted first). This function is the `eval` equivalent of: +// +// ``` +// $ sockaddr eval -r '{{GetAllInterfaces | include "name" <> | sort "type,size" | include "flag" "forwardable" | attr "address" }}' +/// ``` +func GetInterfaceIP(namedIfRE string) (string, error) { + ifAddrs, err := GetAllInterfaces() + if err != nil { + return "", err + } + + ifAddrs, _, err = IfByName(namedIfRE, ifAddrs) + if err != nil { + return "", err + } + + ifAddrs, _, err = IfByFlag("forwardable", ifAddrs) + if err != nil { + return "", err + } + + ifAddrs, err = SortIfBy("+type,+size", ifAddrs) + if err != nil { + return "", err + } + + if len(ifAddrs) == 0 { + return "", err + } + + ip := ToIPAddr(ifAddrs[0].SockAddr) + if ip == nil { + return "", err + } + + return IPAddrAttr(*ip, "address"), nil +} + +// IfAddrAttrs returns a list of attributes supported by the IfAddr type +func IfAddrAttrs() []AttrName { + return ifAddrAttrs +} + +// IfAddrAttr returns a string representation of an attribute for the given +// IfAddr. +func IfAddrAttr(ifAddr IfAddr, attrName AttrName) string { + fn, found := ifAddrAttrMap[attrName] + if !found { + return "" + } + + return fn(ifAddr) +} + +// ifAddrAttrInit is called once at init() +func ifAddrAttrInit() { + // Sorted for human readability + ifAddrAttrs = []AttrName{ + "flags", + "name", + } + + ifAddrAttrMap = map[AttrName]func(ifAddr IfAddr) string{ + "flags": func(ifAddr IfAddr) string { + return ifAddr.Interface.Flags.String() + }, + "name": func(ifAddr IfAddr) string { + return ifAddr.Interface.Name + }, + } +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/ifaddrs.go b/vendor/github.com/hashicorp/go-sockaddr/ifaddrs.go new file mode 100644 index 000000000..8233be202 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/ifaddrs.go @@ -0,0 +1,969 @@ +package sockaddr + +import ( + "errors" + "fmt" + "net" + "regexp" + "sort" + "strconv" + "strings" +) + +// IfAddrs is a slice of IfAddr +type IfAddrs []IfAddr + +func (ifs IfAddrs) Len() int { return len(ifs) } + +// CmpIfFunc is the function signature that must be met to be used in the +// OrderedIfAddrBy multiIfAddrSorter +type CmpIfAddrFunc func(p1, p2 *IfAddr) int + +// multiIfAddrSorter implements the Sort interface, sorting the IfAddrs within. +type multiIfAddrSorter struct { + ifAddrs IfAddrs + cmp []CmpIfAddrFunc +} + +// Sort sorts the argument slice according to the Cmp functions passed to +// OrderedIfAddrBy. +func (ms *multiIfAddrSorter) Sort(ifAddrs IfAddrs) { + ms.ifAddrs = ifAddrs + sort.Sort(ms) +} + +// OrderedIfAddrBy sorts SockAddr by the list of sort function pointers. +func OrderedIfAddrBy(cmpFuncs ...CmpIfAddrFunc) *multiIfAddrSorter { + return &multiIfAddrSorter{ + cmp: cmpFuncs, + } +} + +// Len is part of sort.Interface. +func (ms *multiIfAddrSorter) Len() int { + return len(ms.ifAddrs) +} + +// Less is part of sort.Interface. It is implemented by looping along the Cmp() +// functions until it finds a comparison that is either less than or greater +// than. A return value of 0 defers sorting to the next function in the +// multisorter (which means the results of sorting may leave the resutls in a +// non-deterministic order). +func (ms *multiIfAddrSorter) Less(i, j int) bool { + p, q := &ms.ifAddrs[i], &ms.ifAddrs[j] + // Try all but the last comparison. + var k int + for k = 0; k < len(ms.cmp)-1; k++ { + cmp := ms.cmp[k] + x := cmp(p, q) + switch x { + case -1: + // p < q, so we have a decision. + return true + case 1: + // p > q, so we have a decision. + return false + } + // p == q; try the next comparison. + } + // All comparisons to here said "equal", so just return whatever the + // final comparison reports. + switch ms.cmp[k](p, q) { + case -1: + return true + case 1: + return false + default: + // Still a tie! Now what? + return false + panic("undefined sort order for remaining items in the list") + } +} + +// Swap is part of sort.Interface. +func (ms *multiIfAddrSorter) Swap(i, j int) { + ms.ifAddrs[i], ms.ifAddrs[j] = ms.ifAddrs[j], ms.ifAddrs[i] +} + +// AscIfAddress is a sorting function to sort IfAddrs by their respective +// address type. Non-equal types are deferred in the sort. +func AscIfAddress(p1Ptr, p2Ptr *IfAddr) int { + return AscAddress(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// AscIfName is a sorting function to sort IfAddrs by their interface names. +func AscIfName(p1Ptr, p2Ptr *IfAddr) int { + return strings.Compare(p1Ptr.Name, p2Ptr.Name) +} + +// AscIfNetworkSize is a sorting function to sort IfAddrs by their respective +// network mask size. +func AscIfNetworkSize(p1Ptr, p2Ptr *IfAddr) int { + return AscNetworkSize(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// AscIfPort is a sorting function to sort IfAddrs by their respective +// port type. Non-equal types are deferred in the sort. +func AscIfPort(p1Ptr, p2Ptr *IfAddr) int { + return AscPort(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// AscIfPrivate is a sorting function to sort IfAddrs by "private" values before +// "public" values. Both IPv4 and IPv6 are compared against RFC6890 (RFC6890 +// includes, and is not limited to, RFC1918 and RFC6598 for IPv4, and IPv6 +// includes RFC4193). +func AscIfPrivate(p1Ptr, p2Ptr *IfAddr) int { + return AscPrivate(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// AscIfType is a sorting function to sort IfAddrs by their respective address +// type. Non-equal types are deferred in the sort. +func AscIfType(p1Ptr, p2Ptr *IfAddr) int { + return AscType(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// DescIfAddress is identical to AscIfAddress but reverse ordered. +func DescIfAddress(p1Ptr, p2Ptr *IfAddr) int { + return -1 * AscAddress(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// DescIfName is identical to AscIfName but reverse ordered. +func DescIfName(p1Ptr, p2Ptr *IfAddr) int { + return -1 * strings.Compare(p1Ptr.Name, p2Ptr.Name) +} + +// DescIfNetworkSize is identical to AscIfNetworkSize but reverse ordered. +func DescIfNetworkSize(p1Ptr, p2Ptr *IfAddr) int { + return -1 * AscNetworkSize(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// DescIfPort is identical to AscIfPort but reverse ordered. +func DescIfPort(p1Ptr, p2Ptr *IfAddr) int { + return -1 * AscPort(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// DescIfPrivate is identical to AscIfPrivate but reverse ordered. +func DescIfPrivate(p1Ptr, p2Ptr *IfAddr) int { + return -1 * AscPrivate(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// DescIfType is identical to AscIfType but reverse ordered. +func DescIfType(p1Ptr, p2Ptr *IfAddr) int { + return -1 * AscType(&p1Ptr.SockAddr, &p2Ptr.SockAddr) +} + +// FilterIfByType filters IfAddrs and returns a list of the matching type +func FilterIfByType(ifAddrs IfAddrs, type_ SockAddrType) (matchedIfs, excludedIfs IfAddrs) { + excludedIfs = make(IfAddrs, 0, len(ifAddrs)) + matchedIfs = make(IfAddrs, 0, len(ifAddrs)) + + for _, ifAddr := range ifAddrs { + if ifAddr.SockAddr.Type()&type_ != 0 { + matchedIfs = append(matchedIfs, ifAddr) + } else { + excludedIfs = append(excludedIfs, ifAddr) + } + } + return matchedIfs, excludedIfs +} + +// IfAttr forwards the selector to IfAttr.Attr() for resolution. If there is +// more than one IfAddr, only the first IfAddr is used. +func IfAttr(selectorName string, ifAddrs IfAddrs) (string, error) { + if len(ifAddrs) == 0 { + return "", nil + } + + attrName := AttrName(strings.ToLower(selectorName)) + attrVal, err := ifAddrs[0].Attr(attrName) + return attrVal, err +} + +// GetAllInterfaces iterates over all available network interfaces and finds all +// available IP addresses on each interface and converts them to +// sockaddr.IPAddrs, and returning the result as an array of IfAddr. +func GetAllInterfaces() (IfAddrs, error) { + ifs, err := net.Interfaces() + if err != nil { + return nil, err + } + + ifAddrs := make(IfAddrs, 0, len(ifs)) + for _, intf := range ifs { + addrs, err := intf.Addrs() + if err != nil { + return nil, err + } + + for _, addr := range addrs { + var ipAddr IPAddr + ipAddr, err = NewIPAddr(addr.String()) + if err != nil { + return IfAddrs{}, fmt.Errorf("unable to create an IP address from %q", addr.String()) + } + + ifAddr := IfAddr{ + SockAddr: ipAddr, + Interface: intf, + } + ifAddrs = append(ifAddrs, ifAddr) + } + } + + return ifAddrs, nil +} + +// GetDefaultInterfaces returns IfAddrs of the addresses attached to the default +// route. +func GetDefaultInterfaces() (IfAddrs, error) { + ri, err := NewRouteInfo() + if err != nil { + return nil, err + } + + defaultIfName, err := ri.GetDefaultInterfaceName() + if err != nil { + return nil, err + } + + var defaultIfs, ifAddrs IfAddrs + ifAddrs, err = GetAllInterfaces() + for _, ifAddr := range ifAddrs { + if ifAddr.Name == defaultIfName { + defaultIfs = append(defaultIfs, ifAddr) + } + } + + return defaultIfs, nil +} + +// GetPrivateInterfaces returns an IfAddrs that are part of RFC 6890 and have a +// default route. If the system can't determine its IP address or find an RFC +// 6890 IP address, an empty IfAddrs will be returned instead. This function is +// the `eval` equivalent of: +// +// ``` +// $ sockaddr eval -r '{{GetDefaultInterfaces | include "type" "ip" | include "flags" "forwardable|up" | sort "type,size" | include "RFC" "6890" }}' +/// ``` +func GetPrivateInterfaces() (IfAddrs, error) { + privateIfs, err := GetDefaultInterfaces() + if err != nil { + return IfAddrs{}, err + } + if len(privateIfs) == 0 { + return IfAddrs{}, nil + } + + privateIfs, _ = FilterIfByType(privateIfs, TypeIP) + if len(privateIfs) == 0 { + return IfAddrs{}, nil + } + + privateIfs, _, err = IfByFlag("forwardable|up", privateIfs) + if err != nil { + return IfAddrs{}, err + } + if len(privateIfs) == 0 { + return IfAddrs{}, nil + } + + OrderedIfAddrBy(AscIfType, AscIfNetworkSize).Sort(privateIfs) + + privateIfs, _, err = IfByRFC("6890", privateIfs) + if err != nil { + return IfAddrs{}, err + } else if len(privateIfs) == 0 { + return IfAddrs{}, nil + } + + return privateIfs, nil +} + +// GetPublicInterfaces returns an IfAddrs that are NOT part of RFC 6890 and has a +// default route. If the system can't determine its IP address or find a non +// RFC 6890 IP address, an empty IfAddrs will be returned instead. This +// function is the `eval` equivalent of: +// +// ``` +// $ sockaddr eval -r '{{GetDefaultInterfaces | include "type" "ip" | include "flags" "forwardable|up" | sort "type,size" | exclude "RFC" "6890" }}' +/// ``` +func GetPublicInterfaces() (IfAddrs, error) { + publicIfs, err := GetDefaultInterfaces() + if err != nil { + return IfAddrs{}, err + } + if len(publicIfs) == 0 { + return IfAddrs{}, nil + } + + publicIfs, _ = FilterIfByType(publicIfs, TypeIP) + if len(publicIfs) == 0 { + return IfAddrs{}, nil + } + + publicIfs, _, err = IfByFlag("forwardable|up", publicIfs) + if err != nil { + return IfAddrs{}, err + } + if len(publicIfs) == 0 { + return IfAddrs{}, nil + } + + OrderedIfAddrBy(AscIfType, AscIfNetworkSize).Sort(publicIfs) + + _, publicIfs, err = IfByRFC("6890", publicIfs) + if err != nil { + return IfAddrs{}, err + } else if len(publicIfs) == 0 { + return IfAddrs{}, nil + } + + return publicIfs, nil +} + +// IfByAddress returns a list of matched and non-matched IfAddrs, or an error if +// the regexp fails to compile. +func IfByAddress(inputRe string, ifAddrs IfAddrs) (matched, remainder IfAddrs, err error) { + re, err := regexp.Compile(inputRe) + if err != nil { + return nil, nil, fmt.Errorf("Unable to compile address regexp %+q: %v", inputRe, err) + } + + matchedAddrs := make(IfAddrs, 0, len(ifAddrs)) + excludedAddrs := make(IfAddrs, 0, len(ifAddrs)) + for _, addr := range ifAddrs { + if re.MatchString(addr.SockAddr.String()) { + matchedAddrs = append(matchedAddrs, addr) + } else { + excludedAddrs = append(excludedAddrs, addr) + } + } + + return matchedAddrs, excludedAddrs, nil +} + +// IfByName returns a list of matched and non-matched IfAddrs, or an error if +// the regexp fails to compile. +func IfByName(inputRe string, ifAddrs IfAddrs) (matched, remainder IfAddrs, err error) { + re, err := regexp.Compile(inputRe) + if err != nil { + return nil, nil, fmt.Errorf("Unable to compile name regexp %+q: %v", inputRe, err) + } + + matchedAddrs := make(IfAddrs, 0, len(ifAddrs)) + excludedAddrs := make(IfAddrs, 0, len(ifAddrs)) + for _, addr := range ifAddrs { + if re.MatchString(addr.Name) { + matchedAddrs = append(matchedAddrs, addr) + } else { + excludedAddrs = append(excludedAddrs, addr) + } + } + + return matchedAddrs, excludedAddrs, nil +} + +// IfByPort returns a list of matched and non-matched IfAddrs, or an error if +// the regexp fails to compile. +func IfByPort(inputRe string, ifAddrs IfAddrs) (matchedIfs, excludedIfs IfAddrs, err error) { + re, err := regexp.Compile(inputRe) + if err != nil { + return nil, nil, fmt.Errorf("Unable to compile port regexp %+q: %v", inputRe, err) + } + + ipIfs, nonIfs := FilterIfByType(ifAddrs, TypeIP) + matchedIfs = make(IfAddrs, 0, len(ipIfs)) + excludedIfs = append(IfAddrs(nil), nonIfs...) + for _, addr := range ipIfs { + ipAddr := ToIPAddr(addr.SockAddr) + if ipAddr == nil { + continue + } + + port := strconv.FormatInt(int64((*ipAddr).IPPort()), 10) + if re.MatchString(port) { + matchedIfs = append(matchedIfs, addr) + } else { + excludedIfs = append(excludedIfs, addr) + } + } + + return matchedIfs, excludedIfs, nil +} + +// IfByRFC returns a list of matched and non-matched IfAddrs that contain the +// relevant RFC-specified traits. +func IfByRFC(selectorParam string, ifAddrs IfAddrs) (matched, remainder IfAddrs, err error) { + inputRFC, err := strconv.ParseUint(selectorParam, 10, 64) + if err != nil { + return IfAddrs{}, IfAddrs{}, fmt.Errorf("unable to parse RFC number %q: %v", selectorParam, err) + } + + matchedIfAddrs := make(IfAddrs, 0, len(ifAddrs)) + remainingIfAddrs := make(IfAddrs, 0, len(ifAddrs)) + + rfcNetMap := KnownRFCs() + rfcNets, ok := rfcNetMap[uint(inputRFC)] + if !ok { + return nil, nil, fmt.Errorf("unsupported RFC %d", inputRFC) + } + + for _, ifAddr := range ifAddrs { + var contained bool + for _, rfcNet := range rfcNets { + if rfcNet.Contains(ifAddr.SockAddr) { + matchedIfAddrs = append(matchedIfAddrs, ifAddr) + contained = true + break + } + } + if !contained { + remainingIfAddrs = append(remainingIfAddrs, ifAddr) + } + } + + return matchedIfAddrs, remainingIfAddrs, nil +} + +// IfByRFCs returns a list of matched and non-matched IfAddrs that contain the +// relevant RFC-specified traits. Multiple RFCs can be specified and separated +// by the `|` symbol. No protection is taken to ensure an IfAddr does not end +// up in both the included and excluded list. +func IfByRFCs(selectorParam string, ifAddrs IfAddrs) (matched, remainder IfAddrs, err error) { + var includedIfs, excludedIfs IfAddrs + for _, rfcStr := range strings.Split(selectorParam, "|") { + includedRFCIfs, excludedRFCIfs, err := IfByRFC(rfcStr, ifAddrs) + if err != nil { + return IfAddrs{}, IfAddrs{}, fmt.Errorf("unable to lookup RFC number %q: %v", rfcStr, err) + } + includedIfs = append(includedIfs, includedRFCIfs...) + excludedIfs = append(excludedIfs, excludedRFCIfs...) + } + + return includedIfs, excludedIfs, nil +} + +// IfByMaskSize returns a list of matched and non-matched IfAddrs that have the +// matching mask size. +func IfByMaskSize(selectorParam string, ifAddrs IfAddrs) (matchedIfs, excludedIfs IfAddrs, err error) { + maskSize, err := strconv.ParseUint(selectorParam, 10, 64) + if err != nil { + return IfAddrs{}, IfAddrs{}, fmt.Errorf("invalid exclude size argument (%q): %v", selectorParam, err) + } + + ipIfs, nonIfs := FilterIfByType(ifAddrs, TypeIP) + matchedIfs = make(IfAddrs, 0, len(ipIfs)) + excludedIfs = append(IfAddrs(nil), nonIfs...) + for _, addr := range ipIfs { + ipAddr := ToIPAddr(addr.SockAddr) + if ipAddr == nil { + return IfAddrs{}, IfAddrs{}, fmt.Errorf("unable to filter mask sizes on non-IP type %s: %v", addr.SockAddr.Type().String(), addr.SockAddr.String()) + } + + switch { + case (*ipAddr).Type()&TypeIPv4 != 0 && maskSize > 32: + return IfAddrs{}, IfAddrs{}, fmt.Errorf("mask size out of bounds for IPv4 address: %d", maskSize) + case (*ipAddr).Type()&TypeIPv6 != 0 && maskSize > 128: + return IfAddrs{}, IfAddrs{}, fmt.Errorf("mask size out of bounds for IPv6 address: %d", maskSize) + } + + if (*ipAddr).Maskbits() == int(maskSize) { + matchedIfs = append(matchedIfs, addr) + } else { + excludedIfs = append(excludedIfs, addr) + } + } + + return matchedIfs, excludedIfs, nil +} + +// IfByType returns a list of matching and non-matching IfAddr that match the +// specified type. For instance: +// +// include "type" "IPv4,IPv6" +// +// will include any IfAddrs that is either an IPv4 or IPv6 address. Any +// addresses on those interfaces that don't match will be included in the +// remainder results. +func IfByType(inputTypes string, ifAddrs IfAddrs) (matched, remainder IfAddrs, err error) { + matchingIfAddrs := make(IfAddrs, 0, len(ifAddrs)) + remainingIfAddrs := make(IfAddrs, 0, len(ifAddrs)) + + ifTypes := strings.Split(strings.ToLower(inputTypes), "|") + for _, ifType := range ifTypes { + switch ifType { + case "ip", "ipv4", "ipv6", "unix": + // Valid types + default: + return nil, nil, fmt.Errorf("unsupported type %q %q", ifType, inputTypes) + } + } + + for _, ifAddr := range ifAddrs { + for _, ifType := range ifTypes { + var matched bool + switch { + case ifType == "ip" && ifAddr.SockAddr.Type()&TypeIP != 0: + matched = true + case ifType == "ipv4" && ifAddr.SockAddr.Type()&TypeIPv4 != 0: + matched = true + case ifType == "ipv6" && ifAddr.SockAddr.Type()&TypeIPv6 != 0: + matched = true + case ifType == "unix" && ifAddr.SockAddr.Type()&TypeUnix != 0: + matched = true + } + + if matched { + matchingIfAddrs = append(matchingIfAddrs, ifAddr) + } else { + remainingIfAddrs = append(remainingIfAddrs, ifAddr) + } + } + } + + return matchingIfAddrs, remainingIfAddrs, nil +} + +// IfByFlag returns a list of matching and non-matching IfAddrs that match the +// specified type. For instance: +// +// include "flag" "up,broadcast" +// +// will include any IfAddrs that have both the "up" and "broadcast" flags set. +// Any addresses on those interfaces that don't match will be omitted from the +// results. +func IfByFlag(inputFlags string, ifAddrs IfAddrs) (matched, remainder IfAddrs, err error) { + matchedAddrs := make(IfAddrs, 0, len(ifAddrs)) + excludedAddrs := make(IfAddrs, 0, len(ifAddrs)) + + var wantForwardable, + wantGlobalUnicast, + wantInterfaceLocalMulticast, + wantLinkLocalMulticast, + wantLinkLocalUnicast, + wantLoopback, + wantMulticast, + wantUnspecified bool + var ifFlags net.Flags + var checkFlags, checkAttrs bool + for _, flagName := range strings.Split(strings.ToLower(inputFlags), "|") { + switch flagName { + case "broadcast": + checkFlags = true + ifFlags = ifFlags | net.FlagBroadcast + case "down": + checkFlags = true + ifFlags = (ifFlags &^ net.FlagUp) + case "forwardable": + checkAttrs = true + wantForwardable = true + case "global unicast": + checkAttrs = true + wantGlobalUnicast = true + case "interface-local multicast": + checkAttrs = true + wantInterfaceLocalMulticast = true + case "link-local multicast": + checkAttrs = true + wantLinkLocalMulticast = true + case "link-local unicast": + checkAttrs = true + wantLinkLocalUnicast = true + case "loopback": + checkAttrs = true + checkFlags = true + ifFlags = ifFlags | net.FlagLoopback + wantLoopback = true + case "multicast": + checkAttrs = true + checkFlags = true + ifFlags = ifFlags | net.FlagMulticast + wantMulticast = true + case "point-to-point": + checkFlags = true + ifFlags = ifFlags | net.FlagPointToPoint + case "unspecified": + checkAttrs = true + wantUnspecified = true + case "up": + checkFlags = true + ifFlags = ifFlags | net.FlagUp + default: + return nil, nil, fmt.Errorf("Unknown interface flag: %+q", flagName) + } + } + + for _, ifAddr := range ifAddrs { + var matched bool + if checkFlags && ifAddr.Interface.Flags&ifFlags == ifFlags { + matched = true + } + if checkAttrs { + if ip := ToIPAddr(ifAddr.SockAddr); ip != nil { + netIP := (*ip).NetIP() + switch { + case wantGlobalUnicast && netIP.IsGlobalUnicast(): + matched = true + case wantInterfaceLocalMulticast && netIP.IsInterfaceLocalMulticast(): + matched = true + case wantLinkLocalMulticast && netIP.IsLinkLocalMulticast(): + matched = true + case wantLinkLocalUnicast && netIP.IsLinkLocalUnicast(): + matched = true + case wantLoopback && netIP.IsLoopback(): + matched = true + case wantMulticast && netIP.IsMulticast(): + matched = true + case wantUnspecified && netIP.IsUnspecified(): + matched = true + case wantForwardable && !IsRFC(ForwardingBlacklist, ifAddr.SockAddr): + matched = true + } + } + } + if matched { + matchedAddrs = append(matchedAddrs, ifAddr) + } else { + excludedAddrs = append(excludedAddrs, ifAddr) + } + } + return matchedAddrs, excludedAddrs, nil +} + +// IfByNetwork returns an IfAddrs that are equal to or included within the +// network passed in by selector. +func IfByNetwork(selectorParam string, inputIfAddrs IfAddrs) (IfAddrs, IfAddrs, error) { + var includedIfs, excludedIfs IfAddrs + for _, netStr := range strings.Split(selectorParam, "|") { + netAddr, err := NewIPAddr(netStr) + if err != nil { + return nil, nil, fmt.Errorf("unable to create an IP address from %+q: %v", netStr, err) + } + + for _, ifAddr := range inputIfAddrs { + if netAddr.Contains(ifAddr.SockAddr) { + includedIfs = append(includedIfs, ifAddr) + } else { + excludedIfs = append(excludedIfs, ifAddr) + } + } + } + + return includedIfs, excludedIfs, nil +} + +// IncludeIfs returns an IfAddrs based on the passed in selector. +func IncludeIfs(selectorName, selectorParam string, inputIfAddrs IfAddrs) (IfAddrs, error) { + var includedIfs IfAddrs + var err error + + switch strings.ToLower(selectorName) { + case "address": + includedIfs, _, err = IfByAddress(selectorParam, inputIfAddrs) + case "flag", "flags": + includedIfs, _, err = IfByFlag(selectorParam, inputIfAddrs) + case "name": + includedIfs, _, err = IfByName(selectorParam, inputIfAddrs) + case "network": + includedIfs, _, err = IfByNetwork(selectorParam, inputIfAddrs) + case "port": + includedIfs, _, err = IfByPort(selectorParam, inputIfAddrs) + case "rfc", "rfcs": + includedIfs, _, err = IfByRFCs(selectorParam, inputIfAddrs) + case "size": + includedIfs, _, err = IfByMaskSize(selectorParam, inputIfAddrs) + case "type": + includedIfs, _, err = IfByType(selectorParam, inputIfAddrs) + default: + return IfAddrs{}, fmt.Errorf("invalid include selector %q", selectorName) + } + + if err != nil { + return IfAddrs{}, err + } + + return includedIfs, nil +} + +// ExcludeIfs returns an IfAddrs based on the passed in selector. +func ExcludeIfs(selectorName, selectorParam string, inputIfAddrs IfAddrs) (IfAddrs, error) { + var excludedIfs IfAddrs + var err error + + switch strings.ToLower(selectorName) { + case "address": + _, excludedIfs, err = IfByAddress(selectorParam, inputIfAddrs) + case "flag", "flags": + _, excludedIfs, err = IfByFlag(selectorParam, inputIfAddrs) + case "name": + _, excludedIfs, err = IfByName(selectorParam, inputIfAddrs) + case "network": + _, excludedIfs, err = IfByNetwork(selectorParam, inputIfAddrs) + case "port": + _, excludedIfs, err = IfByPort(selectorParam, inputIfAddrs) + case "rfc", "rfcs": + _, excludedIfs, err = IfByRFCs(selectorParam, inputIfAddrs) + case "size": + _, excludedIfs, err = IfByMaskSize(selectorParam, inputIfAddrs) + case "type": + _, excludedIfs, err = IfByType(selectorParam, inputIfAddrs) + default: + return IfAddrs{}, fmt.Errorf("invalid exclude selector %q", selectorName) + } + + if err != nil { + return IfAddrs{}, err + } + + return excludedIfs, nil +} + +// SortIfBy returns an IfAddrs sorted based on the passed in selector. Multiple +// sort clauses can be passed in as a comma delimited list without whitespace. +func SortIfBy(selectorParam string, inputIfAddrs IfAddrs) (IfAddrs, error) { + sortedIfs := append(IfAddrs(nil), inputIfAddrs...) + + clauses := strings.Split(selectorParam, ",") + sortFuncs := make([]CmpIfAddrFunc, len(clauses)) + + for i, clause := range clauses { + switch strings.TrimSpace(strings.ToLower(clause)) { + case "+address", "address": + // The "address" selector returns an array of IfAddrs + // ordered by the network address. IfAddrs that are not + // comparable will be at the end of the list and in a + // non-deterministic order. + sortFuncs[i] = AscIfAddress + case "-address": + sortFuncs[i] = DescIfAddress + case "+name", "name": + // The "name" selector returns an array of IfAddrs + // ordered by the interface name. + sortFuncs[i] = AscIfName + case "-name": + sortFuncs[i] = DescIfName + case "+port", "port": + // The "port" selector returns an array of IfAddrs + // ordered by the port, if included in the IfAddr. + // IfAddrs that are not comparable will be at the end of + // the list and in a non-deterministic order. + sortFuncs[i] = AscIfPort + case "-port": + sortFuncs[i] = DescIfPort + case "+private", "private": + // The "private" selector returns an array of IfAddrs + // ordered by private addresses first. IfAddrs that are + // not comparable will be at the end of the list and in + // a non-deterministic order. + sortFuncs[i] = AscIfPrivate + case "-private": + sortFuncs[i] = DescIfPrivate + case "+size", "size": + // The "size" selector returns an array of IfAddrs + // ordered by the size of the network mask, smaller mask + // (larger number of hosts per network) to largest + // (e.g. a /24 sorts before a /32). + sortFuncs[i] = AscIfNetworkSize + case "-size": + sortFuncs[i] = DescIfNetworkSize + case "+type", "type": + // The "type" selector returns an array of IfAddrs + // ordered by the type of the IfAddr. The sort order is + // Unix, IPv4, then IPv6. + sortFuncs[i] = AscIfType + case "-type": + sortFuncs[i] = DescIfType + default: + // Return an empty list for invalid sort types. + return IfAddrs{}, fmt.Errorf("unknown sort type: %q", clause) + } + } + + OrderedIfAddrBy(sortFuncs...).Sort(sortedIfs) + + return sortedIfs, nil +} + +// UniqueIfAddrsBy creates a unique set of IfAddrs based on the matching +// selector. UniqueIfAddrsBy assumes the input has already been sorted. +func UniqueIfAddrsBy(selectorName string, inputIfAddrs IfAddrs) (IfAddrs, error) { + attrName := strings.ToLower(selectorName) + + ifs := make(IfAddrs, 0, len(inputIfAddrs)) + var lastMatch string + for _, ifAddr := range inputIfAddrs { + var out string + switch attrName { + case "address": + out = ifAddr.SockAddr.String() + case "name": + out = ifAddr.Name + default: + return nil, fmt.Errorf("unsupported unique constraint %+q", selectorName) + } + + switch { + case lastMatch == "", lastMatch != out: + lastMatch = out + ifs = append(ifs, ifAddr) + case lastMatch == out: + continue + } + } + + return ifs, nil +} + +// JoinIfAddrs joins an IfAddrs and returns a string +func JoinIfAddrs(selectorName string, joinStr string, inputIfAddrs IfAddrs) (string, error) { + outputs := make([]string, 0, len(inputIfAddrs)) + attrName := AttrName(strings.ToLower(selectorName)) + + for _, ifAddr := range inputIfAddrs { + var attrVal string + var err error + attrVal, err = ifAddr.Attr(attrName) + if err != nil { + return "", err + } + outputs = append(outputs, attrVal) + } + return strings.Join(outputs, joinStr), nil +} + +// LimitIfAddrs returns a slice of IfAddrs based on the specified limit. +func LimitIfAddrs(lim uint, in IfAddrs) (IfAddrs, error) { + // Clamp the limit to the length of the array + if int(lim) > len(in) { + lim = uint(len(in)) + } + + return in[0:lim], nil +} + +// OffsetIfAddrs returns a slice of IfAddrs based on the specified offset. +func OffsetIfAddrs(off int, in IfAddrs) (IfAddrs, error) { + var end bool + if off < 0 { + end = true + off = off * -1 + } + + if off > len(in) { + return IfAddrs{}, fmt.Errorf("unable to seek past the end of the interface array: offset (%d) exceeds the number of interfaces (%d)", off, len(in)) + } + + if end { + return in[len(in)-off:], nil + } + return in[off:], nil +} + +func (ifAddr IfAddr) String() string { + return fmt.Sprintf("%s %v", ifAddr.SockAddr, ifAddr.Interface) +} + +// parseDefaultIfNameFromRoute parses standard route(8)'s output for the *BSDs +// and Solaris. +func parseDefaultIfNameFromRoute(routeOut string) (string, error) { + lines := strings.Split(routeOut, "\n") + for _, line := range lines { + kvs := strings.SplitN(line, ":", 2) + if len(kvs) != 2 { + continue + } + + if strings.TrimSpace(kvs[0]) == "interface" { + ifName := strings.TrimSpace(kvs[1]) + return ifName, nil + } + } + + return "", errors.New("No default interface found") +} + +// parseDefaultIfNameFromIPCmd parses the default interface from ip(8) for +// Linux. +func parseDefaultIfNameFromIPCmd(routeOut string) (string, error) { + lines := strings.Split(routeOut, "\n") + re := regexp.MustCompile(`[\s]+`) + for _, line := range lines { + kvs := re.Split(line, -1) + if len(kvs) < 5 { + continue + } + + if kvs[0] == "default" && + kvs[1] == "via" && + kvs[3] == "dev" { + ifName := strings.TrimSpace(kvs[4]) + return ifName, nil + } + } + + return "", errors.New("No default interface found") +} + +// parseDefaultIfNameWindows parses the default interface from `netstat -rn` and +// `ipconfig` on Windows. +func parseDefaultIfNameWindows(routeOut, ipconfigOut string) (string, error) { + defaultIPAddr, err := parseDefaultIPAddrWindowsRoute(routeOut) + if err != nil { + return "", err + } + + ifName, err := parseDefaultIfNameWindowsIPConfig(defaultIPAddr, ipconfigOut) + if err != nil { + return "", err + } + + return ifName, nil +} + +// parseDefaultIPAddrWindowsRoute parses the IP address on the default interface +// `netstat -rn`. +// +// NOTES(sean): Only IPv4 addresses are parsed at this time. If you have an +// IPv6 connected host, submit an issue on github.com/hashicorp/go-sockaddr with +// the output from `netstat -rn`, `ipconfig`, and version of Windows to see IPv6 +// support added. +func parseDefaultIPAddrWindowsRoute(routeOut string) (string, error) { + lines := strings.Split(routeOut, "\n") + re := regexp.MustCompile(`[\s]+`) + for _, line := range lines { + kvs := re.Split(strings.TrimSpace(line), -1) + if len(kvs) < 3 { + continue + } + + if kvs[0] == "0.0.0.0" && kvs[1] == "0.0.0.0" { + defaultIPAddr := strings.TrimSpace(kvs[3]) + return defaultIPAddr, nil + } + } + + return "", errors.New("No IP on default interface found") +} + +// parseDefaultIfNameWindowsIPConfig parses the output of `ipconfig` to find the +// interface name forwarding traffic to the default gateway. +func parseDefaultIfNameWindowsIPConfig(defaultIPAddr, routeOut string) (string, error) { + lines := strings.Split(routeOut, "\n") + ifNameRE := regexp.MustCompile(`^Ethernet adapter ([^\s:]+):`) + ipAddrRE := regexp.MustCompile(`^ IPv[46] Address\. \. \. \. \. \. \. \. \. \. \. : ([^\s]+)`) + var ifName string + for _, line := range lines { + switch ifNameMatches := ifNameRE.FindStringSubmatch(line); { + case len(ifNameMatches) > 1: + ifName = ifNameMatches[1] + continue + } + + switch ipAddrMatches := ipAddrRE.FindStringSubmatch(line); { + case len(ipAddrMatches) > 1 && ipAddrMatches[1] == defaultIPAddr: + return ifName, nil + } + } + + return "", errors.New("No default interface found with matching IP") +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/ifattr.go b/vendor/github.com/hashicorp/go-sockaddr/ifattr.go new file mode 100644 index 000000000..6984cb4a3 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/ifattr.go @@ -0,0 +1,65 @@ +package sockaddr + +import ( + "fmt" + "net" +) + +// IfAddr is a union of a SockAddr and a net.Interface. +type IfAddr struct { + SockAddr + net.Interface +} + +// Attr returns the named attribute as a string +func (ifAddr IfAddr) Attr(attrName AttrName) (string, error) { + val := IfAddrAttr(ifAddr, attrName) + if val != "" { + return val, nil + } + + return Attr(ifAddr.SockAddr, attrName) +} + +// Attr returns the named attribute as a string +func Attr(sa SockAddr, attrName AttrName) (string, error) { + switch sockType := sa.Type(); { + case sockType&TypeIP != 0: + ip := *ToIPAddr(sa) + attrVal := IPAddrAttr(ip, attrName) + if attrVal != "" { + return attrVal, nil + } + + if sockType == TypeIPv4 { + ipv4 := *ToIPv4Addr(sa) + attrVal := IPv4AddrAttr(ipv4, attrName) + if attrVal != "" { + return attrVal, nil + } + } else if sockType == TypeIPv6 { + ipv6 := *ToIPv6Addr(sa) + attrVal := IPv6AddrAttr(ipv6, attrName) + if attrVal != "" { + return attrVal, nil + } + } + + case sockType == TypeUnix: + us := *ToUnixSock(sa) + attrVal := UnixSockAttr(us, attrName) + if attrVal != "" { + return attrVal, nil + } + } + + // Non type-specific attributes + switch attrName { + case "string": + return sa.String(), nil + case "type": + return sa.Type().String(), nil + } + + return "", fmt.Errorf("unsupported attribute name %q", attrName) +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/ipaddr.go b/vendor/github.com/hashicorp/go-sockaddr/ipaddr.go new file mode 100644 index 000000000..b47d15c20 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/ipaddr.go @@ -0,0 +1,169 @@ +package sockaddr + +import ( + "fmt" + "math/big" + "net" + "strings" +) + +// Constants for the sizes of IPv3, IPv4, and IPv6 address types. +const ( + IPv3len = 6 + IPv4len = 4 + IPv6len = 16 +) + +// IPAddr is a generic IP address interface for IPv4 and IPv6 addresses, +// networks, and socket endpoints. +type IPAddr interface { + SockAddr + AddressBinString() string + AddressHexString() string + Cmp(SockAddr) int + CmpAddress(SockAddr) int + CmpPort(SockAddr) int + FirstUsable() IPAddr + Host() IPAddr + IPPort() IPPort + LastUsable() IPAddr + Maskbits() int + NetIP() *net.IP + NetIPMask() *net.IPMask + NetIPNet() *net.IPNet + Network() IPAddr + Octets() []int +} + +// IPPort is the type for an IP port number for the TCP and UDP IP transports. +type IPPort uint16 + +// IPPrefixLen is a typed integer representing the prefix length for a given +// IPAddr. +type IPPrefixLen byte + +// ipAddrAttrMap is a map of the IPAddr type-specific attributes. +var ipAddrAttrMap map[AttrName]func(IPAddr) string +var ipAddrAttrs []AttrName + +func init() { + ipAddrInit() +} + +// NewIPAddr creates a new IPAddr from a string. Returns nil if the string is +// not an IPv4 or an IPv6 address. +func NewIPAddr(addr string) (IPAddr, error) { + ipv4Addr, err := NewIPv4Addr(addr) + if err == nil { + return ipv4Addr, nil + } + + ipv6Addr, err := NewIPv6Addr(addr) + if err == nil { + return ipv6Addr, nil + } + + return nil, fmt.Errorf("invalid IPAddr %v", addr) +} + +// IPAddrAttr returns a string representation of an attribute for the given +// IPAddr. +func IPAddrAttr(ip IPAddr, selector AttrName) string { + fn, found := ipAddrAttrMap[selector] + if !found { + return "" + } + + return fn(ip) +} + +// IPAttrs returns a list of attributes supported by the IPAddr type +func IPAttrs() []AttrName { + return ipAddrAttrs +} + +// MustIPAddr is a helper method that must return an IPAddr or panic on invalid +// input. +func MustIPAddr(addr string) IPAddr { + ip, err := NewIPAddr(addr) + if err != nil { + panic(fmt.Sprintf("Unable to create an IPAddr from %+q: %v", addr, err)) + } + return ip +} + +// ipAddrInit is called once at init() +func ipAddrInit() { + // Sorted for human readability + ipAddrAttrs = []AttrName{ + "host", + "address", + "port", + "netmask", + "network", + "mask_bits", + "binary", + "hex", + "first_usable", + "last_usable", + "octets", + } + + ipAddrAttrMap = map[AttrName]func(ip IPAddr) string{ + "address": func(ip IPAddr) string { + return ip.NetIP().String() + }, + "binary": func(ip IPAddr) string { + return ip.AddressBinString() + }, + "first_usable": func(ip IPAddr) string { + return ip.FirstUsable().String() + }, + "hex": func(ip IPAddr) string { + return ip.AddressHexString() + }, + "host": func(ip IPAddr) string { + return ip.Host().String() + }, + "last_usable": func(ip IPAddr) string { + return ip.LastUsable().String() + }, + "mask_bits": func(ip IPAddr) string { + return fmt.Sprintf("%d", ip.Maskbits()) + }, + "netmask": func(ip IPAddr) string { + switch v := ip.(type) { + case IPv4Addr: + ipv4Mask := IPv4Addr{ + Address: IPv4Address(v.Mask), + Mask: IPv4HostMask, + } + return ipv4Mask.String() + case IPv6Addr: + ipv6Mask := new(big.Int) + ipv6Mask.Set(v.Mask) + ipv6MaskAddr := IPv6Addr{ + Address: IPv6Address(ipv6Mask), + Mask: ipv6HostMask, + } + return ipv6MaskAddr.String() + default: + return fmt.Sprintf("", ip) + } + }, + "network": func(ip IPAddr) string { + return ip.Network().NetIP().String() + }, + "octets": func(ip IPAddr) string { + octets := ip.Octets() + octetStrs := make([]string, 0, len(octets)) + for _, octet := range octets { + octetStrs = append(octetStrs, fmt.Sprintf("%d", octet)) + } + return strings.Join(octetStrs, " ") + }, + "port": func(ip IPAddr) string { + return fmt.Sprintf("%d", ip.IPPort()) + }, + } +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/ipaddrs.go b/vendor/github.com/hashicorp/go-sockaddr/ipaddrs.go new file mode 100644 index 000000000..6eeb7ddd2 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/ipaddrs.go @@ -0,0 +1,98 @@ +package sockaddr + +import "bytes" + +type IPAddrs []IPAddr + +func (s IPAddrs) Len() int { return len(s) } +func (s IPAddrs) Swap(i, j int) { s[i], s[j] = s[j], s[i] } + +// // SortIPAddrsByCmp is a type that satisfies sort.Interface and can be used +// // by the routines in this package. The SortIPAddrsByCmp type is used to +// // sort IPAddrs by Cmp() +// type SortIPAddrsByCmp struct{ IPAddrs } + +// // Less reports whether the element with index i should sort before the +// // element with index j. +// func (s SortIPAddrsByCmp) Less(i, j int) bool { +// // Sort by Type, then address, then port number. +// return Less(s.IPAddrs[i], s.IPAddrs[j]) +// } + +// SortIPAddrsBySpecificMaskLen is a type that satisfies sort.Interface and +// can be used by the routines in this package. The +// SortIPAddrsBySpecificMaskLen type is used to sort IPAddrs by smallest +// network (most specific to largest network). +type SortIPAddrsByNetworkSize struct{ IPAddrs } + +// Less reports whether the element with index i should sort before the +// element with index j. +func (s SortIPAddrsByNetworkSize) Less(i, j int) bool { + // Sort masks with a larger binary value (i.e. fewer hosts per network + // prefix) after masks with a smaller value (larger number of hosts per + // prefix). + switch bytes.Compare([]byte(*s.IPAddrs[i].NetIPMask()), []byte(*s.IPAddrs[j].NetIPMask())) { + case 0: + // Fall through to the second test if the net.IPMasks are the + // same. + break + case 1: + return true + case -1: + return false + default: + panic("bad, m'kay?") + } + + // Sort IPs based on the length (i.e. prefer IPv4 over IPv6). + iLen := len(*s.IPAddrs[i].NetIP()) + jLen := len(*s.IPAddrs[j].NetIP()) + if iLen != jLen { + return iLen > jLen + } + + // Sort IPs based on their network address from lowest to highest. + switch bytes.Compare(s.IPAddrs[i].NetIPNet().IP, s.IPAddrs[j].NetIPNet().IP) { + case 0: + break + case 1: + return false + case -1: + return true + default: + panic("lol wut?") + } + + // If a host does not have a port set, it always sorts after hosts + // that have a port (e.g. a host with a /32 and port number is more + // specific and should sort first over a host with a /32 but no port + // set). + if s.IPAddrs[i].IPPort() == 0 || s.IPAddrs[j].IPPort() == 0 { + return false + } + return s.IPAddrs[i].IPPort() < s.IPAddrs[j].IPPort() +} + +// SortIPAddrsBySpecificMaskLen is a type that satisfies sort.Interface and +// can be used by the routines in this package. The +// SortIPAddrsBySpecificMaskLen type is used to sort IPAddrs by smallest +// network (most specific to largest network). +type SortIPAddrsBySpecificMaskLen struct{ IPAddrs } + +// Less reports whether the element with index i should sort before the +// element with index j. +func (s SortIPAddrsBySpecificMaskLen) Less(i, j int) bool { + return s.IPAddrs[i].Maskbits() > s.IPAddrs[j].Maskbits() +} + +// SortIPAddrsByBroadMaskLen is a type that satisfies sort.Interface and can +// be used by the routines in this package. The SortIPAddrsByBroadMaskLen +// type is used to sort IPAddrs by largest network (i.e. largest subnets +// first). +type SortIPAddrsByBroadMaskLen struct{ IPAddrs } + +// Less reports whether the element with index i should sort before the +// element with index j. +func (s SortIPAddrsByBroadMaskLen) Less(i, j int) bool { + return s.IPAddrs[i].Maskbits() < s.IPAddrs[j].Maskbits() +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/ipv4addr.go b/vendor/github.com/hashicorp/go-sockaddr/ipv4addr.go new file mode 100644 index 000000000..9f2616a69 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/ipv4addr.go @@ -0,0 +1,515 @@ +package sockaddr + +import ( + "encoding/binary" + "fmt" + "net" + "regexp" + "strconv" + "strings" +) + +type ( + // IPv4Address is a named type representing an IPv4 address. + IPv4Address uint32 + + // IPv4Network is a named type representing an IPv4 network. + IPv4Network uint32 + + // IPv4Mask is a named type representing an IPv4 network mask. + IPv4Mask uint32 +) + +// IPv4HostMask is a constant represents a /32 IPv4 Address +// (i.e. 255.255.255.255). +const IPv4HostMask = IPv4Mask(0xffffffff) + +// ipv4AddrAttrMap is a map of the IPv4Addr type-specific attributes. +var ipv4AddrAttrMap map[AttrName]func(IPv4Addr) string +var ipv4AddrAttrs []AttrName +var trailingHexNetmaskRE *regexp.Regexp + +// IPv4Addr implements a convenience wrapper around the union of Go's +// built-in net.IP and net.IPNet types. In UNIX-speak, IPv4Addr implements +// `sockaddr` when the the address family is set to AF_INET +// (i.e. `sockaddr_in`). +type IPv4Addr struct { + IPAddr + Address IPv4Address + Mask IPv4Mask + Port IPPort +} + +func init() { + ipv4AddrInit() + trailingHexNetmaskRE = regexp.MustCompile(`/([0f]{8})$`) +} + +// NewIPv4Addr creates an IPv4Addr from a string. String can be in the form +// of either an IPv4:port (e.g. `1.2.3.4:80`, in which case the mask is +// assumed to be a `/32`), an IPv4 address (e.g. `1.2.3.4`, also with a `/32` +// mask), or an IPv4 CIDR (e.g. `1.2.3.4/24`, which has its IP port +// initialized to zero). ipv4Str can not be a hostname. +// +// NOTE: Many net.*() routines will initialize and return an IPv6 address. +// To create uint32 values from net.IP, always test to make sure the address +// returned can be converted to a 4 byte array using To4(). +func NewIPv4Addr(ipv4Str string) (IPv4Addr, error) { + // Strip off any bogus hex-encoded netmasks that will be mis-parsed by Go. In + // particular, clients with the Barracuda VPN client will see something like: + // `192.168.3.51/00ffffff` as their IP address. + if match := trailingHexNetmaskRE.FindStringIndex(ipv4Str); match != nil { + ipv4Str = ipv4Str[:match[0]] + } + + // Parse as an IPv4 CIDR + ipAddr, network, err := net.ParseCIDR(ipv4Str) + if err == nil { + ipv4 := ipAddr.To4() + if ipv4 == nil { + return IPv4Addr{}, fmt.Errorf("Unable to convert %s to an IPv4 address", ipv4Str) + } + + // If we see an IPv6 netmask, convert it to an IPv4 mask. + netmaskSepPos := strings.LastIndexByte(ipv4Str, '/') + if netmaskSepPos != -1 && netmaskSepPos+1 < len(ipv4Str) { + netMask, err := strconv.ParseUint(ipv4Str[netmaskSepPos+1:], 10, 8) + if err != nil { + return IPv4Addr{}, fmt.Errorf("Unable to convert %s to an IPv4 address: unable to parse CIDR netmask: %v", ipv4Str, err) + } else if netMask > 128 { + return IPv4Addr{}, fmt.Errorf("Unable to convert %s to an IPv4 address: invalid CIDR netmask", ipv4Str) + } + + if netMask >= 96 { + // Convert the IPv6 netmask to an IPv4 netmask + network.Mask = net.CIDRMask(int(netMask-96), IPv4len*8) + } + } + ipv4Addr := IPv4Addr{ + Address: IPv4Address(binary.BigEndian.Uint32(ipv4)), + Mask: IPv4Mask(binary.BigEndian.Uint32(network.Mask)), + } + return ipv4Addr, nil + } + + // Attempt to parse ipv4Str as a /32 host with a port number. + tcpAddr, err := net.ResolveTCPAddr("tcp4", ipv4Str) + if err == nil { + ipv4 := tcpAddr.IP.To4() + if ipv4 == nil { + return IPv4Addr{}, fmt.Errorf("Unable to resolve %+q as an IPv4 address", ipv4Str) + } + + ipv4Uint32 := binary.BigEndian.Uint32(ipv4) + ipv4Addr := IPv4Addr{ + Address: IPv4Address(ipv4Uint32), + Mask: IPv4HostMask, + Port: IPPort(tcpAddr.Port), + } + + return ipv4Addr, nil + } + + // Parse as a naked IPv4 address + ip := net.ParseIP(ipv4Str) + if ip != nil { + ipv4 := ip.To4() + if ipv4 == nil { + return IPv4Addr{}, fmt.Errorf("Unable to string convert %+q to an IPv4 address", ipv4Str) + } + + ipv4Uint32 := binary.BigEndian.Uint32(ipv4) + ipv4Addr := IPv4Addr{ + Address: IPv4Address(ipv4Uint32), + Mask: IPv4HostMask, + } + return ipv4Addr, nil + } + + return IPv4Addr{}, fmt.Errorf("Unable to parse %+q to an IPv4 address: %v", ipv4Str, err) +} + +// AddressBinString returns a string with the IPv4Addr's Address represented +// as a sequence of '0' and '1' characters. This method is useful for +// debugging or by operators who want to inspect an address. +func (ipv4 IPv4Addr) AddressBinString() string { + return fmt.Sprintf("%032s", strconv.FormatUint(uint64(ipv4.Address), 2)) +} + +// AddressHexString returns a string with the IPv4Addr address represented as +// a sequence of hex characters. This method is useful for debugging or by +// operators who want to inspect an address. +func (ipv4 IPv4Addr) AddressHexString() string { + return fmt.Sprintf("%08s", strconv.FormatUint(uint64(ipv4.Address), 16)) +} + +// Broadcast is an IPv4Addr-only method that returns the broadcast address of +// the network. +// +// NOTE: IPv6 only supports multicast, so this method only exists for +// IPv4Addr. +func (ipv4 IPv4Addr) Broadcast() IPAddr { + // Nothing should listen on a broadcast address. + return IPv4Addr{ + Address: IPv4Address(ipv4.BroadcastAddress()), + Mask: IPv4HostMask, + } +} + +// BroadcastAddress returns a IPv4Network of the IPv4Addr's broadcast +// address. +func (ipv4 IPv4Addr) BroadcastAddress() IPv4Network { + return IPv4Network(uint32(ipv4.Address)&uint32(ipv4.Mask) | ^uint32(ipv4.Mask)) +} + +// CmpAddress follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because its address is lower than arg +// - 0 if the SockAddr arg is equal to the receiving IPv4Addr or the argument is +// of a different type. +// - 1 If the argument should sort first. +func (ipv4 IPv4Addr) CmpAddress(sa SockAddr) int { + ipv4b, ok := sa.(IPv4Addr) + if !ok { + return sortDeferDecision + } + + switch { + case ipv4.Address == ipv4b.Address: + return sortDeferDecision + case ipv4.Address < ipv4b.Address: + return sortReceiverBeforeArg + default: + return sortArgBeforeReceiver + } +} + +// CmpPort follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because its port is lower than arg +// - 0 if the SockAddr arg's port number is equal to the receiving IPv4Addr, +// regardless of type. +// - 1 If the argument should sort first. +func (ipv4 IPv4Addr) CmpPort(sa SockAddr) int { + var saPort IPPort + switch v := sa.(type) { + case IPv4Addr: + saPort = v.Port + case IPv6Addr: + saPort = v.Port + default: + return sortDeferDecision + } + + switch { + case ipv4.Port == saPort: + return sortDeferDecision + case ipv4.Port < saPort: + return sortReceiverBeforeArg + default: + return sortArgBeforeReceiver + } +} + +// CmpRFC follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because it belongs to the RFC and its +// arg does not +// - 0 if the receiver and arg both belong to the same RFC or neither do. +// - 1 If the arg belongs to the RFC but receiver does not. +func (ipv4 IPv4Addr) CmpRFC(rfcNum uint, sa SockAddr) int { + recvInRFC := IsRFC(rfcNum, ipv4) + ipv4b, ok := sa.(IPv4Addr) + if !ok { + // If the receiver is part of the desired RFC and the SockAddr + // argument is not, return -1 so that the receiver sorts before + // the non-IPv4 SockAddr. Conversely, if the receiver is not + // part of the RFC, punt on sorting and leave it for the next + // sorter. + if recvInRFC { + return sortReceiverBeforeArg + } else { + return sortDeferDecision + } + } + + argInRFC := IsRFC(rfcNum, ipv4b) + switch { + case (recvInRFC && argInRFC), (!recvInRFC && !argInRFC): + // If a and b both belong to the RFC, or neither belong to + // rfcNum, defer sorting to the next sorter. + return sortDeferDecision + case recvInRFC && !argInRFC: + return sortReceiverBeforeArg + default: + return sortArgBeforeReceiver + } +} + +// Contains returns true if the SockAddr is contained within the receiver. +func (ipv4 IPv4Addr) Contains(sa SockAddr) bool { + ipv4b, ok := sa.(IPv4Addr) + if !ok { + return false + } + + return ipv4.ContainsNetwork(ipv4b) +} + +// ContainsAddress returns true if the IPv4Address is contained within the +// receiver. +func (ipv4 IPv4Addr) ContainsAddress(x IPv4Address) bool { + return IPv4Address(ipv4.NetworkAddress()) <= x && + IPv4Address(ipv4.BroadcastAddress()) >= x +} + +// ContainsNetwork returns true if the network from IPv4Addr is contained +// within the receiver. +func (ipv4 IPv4Addr) ContainsNetwork(x IPv4Addr) bool { + return ipv4.NetworkAddress() <= x.NetworkAddress() && + ipv4.BroadcastAddress() >= x.BroadcastAddress() +} + +// DialPacketArgs returns the arguments required to be passed to +// net.DialUDP(). If the Mask of ipv4 is not a /32 or the Port is 0, +// DialPacketArgs() will fail. See Host() to create an IPv4Addr with its +// mask set to /32. +func (ipv4 IPv4Addr) DialPacketArgs() (network, dialArgs string) { + if ipv4.Mask != IPv4HostMask || ipv4.Port == 0 { + return "udp4", "" + } + return "udp4", fmt.Sprintf("%s:%d", ipv4.NetIP().String(), ipv4.Port) +} + +// DialStreamArgs returns the arguments required to be passed to +// net.DialTCP(). If the Mask of ipv4 is not a /32 or the Port is 0, +// DialStreamArgs() will fail. See Host() to create an IPv4Addr with its +// mask set to /32. +func (ipv4 IPv4Addr) DialStreamArgs() (network, dialArgs string) { + if ipv4.Mask != IPv4HostMask || ipv4.Port == 0 { + return "tcp4", "" + } + return "tcp4", fmt.Sprintf("%s:%d", ipv4.NetIP().String(), ipv4.Port) +} + +// Equal returns true if a SockAddr is equal to the receiving IPv4Addr. +func (ipv4 IPv4Addr) Equal(sa SockAddr) bool { + ipv4b, ok := sa.(IPv4Addr) + if !ok { + return false + } + + if ipv4.Port != ipv4b.Port { + return false + } + + if ipv4.Address != ipv4b.Address { + return false + } + + if ipv4.NetIPNet().String() != ipv4b.NetIPNet().String() { + return false + } + + return true +} + +// FirstUsable returns an IPv4Addr set to the first address following the +// network prefix. The first usable address in a network is normally the +// gateway and should not be used except by devices forwarding packets +// between two administratively distinct networks (i.e. a router). This +// function does not discriminate against first usable vs "first address that +// should be used." For example, FirstUsable() on "192.168.1.10/24" would +// return the address "192.168.1.1/24". +func (ipv4 IPv4Addr) FirstUsable() IPAddr { + addr := ipv4.NetworkAddress() + + // If /32, return the address itself. If /31 assume a point-to-point + // link and return the lower address. + if ipv4.Maskbits() < 31 { + addr++ + } + + return IPv4Addr{ + Address: IPv4Address(addr), + Mask: IPv4HostMask, + } +} + +// Host returns a copy of ipv4 with its mask set to /32 so that it can be +// used by DialPacketArgs(), DialStreamArgs(), ListenPacketArgs(), or +// ListenStreamArgs(). +func (ipv4 IPv4Addr) Host() IPAddr { + // Nothing should listen on a broadcast address. + return IPv4Addr{ + Address: ipv4.Address, + Mask: IPv4HostMask, + Port: ipv4.Port, + } +} + +// IPPort returns the Port number attached to the IPv4Addr +func (ipv4 IPv4Addr) IPPort() IPPort { + return ipv4.Port +} + +// LastUsable returns the last address before the broadcast address in a +// given network. +func (ipv4 IPv4Addr) LastUsable() IPAddr { + addr := ipv4.BroadcastAddress() + + // If /32, return the address itself. If /31 assume a point-to-point + // link and return the upper address. + if ipv4.Maskbits() < 31 { + addr-- + } + + return IPv4Addr{ + Address: IPv4Address(addr), + Mask: IPv4HostMask, + } +} + +// ListenPacketArgs returns the arguments required to be passed to +// net.ListenUDP(). If the Mask of ipv4 is not a /32, ListenPacketArgs() +// will fail. See Host() to create an IPv4Addr with its mask set to /32. +func (ipv4 IPv4Addr) ListenPacketArgs() (network, listenArgs string) { + if ipv4.Mask != IPv4HostMask { + return "udp4", "" + } + return "udp4", fmt.Sprintf("%s:%d", ipv4.NetIP().String(), ipv4.Port) +} + +// ListenStreamArgs returns the arguments required to be passed to +// net.ListenTCP(). If the Mask of ipv4 is not a /32, ListenStreamArgs() +// will fail. See Host() to create an IPv4Addr with its mask set to /32. +func (ipv4 IPv4Addr) ListenStreamArgs() (network, listenArgs string) { + if ipv4.Mask != IPv4HostMask { + return "tcp4", "" + } + return "tcp4", fmt.Sprintf("%s:%d", ipv4.NetIP().String(), ipv4.Port) +} + +// Maskbits returns the number of network mask bits in a given IPv4Addr. For +// example, the Maskbits() of "192.168.1.1/24" would return 24. +func (ipv4 IPv4Addr) Maskbits() int { + mask := make(net.IPMask, IPv4len) + binary.BigEndian.PutUint32(mask, uint32(ipv4.Mask)) + maskOnes, _ := mask.Size() + return maskOnes +} + +// MustIPv4Addr is a helper method that must return an IPv4Addr or panic on +// invalid input. +func MustIPv4Addr(addr string) IPv4Addr { + ipv4, err := NewIPv4Addr(addr) + if err != nil { + panic(fmt.Sprintf("Unable to create an IPv4Addr from %+q: %v", addr, err)) + } + return ipv4 +} + +// NetIP returns the address as a net.IP (address is always presized to +// IPv4). +func (ipv4 IPv4Addr) NetIP() *net.IP { + x := make(net.IP, IPv4len) + binary.BigEndian.PutUint32(x, uint32(ipv4.Address)) + return &x +} + +// NetIPMask create a new net.IPMask from the IPv4Addr. +func (ipv4 IPv4Addr) NetIPMask() *net.IPMask { + ipv4Mask := net.IPMask{} + ipv4Mask = make(net.IPMask, IPv4len) + binary.BigEndian.PutUint32(ipv4Mask, uint32(ipv4.Mask)) + return &ipv4Mask +} + +// NetIPNet create a new net.IPNet from the IPv4Addr. +func (ipv4 IPv4Addr) NetIPNet() *net.IPNet { + ipv4net := &net.IPNet{} + ipv4net.IP = make(net.IP, IPv4len) + binary.BigEndian.PutUint32(ipv4net.IP, uint32(ipv4.NetworkAddress())) + ipv4net.Mask = *ipv4.NetIPMask() + return ipv4net +} + +// Network returns the network prefix or network address for a given network. +func (ipv4 IPv4Addr) Network() IPAddr { + return IPv4Addr{ + Address: IPv4Address(ipv4.NetworkAddress()), + Mask: ipv4.Mask, + } +} + +// NetworkAddress returns an IPv4Network of the IPv4Addr's network address. +func (ipv4 IPv4Addr) NetworkAddress() IPv4Network { + return IPv4Network(uint32(ipv4.Address) & uint32(ipv4.Mask)) +} + +// Octets returns a slice of the four octets in an IPv4Addr's Address. The +// order of the bytes is big endian. +func (ipv4 IPv4Addr) Octets() []int { + return []int{ + int(ipv4.Address >> 24), + int((ipv4.Address >> 16) & 0xff), + int((ipv4.Address >> 8) & 0xff), + int(ipv4.Address & 0xff), + } +} + +// String returns a string representation of the IPv4Addr +func (ipv4 IPv4Addr) String() string { + if ipv4.Port != 0 { + return fmt.Sprintf("%s:%d", ipv4.NetIP().String(), ipv4.Port) + } + + if ipv4.Maskbits() == 32 { + return ipv4.NetIP().String() + } + + return fmt.Sprintf("%s/%d", ipv4.NetIP().String(), ipv4.Maskbits()) +} + +// Type is used as a type switch and returns TypeIPv4 +func (IPv4Addr) Type() SockAddrType { + return TypeIPv4 +} + +// IPv4AddrAttr returns a string representation of an attribute for the given +// IPv4Addr. +func IPv4AddrAttr(ipv4 IPv4Addr, selector AttrName) string { + fn, found := ipv4AddrAttrMap[selector] + if !found { + return "" + } + + return fn(ipv4) +} + +// IPv4Attrs returns a list of attributes supported by the IPv4Addr type +func IPv4Attrs() []AttrName { + return ipv4AddrAttrs +} + +// ipv4AddrInit is called once at init() +func ipv4AddrInit() { + // Sorted for human readability + ipv4AddrAttrs = []AttrName{ + "size", // Same position as in IPv6 for output consistency + "broadcast", + "uint32", + } + + ipv4AddrAttrMap = map[AttrName]func(ipv4 IPv4Addr) string{ + "broadcast": func(ipv4 IPv4Addr) string { + return ipv4.Broadcast().String() + }, + "size": func(ipv4 IPv4Addr) string { + return fmt.Sprintf("%d", 1< 2 && ipv6Str[0] == '[' && ipv6Str[len(ipv6Str)-1] == ']' { + ipv6Str = ipv6Str[1 : len(ipv6Str)-1] + } + ip := net.ParseIP(ipv6Str) + if ip != nil { + ipv6 := ip.To16() + if ipv6 == nil { + return IPv6Addr{}, fmt.Errorf("Unable to string convert %+q to a 16byte IPv6 address", ipv6Str) + } + + ipv6BigIntAddr := new(big.Int) + ipv6BigIntAddr.SetBytes(ipv6) + + ipv6BigIntMask := new(big.Int) + ipv6BigIntMask.Set(ipv6HostMask) + + return IPv6Addr{ + Address: IPv6Address(ipv6BigIntAddr), + Mask: IPv6Mask(ipv6BigIntMask), + }, nil + } + + // Parse as an IPv6 CIDR + ipAddr, network, err := net.ParseCIDR(ipv6Str) + if err == nil { + ipv6 := ipAddr.To16() + if ipv6 == nil { + return IPv6Addr{}, fmt.Errorf("Unable to convert %+q to a 16byte IPv6 address", ipv6Str) + } + + ipv6BigIntAddr := new(big.Int) + ipv6BigIntAddr.SetBytes(ipv6) + + ipv6BigIntMask := new(big.Int) + ipv6BigIntMask.SetBytes(network.Mask) + + ipv6Addr := IPv6Addr{ + Address: IPv6Address(ipv6BigIntAddr), + Mask: IPv6Mask(ipv6BigIntMask), + } + return ipv6Addr, nil + } + + return IPv6Addr{}, fmt.Errorf("Unable to parse %+q to an IPv6 address: %v", ipv6Str, err) +} + +// AddressBinString returns a string with the IPv6Addr's Address represented +// as a sequence of '0' and '1' characters. This method is useful for +// debugging or by operators who want to inspect an address. +func (ipv6 IPv6Addr) AddressBinString() string { + bi := big.Int(*ipv6.Address) + return fmt.Sprintf("%0128s", bi.Text(2)) +} + +// AddressHexString returns a string with the IPv6Addr address represented as +// a sequence of hex characters. This method is useful for debugging or by +// operators who want to inspect an address. +func (ipv6 IPv6Addr) AddressHexString() string { + bi := big.Int(*ipv6.Address) + return fmt.Sprintf("%032s", bi.Text(16)) +} + +// CmpAddress follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because its address is lower than arg +// - 0 if the SockAddr arg equal to the receiving IPv6Addr or the argument is of a +// different type. +// - 1 If the argument should sort first. +func (ipv6 IPv6Addr) CmpAddress(sa SockAddr) int { + ipv6b, ok := sa.(IPv6Addr) + if !ok { + return sortDeferDecision + } + + ipv6aBigInt := new(big.Int) + ipv6aBigInt.Set(ipv6.Address) + ipv6bBigInt := new(big.Int) + ipv6bBigInt.Set(ipv6b.Address) + + return ipv6aBigInt.Cmp(ipv6bBigInt) +} + +// CmpPort follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because its port is lower than arg +// - 0 if the SockAddr arg's port number is equal to the receiving IPv6Addr, +// regardless of type. +// - 1 If the argument should sort first. +func (ipv6 IPv6Addr) CmpPort(sa SockAddr) int { + var saPort IPPort + switch v := sa.(type) { + case IPv4Addr: + saPort = v.Port + case IPv6Addr: + saPort = v.Port + default: + return sortDeferDecision + } + + switch { + case ipv6.Port == saPort: + return sortDeferDecision + case ipv6.Port < saPort: + return sortReceiverBeforeArg + default: + return sortArgBeforeReceiver + } +} + +// CmpRFC follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because it belongs to the RFC and its +// arg does not +// - 0 if the receiver and arg both belong to the same RFC or neither do. +// - 1 If the arg belongs to the RFC but receiver does not. +func (ipv6 IPv6Addr) CmpRFC(rfcNum uint, sa SockAddr) int { + recvInRFC := IsRFC(rfcNum, ipv6) + ipv6b, ok := sa.(IPv6Addr) + if !ok { + // If the receiver is part of the desired RFC and the SockAddr + // argument is not, sort receiver before the non-IPv6 SockAddr. + // Conversely, if the receiver is not part of the RFC, punt on + // sorting and leave it for the next sorter. + if recvInRFC { + return sortReceiverBeforeArg + } else { + return sortDeferDecision + } + } + + argInRFC := IsRFC(rfcNum, ipv6b) + switch { + case (recvInRFC && argInRFC), (!recvInRFC && !argInRFC): + // If a and b both belong to the RFC, or neither belong to + // rfcNum, defer sorting to the next sorter. + return sortDeferDecision + case recvInRFC && !argInRFC: + return sortReceiverBeforeArg + default: + return sortArgBeforeReceiver + } +} + +// Contains returns true if the SockAddr is contained within the receiver. +func (ipv6 IPv6Addr) Contains(sa SockAddr) bool { + ipv6b, ok := sa.(IPv6Addr) + if !ok { + return false + } + + return ipv6.ContainsNetwork(ipv6b) +} + +// ContainsAddress returns true if the IPv6Address is contained within the +// receiver. +func (ipv6 IPv6Addr) ContainsAddress(x IPv6Address) bool { + xAddr := IPv6Addr{ + Address: x, + Mask: ipv6HostMask, + } + + { + xIPv6 := xAddr.FirstUsable().(IPv6Addr) + yIPv6 := ipv6.FirstUsable().(IPv6Addr) + if xIPv6.CmpAddress(yIPv6) >= 1 { + return false + } + } + + { + xIPv6 := xAddr.LastUsable().(IPv6Addr) + yIPv6 := ipv6.LastUsable().(IPv6Addr) + if xIPv6.CmpAddress(yIPv6) <= -1 { + return false + } + } + return true +} + +// ContainsNetwork returns true if the network from IPv6Addr is contained within +// the receiver. +func (x IPv6Addr) ContainsNetwork(y IPv6Addr) bool { + { + xIPv6 := x.FirstUsable().(IPv6Addr) + yIPv6 := y.FirstUsable().(IPv6Addr) + if ret := xIPv6.CmpAddress(yIPv6); ret >= 1 { + return false + } + } + + { + xIPv6 := x.LastUsable().(IPv6Addr) + yIPv6 := y.LastUsable().(IPv6Addr) + if ret := xIPv6.CmpAddress(yIPv6); ret <= -1 { + return false + } + } + return true +} + +// DialPacketArgs returns the arguments required to be passed to +// net.DialUDP(). If the Mask of ipv6 is not a /128 or the Port is 0, +// DialPacketArgs() will fail. See Host() to create an IPv6Addr with its +// mask set to /128. +func (ipv6 IPv6Addr) DialPacketArgs() (network, dialArgs string) { + ipv6Mask := big.Int(*ipv6.Mask) + if ipv6Mask.Cmp(ipv6HostMask) != 0 || ipv6.Port == 0 { + return "udp6", "" + } + return "udp6", fmt.Sprintf("[%s]:%d", ipv6.NetIP().String(), ipv6.Port) +} + +// DialStreamArgs returns the arguments required to be passed to +// net.DialTCP(). If the Mask of ipv6 is not a /128 or the Port is 0, +// DialStreamArgs() will fail. See Host() to create an IPv6Addr with its +// mask set to /128. +func (ipv6 IPv6Addr) DialStreamArgs() (network, dialArgs string) { + ipv6Mask := big.Int(*ipv6.Mask) + if ipv6Mask.Cmp(ipv6HostMask) != 0 || ipv6.Port == 0 { + return "tcp6", "" + } + return "tcp6", fmt.Sprintf("[%s]:%d", ipv6.NetIP().String(), ipv6.Port) +} + +// Equal returns true if a SockAddr is equal to the receiving IPv4Addr. +func (ipv6a IPv6Addr) Equal(sa SockAddr) bool { + ipv6b, ok := sa.(IPv6Addr) + if !ok { + return false + } + + if ipv6a.NetIP().String() != ipv6b.NetIP().String() { + return false + } + + if ipv6a.NetIPNet().String() != ipv6b.NetIPNet().String() { + return false + } + + if ipv6a.Port != ipv6b.Port { + return false + } + + return true +} + +// FirstUsable returns an IPv6Addr set to the first address following the +// network prefix. The first usable address in a network is normally the +// gateway and should not be used except by devices forwarding packets +// between two administratively distinct networks (i.e. a router). This +// function does not discriminate against first usable vs "first address that +// should be used." For example, FirstUsable() on "2001:0db8::0003/64" would +// return "2001:0db8::00011". +func (ipv6 IPv6Addr) FirstUsable() IPAddr { + return IPv6Addr{ + Address: IPv6Address(ipv6.NetworkAddress()), + Mask: ipv6HostMask, + } +} + +// Host returns a copy of ipv6 with its mask set to /128 so that it can be +// used by DialPacketArgs(), DialStreamArgs(), ListenPacketArgs(), or +// ListenStreamArgs(). +func (ipv6 IPv6Addr) Host() IPAddr { + // Nothing should listen on a broadcast address. + return IPv6Addr{ + Address: ipv6.Address, + Mask: ipv6HostMask, + Port: ipv6.Port, + } +} + +// IPPort returns the Port number attached to the IPv6Addr +func (ipv6 IPv6Addr) IPPort() IPPort { + return ipv6.Port +} + +// LastUsable returns the last address in a given network. +func (ipv6 IPv6Addr) LastUsable() IPAddr { + addr := new(big.Int) + addr.Set(ipv6.Address) + + mask := new(big.Int) + mask.Set(ipv6.Mask) + + negMask := new(big.Int) + negMask.Xor(ipv6HostMask, mask) + + lastAddr := new(big.Int) + lastAddr.And(addr, mask) + lastAddr.Or(lastAddr, negMask) + + return IPv6Addr{ + Address: IPv6Address(lastAddr), + Mask: ipv6HostMask, + } +} + +// ListenPacketArgs returns the arguments required to be passed to +// net.ListenUDP(). If the Mask of ipv6 is not a /128, ListenPacketArgs() +// will fail. See Host() to create an IPv6Addr with its mask set to /128. +func (ipv6 IPv6Addr) ListenPacketArgs() (network, listenArgs string) { + ipv6Mask := big.Int(*ipv6.Mask) + if ipv6Mask.Cmp(ipv6HostMask) != 0 { + return "udp6", "" + } + return "udp6", fmt.Sprintf("[%s]:%d", ipv6.NetIP().String(), ipv6.Port) +} + +// ListenStreamArgs returns the arguments required to be passed to +// net.ListenTCP(). If the Mask of ipv6 is not a /128, ListenStreamArgs() +// will fail. See Host() to create an IPv6Addr with its mask set to /128. +func (ipv6 IPv6Addr) ListenStreamArgs() (network, listenArgs string) { + ipv6Mask := big.Int(*ipv6.Mask) + if ipv6Mask.Cmp(ipv6HostMask) != 0 { + return "tcp6", "" + } + return "tcp6", fmt.Sprintf("[%s]:%d", ipv6.NetIP().String(), ipv6.Port) +} + +// Maskbits returns the number of network mask bits in a given IPv6Addr. For +// example, the Maskbits() of "2001:0db8::0003/64" would return 64. +func (ipv6 IPv6Addr) Maskbits() int { + maskOnes, _ := ipv6.NetIPNet().Mask.Size() + + return maskOnes +} + +// MustIPv6Addr is a helper method that must return an IPv6Addr or panic on +// invalid input. +func MustIPv6Addr(addr string) IPv6Addr { + ipv6, err := NewIPv6Addr(addr) + if err != nil { + panic(fmt.Sprintf("Unable to create an IPv6Addr from %+q: %v", addr, err)) + } + return ipv6 +} + +// NetIP returns the address as a net.IP. +func (ipv6 IPv6Addr) NetIP() *net.IP { + return bigIntToNetIPv6(ipv6.Address) +} + +// NetIPMask create a new net.IPMask from the IPv6Addr. +func (ipv6 IPv6Addr) NetIPMask() *net.IPMask { + ipv6Mask := make(net.IPMask, IPv6len) + m := big.Int(*ipv6.Mask) + copy(ipv6Mask, m.Bytes()) + return &ipv6Mask +} + +// Network returns a pointer to the net.IPNet within IPv4Addr receiver. +func (ipv6 IPv6Addr) NetIPNet() *net.IPNet { + ipv6net := &net.IPNet{} + ipv6net.IP = make(net.IP, IPv6len) + copy(ipv6net.IP, *ipv6.NetIP()) + ipv6net.Mask = *ipv6.NetIPMask() + return ipv6net +} + +// Network returns the network prefix or network address for a given network. +func (ipv6 IPv6Addr) Network() IPAddr { + return IPv6Addr{ + Address: IPv6Address(ipv6.NetworkAddress()), + Mask: ipv6.Mask, + } +} + +// NetworkAddress returns an IPv6Network of the IPv6Addr's network address. +func (ipv6 IPv6Addr) NetworkAddress() IPv6Network { + addr := new(big.Int) + addr.SetBytes((*ipv6.Address).Bytes()) + + mask := new(big.Int) + mask.SetBytes(*ipv6.NetIPMask()) + + netAddr := new(big.Int) + netAddr.And(addr, mask) + + return IPv6Network(netAddr) +} + +// Octets returns a slice of the 16 octets in an IPv6Addr's Address. The +// order of the bytes is big endian. +func (ipv6 IPv6Addr) Octets() []int { + x := make([]int, IPv6len) + for i, b := range *bigIntToNetIPv6(ipv6.Address) { + x[i] = int(b) + } + + return x +} + +// String returns a string representation of the IPv6Addr +func (ipv6 IPv6Addr) String() string { + if ipv6.Port != 0 { + return fmt.Sprintf("[%s]:%d", ipv6.NetIP().String(), ipv6.Port) + } + + if ipv6.Maskbits() == 128 { + return ipv6.NetIP().String() + } + + return fmt.Sprintf("%s/%d", ipv6.NetIP().String(), ipv6.Maskbits()) +} + +// Type is used as a type switch and returns TypeIPv6 +func (IPv6Addr) Type() SockAddrType { + return TypeIPv6 +} + +// IPv6Attrs returns a list of attributes supported by the IPv6Addr type +func IPv6Attrs() []AttrName { + return ipv6AddrAttrs +} + +// IPv6AddrAttr returns a string representation of an attribute for the given +// IPv6Addr. +func IPv6AddrAttr(ipv6 IPv6Addr, selector AttrName) string { + fn, found := ipv6AddrAttrMap[selector] + if !found { + return "" + } + + return fn(ipv6) +} + +// ipv6AddrInit is called once at init() +func ipv6AddrInit() { + // Sorted for human readability + ipv6AddrAttrs = []AttrName{ + "size", // Same position as in IPv6 for output consistency + "uint128", + } + + ipv6AddrAttrMap = map[AttrName]func(ipv6 IPv6Addr) string{ + "size": func(ipv6 IPv6Addr) string { + netSize := big.NewInt(1) + netSize = netSize.Lsh(netSize, uint(IPv6len*8-ipv6.Maskbits())) + return netSize.Text(10) + }, + "uint128": func(ipv6 IPv6Addr) string { + b := big.Int(*ipv6.Address) + return b.Text(10) + }, + } +} + +// bigIntToNetIPv6 is a helper function that correctly returns a net.IP with the +// correctly padded values. +func bigIntToNetIPv6(bi *big.Int) *net.IP { + x := make(net.IP, IPv6len) + ipv6Bytes := bi.Bytes() + + // It's possibe for ipv6Bytes to be less than IPv6len bytes in size. If + // they are different sizes we to pad the size of response. + if len(ipv6Bytes) < IPv6len { + buf := new(bytes.Buffer) + buf.Grow(IPv6len) + + for i := len(ipv6Bytes); i < IPv6len; i++ { + if err := binary.Write(buf, binary.BigEndian, byte(0)); err != nil { + panic(fmt.Sprintf("Unable to pad byte %d of input %v: %v", i, bi, err)) + } + } + + for _, b := range ipv6Bytes { + if err := binary.Write(buf, binary.BigEndian, b); err != nil { + panic(fmt.Sprintf("Unable to preserve endianness of input %v: %v", bi, err)) + } + } + + ipv6Bytes = buf.Bytes() + } + i := copy(x, ipv6Bytes) + if i != IPv6len { + panic("IPv6 wrong size") + } + return &x +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/rfc.go b/vendor/github.com/hashicorp/go-sockaddr/rfc.go new file mode 100644 index 000000000..fd9be940b --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/rfc.go @@ -0,0 +1,947 @@ +package sockaddr + +// ForwardingBlacklist is a faux RFC that includes a list of non-forwardable IP +// blocks. +const ForwardingBlacklist = 4294967295 + +// IsRFC tests to see if an SockAddr matches the specified RFC +func IsRFC(rfcNum uint, sa SockAddr) bool { + rfcNetMap := KnownRFCs() + rfcNets, ok := rfcNetMap[rfcNum] + if !ok { + return false + } + + var contained bool + for _, rfcNet := range rfcNets { + if rfcNet.Contains(sa) { + contained = true + break + } + } + return contained +} + +// KnownRFCs returns an initial set of known RFCs. +// +// NOTE (sean@): As this list evolves over time, please submit patches to keep +// this list current. If something isn't right, inquire, as it may just be a +// bug on my part. Some of the inclusions were based on my judgement as to what +// would be a useful value (e.g. RFC3330). +// +// Useful resources: +// +// * https://www.iana.org/assignments/ipv6-address-space/ipv6-address-space.xhtml +// * https://www.iana.org/assignments/ipv6-unicast-address-assignments/ipv6-unicast-address-assignments.xhtml +// * https://www.iana.org/assignments/ipv6-address-space/ipv6-address-space.xhtml +func KnownRFCs() map[uint]SockAddrs { + // NOTE(sean@): Multiple SockAddrs per RFC lend themselves well to a + // RADIX tree, but `ENOTIME`. Patches welcome. + return map[uint]SockAddrs{ + 919: { + // [RFC919] Broadcasting Internet Datagrams + MustIPv4Addr("255.255.255.255/32"), // [RFC1122], §7 Broadcast IP Addressing - Proposed Standards + }, + 1122: { + // [RFC1122] Requirements for Internet Hosts -- Communication Layers + MustIPv4Addr("0.0.0.0/8"), // [RFC1122], §3.2.1.3 + MustIPv4Addr("127.0.0.0/8"), // [RFC1122], §3.2.1.3 + }, + 1112: { + // [RFC1112] Host Extensions for IP Multicasting + MustIPv4Addr("224.0.0.0/4"), // [RFC1112], §4 Host Group Addresses + }, + 1918: { + // [RFC1918] Address Allocation for Private Internets + MustIPv4Addr("10.0.0.0/8"), + MustIPv4Addr("172.16.0.0/12"), + MustIPv4Addr("192.168.0.0/16"), + }, + 2544: { + // [RFC2544] Benchmarking Methodology for Network + // Interconnect Devices + MustIPv4Addr("198.18.0.0/15"), + }, + 2765: { + // [RFC2765] Stateless IP/ICMP Translation Algorithm + // (SIIT) (obsoleted by RFCs 6145, which itself was + // later obsoleted by 7915). + + // [RFC2765], §2.1 Addresses + MustIPv6Addr("0:0:0:0:0:ffff:0:0/96"), + }, + 2928: { + // [RFC2928] Initial IPv6 Sub-TLA ID Assignments + MustIPv6Addr("2001::/16"), // Superblock + //MustIPv6Addr("2001:0000::/23"), // IANA + //MustIPv6Addr("2001:0200::/23"), // APNIC + //MustIPv6Addr("2001:0400::/23"), // ARIN + //MustIPv6Addr("2001:0600::/23"), // RIPE NCC + //MustIPv6Addr("2001:0800::/23"), // (future assignment) + // ... + //MustIPv6Addr("2001:FE00::/23"), // (future assignment) + }, + 3056: { // 6to4 address + // [RFC3056] Connection of IPv6 Domains via IPv4 Clouds + + // [RFC3056], §2 IPv6 Prefix Allocation + MustIPv6Addr("2002::/16"), + }, + 3068: { + // [RFC3068] An Anycast Prefix for 6to4 Relay Routers + // (obsolete by RFC7526) + + // [RFC3068], § 6to4 Relay anycast address + MustIPv4Addr("192.88.99.0/24"), + + // [RFC3068], §2.5 6to4 IPv6 relay anycast address + // + // NOTE: /120 == 128-(32-24) + MustIPv6Addr("2002:c058:6301::/120"), + }, + 3171: { + // [RFC3171] IANA Guidelines for IPv4 Multicast Address Assignments + MustIPv4Addr("224.0.0.0/4"), + }, + 3330: { + // [RFC3330] Special-Use IPv4 Addresses + + // Addresses in this block refer to source hosts on + // "this" network. Address 0.0.0.0/32 may be used as a + // source address for this host on this network; other + // addresses within 0.0.0.0/8 may be used to refer to + // specified hosts on this network [RFC1700, page 4]. + MustIPv4Addr("0.0.0.0/8"), + + // 10.0.0.0/8 - This block is set aside for use in + // private networks. Its intended use is documented in + // [RFC1918]. Addresses within this block should not + // appear on the public Internet. + MustIPv4Addr("10.0.0.0/8"), + + // 14.0.0.0/8 - This block is set aside for assignments + // to the international system of Public Data Networks + // [RFC1700, page 181]. The registry of assignments + // within this block can be accessed from the "Public + // Data Network Numbers" link on the web page at + // http://www.iana.org/numbers.html. Addresses within + // this block are assigned to users and should be + // treated as such. + + // 24.0.0.0/8 - This block was allocated in early 1996 + // for use in provisioning IP service over cable + // television systems. Although the IANA initially was + // involved in making assignments to cable operators, + // this responsibility was transferred to American + // Registry for Internet Numbers (ARIN) in May 2001. + // Addresses within this block are assigned in the + // normal manner and should be treated as such. + + // 39.0.0.0/8 - This block was used in the "Class A + // Subnet Experiment" that commenced in May 1995, as + // documented in [RFC1797]. The experiment has been + // completed and this block has been returned to the + // pool of addresses reserved for future allocation or + // assignment. This block therefore no longer has a + // special use and is subject to allocation to a + // Regional Internet Registry for assignment in the + // normal manner. + + // 127.0.0.0/8 - This block is assigned for use as the Internet host + // loopback address. A datagram sent by a higher level protocol to an + // address anywhere within this block should loop back inside the host. + // This is ordinarily implemented using only 127.0.0.1/32 for loopback, + // but no addresses within this block should ever appear on any network + // anywhere [RFC1700, page 5]. + MustIPv4Addr("127.0.0.0/8"), + + // 128.0.0.0/16 - This block, corresponding to the + // numerically lowest of the former Class B addresses, + // was initially and is still reserved by the IANA. + // Given the present classless nature of the IP address + // space, the basis for the reservation no longer + // applies and addresses in this block are subject to + // future allocation to a Regional Internet Registry for + // assignment in the normal manner. + + // 169.254.0.0/16 - This is the "link local" block. It + // is allocated for communication between hosts on a + // single link. Hosts obtain these addresses by + // auto-configuration, such as when a DHCP server may + // not be found. + MustIPv4Addr("169.254.0.0/16"), + + // 172.16.0.0/12 - This block is set aside for use in + // private networks. Its intended use is documented in + // [RFC1918]. Addresses within this block should not + // appear on the public Internet. + MustIPv4Addr("172.16.0.0/12"), + + // 191.255.0.0/16 - This block, corresponding to the numerically highest + // to the former Class B addresses, was initially and is still reserved + // by the IANA. Given the present classless nature of the IP address + // space, the basis for the reservation no longer applies and addresses + // in this block are subject to future allocation to a Regional Internet + // Registry for assignment in the normal manner. + + // 192.0.0.0/24 - This block, corresponding to the + // numerically lowest of the former Class C addresses, + // was initially and is still reserved by the IANA. + // Given the present classless nature of the IP address + // space, the basis for the reservation no longer + // applies and addresses in this block are subject to + // future allocation to a Regional Internet Registry for + // assignment in the normal manner. + + // 192.0.2.0/24 - This block is assigned as "TEST-NET" for use in + // documentation and example code. It is often used in conjunction with + // domain names example.com or example.net in vendor and protocol + // documentation. Addresses within this block should not appear on the + // public Internet. + MustIPv4Addr("192.0.2.0/24"), + + // 192.88.99.0/24 - This block is allocated for use as 6to4 relay + // anycast addresses, according to [RFC3068]. + MustIPv4Addr("192.88.99.0/24"), + + // 192.168.0.0/16 - This block is set aside for use in private networks. + // Its intended use is documented in [RFC1918]. Addresses within this + // block should not appear on the public Internet. + MustIPv4Addr("192.168.0.0/16"), + + // 198.18.0.0/15 - This block has been allocated for use + // in benchmark tests of network interconnect devices. + // Its use is documented in [RFC2544]. + MustIPv4Addr("198.18.0.0/15"), + + // 223.255.255.0/24 - This block, corresponding to the + // numerically highest of the former Class C addresses, + // was initially and is still reserved by the IANA. + // Given the present classless nature of the IP address + // space, the basis for the reservation no longer + // applies and addresses in this block are subject to + // future allocation to a Regional Internet Registry for + // assignment in the normal manner. + + // 224.0.0.0/4 - This block, formerly known as the Class + // D address space, is allocated for use in IPv4 + // multicast address assignments. The IANA guidelines + // for assignments from this space are described in + // [RFC3171]. + MustIPv4Addr("224.0.0.0/4"), + + // 240.0.0.0/4 - This block, formerly known as the Class E address + // space, is reserved. The "limited broadcast" destination address + // 255.255.255.255 should never be forwarded outside the (sub-)net of + // the source. The remainder of this space is reserved + // for future use. [RFC1700, page 4] + MustIPv4Addr("240.0.0.0/4"), + }, + 3849: { + // [RFC3849] IPv6 Address Prefix Reserved for Documentation + MustIPv6Addr("2001:db8::/32"), // [RFC3849], §4 IANA Considerations + }, + 3927: { + // [RFC3927] Dynamic Configuration of IPv4 Link-Local Addresses + MustIPv4Addr("169.254.0.0/16"), // [RFC3927], §2.1 Link-Local Address Selection + }, + 4038: { + // [RFC4038] Application Aspects of IPv6 Transition + + // [RFC4038], §4.2. IPv6 Applications in a Dual-Stack Node + MustIPv6Addr("0:0:0:0:0:ffff::/96"), + }, + 4193: { + // [RFC4193] Unique Local IPv6 Unicast Addresses + MustIPv6Addr("fc00::/7"), + }, + 4291: { + // [RFC4291] IP Version 6 Addressing Architecture + + // [RFC4291], §2.5.2 The Unspecified Address + MustIPv6Addr("::/128"), + + // [RFC4291], §2.5.3 The Loopback Address + MustIPv6Addr("::1/128"), + + // [RFC4291], §2.5.5.1. IPv4-Compatible IPv6 Address + MustIPv6Addr("::/96"), + + // [RFC4291], §2.5.5.2. IPv4-Mapped IPv6 Address + MustIPv6Addr("::ffff:0:0/96"), + + // [RFC4291], §2.5.6 Link-Local IPv6 Unicast Addresses + MustIPv6Addr("fe80::/10"), + + // [RFC4291], §2.5.7 Site-Local IPv6 Unicast Addresses + // (depreciated) + MustIPv6Addr("fec0::/10"), + + // [RFC4291], §2.7 Multicast Addresses + MustIPv6Addr("ff00::/8"), + + // IPv6 Multicast Information. + // + // In the following "table" below, `ff0x` is replaced + // with the following values depending on the scope of + // the query: + // + // IPv6 Multicast Scopes: + // * ff00/9 // reserved + // * ff01/9 // interface-local + // * ff02/9 // link-local + // * ff03/9 // realm-local + // * ff04/9 // admin-local + // * ff05/9 // site-local + // * ff08/9 // organization-local + // * ff0e/9 // global + // * ff0f/9 // reserved + // + // IPv6 Multicast Addresses: + // * ff0x::2 // All routers + // * ff02::5 // OSPFIGP + // * ff02::6 // OSPFIGP Designated Routers + // * ff02::9 // RIP Routers + // * ff02::a // EIGRP Routers + // * ff02::d // All PIM Routers + // * ff02::1a // All RPL Routers + // * ff0x::fb // mDNSv6 + // * ff0x::101 // All Network Time Protocol (NTP) servers + // * ff02::1:1 // Link Name + // * ff02::1:2 // All-dhcp-agents + // * ff02::1:3 // Link-local Multicast Name Resolution + // * ff05::1:3 // All-dhcp-servers + // * ff02::1:ff00:0/104 // Solicited-node multicast address. + // * ff02::2:ff00:0/104 // Node Information Queries + }, + 4380: { + // [RFC4380] Teredo: Tunneling IPv6 over UDP through + // Network Address Translations (NATs) + + // [RFC4380], §2.6 Global Teredo IPv6 Service Prefix + MustIPv6Addr("2001:0000::/32"), + }, + 4773: { + // [RFC4773] Administration of the IANA Special Purpose IPv6 Address Block + MustIPv6Addr("2001:0000::/23"), // IANA + }, + 4843: { + // [RFC4843] An IPv6 Prefix for Overlay Routable Cryptographic Hash Identifiers (ORCHID) + MustIPv6Addr("2001:10::/28"), // [RFC4843], §7 IANA Considerations + }, + 5180: { + // [RFC5180] IPv6 Benchmarking Methodology for Network Interconnect Devices + MustIPv6Addr("2001:0200::/48"), // [RFC5180], §8 IANA Considerations + }, + 5735: { + // [RFC5735] Special Use IPv4 Addresses + MustIPv4Addr("192.0.2.0/24"), // TEST-NET-1 + MustIPv4Addr("198.51.100.0/24"), // TEST-NET-2 + MustIPv4Addr("203.0.113.0/24"), // TEST-NET-3 + MustIPv4Addr("198.18.0.0/15"), // Benchmarks + }, + 5737: { + // [RFC5737] IPv4 Address Blocks Reserved for Documentation + MustIPv4Addr("192.0.2.0/24"), // TEST-NET-1 + MustIPv4Addr("198.51.100.0/24"), // TEST-NET-2 + MustIPv4Addr("203.0.113.0/24"), // TEST-NET-3 + }, + 6052: { + // [RFC6052] IPv6 Addressing of IPv4/IPv6 Translators + MustIPv6Addr("64:ff9b::/96"), // [RFC6052], §2.1. Well-Known Prefix + }, + 6333: { + // [RFC6333] Dual-Stack Lite Broadband Deployments Following IPv4 Exhaustion + MustIPv4Addr("192.0.0.0/29"), // [RFC6333], §5.7 Well-Known IPv4 Address + }, + 6598: { + // [RFC6598] IANA-Reserved IPv4 Prefix for Shared Address Space + MustIPv4Addr("100.64.0.0/10"), + }, + 6666: { + // [RFC6666] A Discard Prefix for IPv6 + MustIPv6Addr("0100::/64"), + }, + 6890: { + // [RFC6890] Special-Purpose IP Address Registries + + // From "RFC6890 §2.2.1 Information Requirements": + /* + The IPv4 and IPv6 Special-Purpose Address Registries maintain the + following information regarding each entry: + + o Address Block - A block of IPv4 or IPv6 addresses that has been + registered for a special purpose. + + o Name - A descriptive name for the special-purpose address block. + + o RFC - The RFC through which the special-purpose address block was + requested. + + o Allocation Date - The date upon which the special-purpose address + block was allocated. + + o Termination Date - The date upon which the allocation is to be + terminated. This field is applicable for limited-use allocations + only. + + o Source - A boolean value indicating whether an address from the + allocated special-purpose address block is valid when used as the + source address of an IP datagram that transits two devices. + + o Destination - A boolean value indicating whether an address from + the allocated special-purpose address block is valid when used as + the destination address of an IP datagram that transits two + devices. + + o Forwardable - A boolean value indicating whether a router may + forward an IP datagram whose destination address is drawn from the + allocated special-purpose address block between external + interfaces. + + o Global - A boolean value indicating whether an IP datagram whose + destination address is drawn from the allocated special-purpose + address block is forwardable beyond a specified administrative + domain. + + o Reserved-by-Protocol - A boolean value indicating whether the + special-purpose address block is reserved by IP, itself. This + value is "TRUE" if the RFC that created the special-purpose + address block requires all compliant IP implementations to behave + in a special way when processing packets either to or from + addresses contained by the address block. + + If the value of "Destination" is FALSE, the values of "Forwardable" + and "Global" must also be false. + */ + + /*+----------------------+----------------------------+ + * | Attribute | Value | + * +----------------------+----------------------------+ + * | Address Block | 0.0.0.0/8 | + * | Name | "This host on this network"| + * | RFC | [RFC1122], Section 3.2.1.3 | + * | Allocation Date | September 1981 | + * | Termination Date | N/A | + * | Source | True | + * | Destination | False | + * | Forwardable | False | + * | Global | False | + * | Reserved-by-Protocol | True | + * +----------------------+----------------------------+*/ + MustIPv4Addr("0.0.0.0/8"), + + /*+----------------------+---------------+ + * | Attribute | Value | + * +----------------------+---------------+ + * | Address Block | 10.0.0.0/8 | + * | Name | Private-Use | + * | RFC | [RFC1918] | + * | Allocation Date | February 1996 | + * | Termination Date | N/A | + * | Source | True | + * | Destination | True | + * | Forwardable | True | + * | Global | False | + * | Reserved-by-Protocol | False | + * +----------------------+---------------+ */ + MustIPv4Addr("10.0.0.0/8"), + + /*+----------------------+----------------------+ + | Attribute | Value | + +----------------------+----------------------+ + | Address Block | 100.64.0.0/10 | + | Name | Shared Address Space | + | RFC | [RFC6598] | + | Allocation Date | April 2012 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------------+*/ + MustIPv4Addr("100.64.0.0/10"), + + /*+----------------------+----------------------------+ + | Attribute | Value | + +----------------------+----------------------------+ + | Address Block | 127.0.0.0/8 | + | Name | Loopback | + | RFC | [RFC1122], Section 3.2.1.3 | + | Allocation Date | September 1981 | + | Termination Date | N/A | + | Source | False [1] | + | Destination | False [1] | + | Forwardable | False [1] | + | Global | False [1] | + | Reserved-by-Protocol | True | + +----------------------+----------------------------+*/ + // [1] Several protocols have been granted exceptions to + // this rule. For examples, see [RFC4379] and + // [RFC5884]. + MustIPv4Addr("127.0.0.0/8"), + + /*+----------------------+----------------+ + | Attribute | Value | + +----------------------+----------------+ + | Address Block | 169.254.0.0/16 | + | Name | Link Local | + | RFC | [RFC3927] | + | Allocation Date | May 2005 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | True | + +----------------------+----------------+*/ + MustIPv4Addr("169.254.0.0/16"), + + /*+----------------------+---------------+ + | Attribute | Value | + +----------------------+---------------+ + | Address Block | 172.16.0.0/12 | + | Name | Private-Use | + | RFC | [RFC1918] | + | Allocation Date | February 1996 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+---------------+*/ + MustIPv4Addr("172.16.0.0/12"), + + /*+----------------------+---------------------------------+ + | Attribute | Value | + +----------------------+---------------------------------+ + | Address Block | 192.0.0.0/24 [2] | + | Name | IETF Protocol Assignments | + | RFC | Section 2.1 of this document | + | Allocation Date | January 2010 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+---------------------------------+*/ + // [2] Not usable unless by virtue of a more specific + // reservation. + MustIPv4Addr("192.0.0.0/24"), + + /*+----------------------+--------------------------------+ + | Attribute | Value | + +----------------------+--------------------------------+ + | Address Block | 192.0.0.0/29 | + | Name | IPv4 Service Continuity Prefix | + | RFC | [RFC6333], [RFC7335] | + | Allocation Date | June 2011 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+--------------------------------+*/ + MustIPv4Addr("192.0.0.0/29"), + + /*+----------------------+----------------------------+ + | Attribute | Value | + +----------------------+----------------------------+ + | Address Block | 192.0.2.0/24 | + | Name | Documentation (TEST-NET-1) | + | RFC | [RFC5737] | + | Allocation Date | January 2010 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------------------+*/ + MustIPv4Addr("192.0.2.0/24"), + + /*+----------------------+--------------------+ + | Attribute | Value | + +----------------------+--------------------+ + | Address Block | 192.88.99.0/24 | + | Name | 6to4 Relay Anycast | + | RFC | [RFC3068] | + | Allocation Date | June 2001 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | True | + | Reserved-by-Protocol | False | + +----------------------+--------------------+*/ + MustIPv4Addr("192.88.99.0/24"), + + /*+----------------------+----------------+ + | Attribute | Value | + +----------------------+----------------+ + | Address Block | 192.168.0.0/16 | + | Name | Private-Use | + | RFC | [RFC1918] | + | Allocation Date | February 1996 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------+*/ + MustIPv4Addr("192.168.0.0/16"), + + /*+----------------------+---------------+ + | Attribute | Value | + +----------------------+---------------+ + | Address Block | 198.18.0.0/15 | + | Name | Benchmarking | + | RFC | [RFC2544] | + | Allocation Date | March 1999 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+---------------+*/ + MustIPv4Addr("198.18.0.0/15"), + + /*+----------------------+----------------------------+ + | Attribute | Value | + +----------------------+----------------------------+ + | Address Block | 198.51.100.0/24 | + | Name | Documentation (TEST-NET-2) | + | RFC | [RFC5737] | + | Allocation Date | January 2010 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------------------+*/ + MustIPv4Addr("198.51.100.0/24"), + + /*+----------------------+----------------------------+ + | Attribute | Value | + +----------------------+----------------------------+ + | Address Block | 203.0.113.0/24 | + | Name | Documentation (TEST-NET-3) | + | RFC | [RFC5737] | + | Allocation Date | January 2010 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------------------+*/ + MustIPv4Addr("203.0.113.0/24"), + + /*+----------------------+----------------------+ + | Attribute | Value | + +----------------------+----------------------+ + | Address Block | 240.0.0.0/4 | + | Name | Reserved | + | RFC | [RFC1112], Section 4 | + | Allocation Date | August 1989 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | True | + +----------------------+----------------------+*/ + MustIPv4Addr("240.0.0.0/4"), + + /*+----------------------+----------------------+ + | Attribute | Value | + +----------------------+----------------------+ + | Address Block | 255.255.255.255/32 | + | Name | Limited Broadcast | + | RFC | [RFC0919], Section 7 | + | Allocation Date | October 1984 | + | Termination Date | N/A | + | Source | False | + | Destination | True | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------------+*/ + MustIPv4Addr("255.255.255.255/32"), + + /*+----------------------+------------------+ + | Attribute | Value | + +----------------------+------------------+ + | Address Block | ::1/128 | + | Name | Loopback Address | + | RFC | [RFC4291] | + | Allocation Date | February 2006 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | True | + +----------------------+------------------+*/ + MustIPv6Addr("::1/128"), + + /*+----------------------+---------------------+ + | Attribute | Value | + +----------------------+---------------------+ + | Address Block | ::/128 | + | Name | Unspecified Address | + | RFC | [RFC4291] | + | Allocation Date | February 2006 | + | Termination Date | N/A | + | Source | True | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | True | + +----------------------+---------------------+*/ + MustIPv6Addr("::/128"), + + /*+----------------------+---------------------+ + | Attribute | Value | + +----------------------+---------------------+ + | Address Block | 64:ff9b::/96 | + | Name | IPv4-IPv6 Translat. | + | RFC | [RFC6052] | + | Allocation Date | October 2010 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | True | + | Reserved-by-Protocol | False | + +----------------------+---------------------+*/ + MustIPv6Addr("64:ff9b::/96"), + + /*+----------------------+---------------------+ + | Attribute | Value | + +----------------------+---------------------+ + | Address Block | ::ffff:0:0/96 | + | Name | IPv4-mapped Address | + | RFC | [RFC4291] | + | Allocation Date | February 2006 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | True | + +----------------------+---------------------+*/ + MustIPv6Addr("::ffff:0:0/96"), + + /*+----------------------+----------------------------+ + | Attribute | Value | + +----------------------+----------------------------+ + | Address Block | 100::/64 | + | Name | Discard-Only Address Block | + | RFC | [RFC6666] | + | Allocation Date | June 2012 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------------------+*/ + MustIPv6Addr("100::/64"), + + /*+----------------------+---------------------------+ + | Attribute | Value | + +----------------------+---------------------------+ + | Address Block | 2001::/23 | + | Name | IETF Protocol Assignments | + | RFC | [RFC2928] | + | Allocation Date | September 2000 | + | Termination Date | N/A | + | Source | False[1] | + | Destination | False[1] | + | Forwardable | False[1] | + | Global | False[1] | + | Reserved-by-Protocol | False | + +----------------------+---------------------------+*/ + // [1] Unless allowed by a more specific allocation. + MustIPv6Addr("2001::/16"), + + /*+----------------------+----------------+ + | Attribute | Value | + +----------------------+----------------+ + | Address Block | 2001::/32 | + | Name | TEREDO | + | RFC | [RFC4380] | + | Allocation Date | January 2006 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------+*/ + // Covered by previous entry, included for completeness. + // + // MustIPv6Addr("2001::/16"), + + /*+----------------------+----------------+ + | Attribute | Value | + +----------------------+----------------+ + | Address Block | 2001:2::/48 | + | Name | Benchmarking | + | RFC | [RFC5180] | + | Allocation Date | April 2008 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+----------------+*/ + // Covered by previous entry, included for completeness. + // + // MustIPv6Addr("2001:2::/48"), + + /*+----------------------+---------------+ + | Attribute | Value | + +----------------------+---------------+ + | Address Block | 2001:db8::/32 | + | Name | Documentation | + | RFC | [RFC3849] | + | Allocation Date | July 2004 | + | Termination Date | N/A | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+---------------+*/ + // Covered by previous entry, included for completeness. + // + // MustIPv6Addr("2001:db8::/32"), + + /*+----------------------+--------------+ + | Attribute | Value | + +----------------------+--------------+ + | Address Block | 2001:10::/28 | + | Name | ORCHID | + | RFC | [RFC4843] | + | Allocation Date | March 2007 | + | Termination Date | March 2014 | + | Source | False | + | Destination | False | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+--------------+*/ + // Covered by previous entry, included for completeness. + // + // MustIPv6Addr("2001:10::/28"), + + /*+----------------------+---------------+ + | Attribute | Value | + +----------------------+---------------+ + | Address Block | 2002::/16 [2] | + | Name | 6to4 | + | RFC | [RFC3056] | + | Allocation Date | February 2001 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | N/A [2] | + | Reserved-by-Protocol | False | + +----------------------+---------------+*/ + // [2] See [RFC3056] for details. + MustIPv6Addr("2002::/16"), + + /*+----------------------+--------------+ + | Attribute | Value | + +----------------------+--------------+ + | Address Block | fc00::/7 | + | Name | Unique-Local | + | RFC | [RFC4193] | + | Allocation Date | October 2005 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | True | + | Global | False | + | Reserved-by-Protocol | False | + +----------------------+--------------+*/ + MustIPv6Addr("fc00::/7"), + + /*+----------------------+-----------------------+ + | Attribute | Value | + +----------------------+-----------------------+ + | Address Block | fe80::/10 | + | Name | Linked-Scoped Unicast | + | RFC | [RFC4291] | + | Allocation Date | February 2006 | + | Termination Date | N/A | + | Source | True | + | Destination | True | + | Forwardable | False | + | Global | False | + | Reserved-by-Protocol | True | + +----------------------+-----------------------+*/ + MustIPv6Addr("fe80::/10"), + }, + 7335: { + // [RFC7335] IPv4 Service Continuity Prefix + MustIPv4Addr("192.0.0.0/29"), // [RFC7335], §6 IANA Considerations + }, + ForwardingBlacklist: { // Pseudo-RFC + // Blacklist of non-forwardable IP blocks taken from RFC6890 + // + // TODO: the attributes for forwardable should be + // searcahble and embedded in the main list of RFCs + // above. + MustIPv4Addr("0.0.0.0/8"), + MustIPv4Addr("127.0.0.0/8"), + MustIPv4Addr("169.254.0.0/16"), + MustIPv4Addr("192.0.0.0/24"), + MustIPv4Addr("192.0.2.0/24"), + MustIPv4Addr("198.51.100.0/24"), + MustIPv4Addr("203.0.113.0/24"), + MustIPv4Addr("240.0.0.0/4"), + MustIPv4Addr("255.255.255.255/32"), + MustIPv6Addr("::1/128"), + MustIPv6Addr("::/128"), + MustIPv6Addr("::ffff:0:0/96"), + + // There is no way of expressing a whitelist per RFC2928 + // atm without creating a negative mask, which I don't + // want to do atm. + //MustIPv6Addr("2001::/23"), + + MustIPv6Addr("2001:db8::/32"), + MustIPv6Addr("2001:10::/28"), + MustIPv6Addr("fe80::/10"), + }, + } +} + +// VisitAllRFCs iterates over all known RFCs and calls the visitor +func VisitAllRFCs(fn func(rfcNum uint, sockaddrs SockAddrs)) { + rfcNetMap := KnownRFCs() + + // Blacklist of faux-RFCs. Don't show the world that we're abusing the + // RFC system in this library. + rfcBlacklist := map[uint]struct{}{ + ForwardingBlacklist: {}, + } + + for rfcNum, sas := range rfcNetMap { + if _, found := rfcBlacklist[rfcNum]; !found { + fn(rfcNum, sas) + } + } +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/route_info.go b/vendor/github.com/hashicorp/go-sockaddr/route_info.go new file mode 100644 index 000000000..2a3ee1db9 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/route_info.go @@ -0,0 +1,19 @@ +package sockaddr + +// RouteInterface specifies an interface for obtaining memoized route table and +// network information from a given OS. +type RouteInterface interface { + // GetDefaultInterfaceName returns the name of the interface that has a + // default route or an error and an empty string if a problem was + // encountered. + GetDefaultInterfaceName() (string, error) +} + +// VisitCommands visits each command used by the platform-specific RouteInfo +// implementation. +func (ri routeInfo) VisitCommands(fn func(name string, cmd []string)) { + for k, v := range ri.cmds { + cmds := append([]string(nil), v...) + fn(k, cmds) + } +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/route_info_bsd.go b/vendor/github.com/hashicorp/go-sockaddr/route_info_bsd.go new file mode 100644 index 000000000..705757abc --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/route_info_bsd.go @@ -0,0 +1,36 @@ +// +build darwin dragonfly freebsd netbsd openbsd + +package sockaddr + +import "os/exec" + +var cmds map[string][]string = map[string][]string{ + "route": {"/sbin/route", "-n", "get", "default"}, +} + +type routeInfo struct { + cmds map[string][]string +} + +// NewRouteInfo returns a BSD-specific implementation of the RouteInfo +// interface. +func NewRouteInfo() (routeInfo, error) { + return routeInfo{ + cmds: cmds, + }, nil +} + +// GetDefaultInterfaceName returns the interface name attached to the default +// route on the default interface. +func (ri routeInfo) GetDefaultInterfaceName() (string, error) { + out, err := exec.Command(cmds["route"][0], cmds["route"][1:]...).Output() + if err != nil { + return "", err + } + + var ifName string + if ifName, err = parseDefaultIfNameFromRoute(string(out)); err != nil { + return "", err + } + return ifName, nil +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/route_info_default.go b/vendor/github.com/hashicorp/go-sockaddr/route_info_default.go new file mode 100644 index 000000000..d1b009f65 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/route_info_default.go @@ -0,0 +1,10 @@ +// +build android nacl plan9 + +package sockaddr + +import "errors" + +// getDefaultIfName is the default interface function for unsupported platforms. +func getDefaultIfName() (string, error) { + return "", errors.New("No default interface found (unsupported platform)") +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/route_info_linux.go b/vendor/github.com/hashicorp/go-sockaddr/route_info_linux.go new file mode 100644 index 000000000..b33e4c0d0 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/route_info_linux.go @@ -0,0 +1,37 @@ +package sockaddr + +import ( + "errors" + "os/exec" +) + +var cmds map[string][]string = map[string][]string{ + "ip": {"/sbin/ip", "route"}, +} + +type routeInfo struct { + cmds map[string][]string +} + +// NewRouteInfo returns a Linux-specific implementation of the RouteInfo +// interface. +func NewRouteInfo() (routeInfo, error) { + return routeInfo{ + cmds: cmds, + }, nil +} + +// GetDefaultInterfaceName returns the interface name attached to the default +// route on the default interface. +func (ri routeInfo) GetDefaultInterfaceName() (string, error) { + out, err := exec.Command(cmds["ip"][0], cmds["ip"][1:]...).Output() + if err != nil { + return "", err + } + + var ifName string + if ifName, err = parseDefaultIfNameFromIPCmd(string(out)); err != nil { + return "", errors.New("No default interface found") + } + return ifName, nil +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/route_info_solaris.go b/vendor/github.com/hashicorp/go-sockaddr/route_info_solaris.go new file mode 100644 index 000000000..ee8e7984d --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/route_info_solaris.go @@ -0,0 +1,37 @@ +package sockaddr + +import ( + "errors" + "os/exec" +) + +var cmds map[string][]string = map[string][]string{ + "route": {"/usr/sbin/route", "-n", "get", "default"}, +} + +type routeInfo struct { + cmds map[string][]string +} + +// NewRouteInfo returns a BSD-specific implementation of the RouteInfo +// interface. +func NewRouteInfo() (routeInfo, error) { + return routeInfo{ + cmds: cmds, + }, nil +} + +// GetDefaultInterfaceName returns the interface name attached to the default +// route on the default interface. +func (ri routeInfo) GetDefaultInterfaceName() (string, error) { + out, err := exec.Command(cmds["route"][0], cmds["route"][1:]...).Output() + if err != nil { + return "", err + } + + var ifName string + if ifName, err = parseDefaultIfNameFromRoute(string(out)); err != nil { + return "", errors.New("No default interface found") + } + return ifName, nil +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/route_info_windows.go b/vendor/github.com/hashicorp/go-sockaddr/route_info_windows.go new file mode 100644 index 000000000..3da972883 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/route_info_windows.go @@ -0,0 +1,41 @@ +package sockaddr + +import "os/exec" + +var cmds map[string][]string = map[string][]string{ + "netstat": {"netstat", "-rn"}, + "ipconfig": {"ipconfig"}, +} + +type routeInfo struct { + cmds map[string][]string +} + +// NewRouteInfo returns a BSD-specific implementation of the RouteInfo +// interface. +func NewRouteInfo() (routeInfo, error) { + return routeInfo{ + cmds: cmds, + }, nil +} + +// GetDefaultInterfaceName returns the interface name attached to the default +// route on the default interface. +func (ri routeInfo) GetDefaultInterfaceName() (string, error) { + ifNameOut, err := exec.Command(cmds["netstat"][0], cmds["netstat"][1:]...).Output() + if err != nil { + return "", err + } + + ipconfigOut, err := exec.Command(cmds["ipconfig"][0], cmds["ipconfig"][1:]...).Output() + if err != nil { + return "", err + } + + ifName, err := parseDefaultIfNameWindows(string(ifNameOut), string(ipconfigOut)) + if err != nil { + return "", err + } + + return ifName, nil +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/sockaddr.go b/vendor/github.com/hashicorp/go-sockaddr/sockaddr.go new file mode 100644 index 000000000..51389ebe9 --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/sockaddr.go @@ -0,0 +1,178 @@ +package sockaddr + +import ( + "fmt" + "strings" +) + +type SockAddrType int +type AttrName string + +const ( + TypeUnknown SockAddrType = 0x0 + TypeUnix = 0x1 + TypeIPv4 = 0x2 + TypeIPv6 = 0x4 + + // TypeIP is the union of TypeIPv4 and TypeIPv6 + TypeIP = 0x6 +) + +type SockAddr interface { + // CmpRFC returns 0 if SockAddr exactly matches one of the matched RFC + // networks, -1 if the receiver is contained within the RFC network, or + // 1 if the address is not contained within the RFC. + CmpRFC(rfcNum uint, sa SockAddr) int + + // Contains returns true if the SockAddr arg is contained within the + // receiver + Contains(SockAddr) bool + + // Equal allows for the comparison of two SockAddrs + Equal(SockAddr) bool + + DialPacketArgs() (string, string) + DialStreamArgs() (string, string) + ListenPacketArgs() (string, string) + ListenStreamArgs() (string, string) + + // String returns the string representation of SockAddr + String() string + + // Type returns the SockAddrType + Type() SockAddrType +} + +// sockAddrAttrMap is a map of the SockAddr type-specific attributes. +var sockAddrAttrMap map[AttrName]func(SockAddr) string +var sockAddrAttrs []AttrName + +func init() { + sockAddrInit() +} + +// New creates a new SockAddr from the string. The order in which New() +// attempts to construct a SockAddr is: IPv4Addr, IPv6Addr, SockAddrUnix. +// +// NOTE: New() relies on the heuristic wherein if the path begins with either a +// '.' or '/' character before creating a new UnixSock. For UNIX sockets that +// are absolute paths or are nested within a sub-directory, this works as +// expected, however if the UNIX socket is contained in the current working +// directory, this will fail unless the path begins with "./" +// (e.g. "./my-local-socket"). Calls directly to NewUnixSock() do not suffer +// this limitation. Invalid IP addresses such as "256.0.0.0/-1" will run afoul +// of this heuristic and be assumed to be a valid UNIX socket path (which they +// are, but it is probably not what you want and you won't realize it until you +// stat(2) the file system to discover it doesn't exist). +func NewSockAddr(s string) (SockAddr, error) { + ipv4Addr, err := NewIPv4Addr(s) + if err == nil { + return ipv4Addr, nil + } + + ipv6Addr, err := NewIPv6Addr(s) + if err == nil { + return ipv6Addr, nil + } + + // Check to make sure the string begins with either a '.' or '/', or + // contains a '/'. + if len(s) > 1 && (strings.IndexAny(s[0:1], "./") != -1 || strings.IndexByte(s, '/') != -1) { + unixSock, err := NewUnixSock(s) + if err == nil { + return unixSock, nil + } + } + + return nil, fmt.Errorf("Unable to convert %q to an IPv4 or IPv6 address, or a UNIX Socket", s) +} + +// ToIPAddr returns an IPAddr type or nil if the type conversion fails. +func ToIPAddr(sa SockAddr) *IPAddr { + ipa, ok := sa.(IPAddr) + if !ok { + return nil + } + return &ipa +} + +// ToIPv4Addr returns an IPv4Addr type or nil if the type conversion fails. +func ToIPv4Addr(sa SockAddr) *IPv4Addr { + switch v := sa.(type) { + case IPv4Addr: + return &v + default: + return nil + } +} + +// ToIPv6Addr returns an IPv6Addr type or nil if the type conversion fails. +func ToIPv6Addr(sa SockAddr) *IPv6Addr { + switch v := sa.(type) { + case IPv6Addr: + return &v + default: + return nil + } +} + +// ToUnixSock returns a UnixSock type or nil if the type conversion fails. +func ToUnixSock(sa SockAddr) *UnixSock { + switch v := sa.(type) { + case UnixSock: + return &v + default: + return nil + } +} + +// SockAddrAttr returns a string representation of an attribute for the given +// SockAddr. +func SockAddrAttr(sa SockAddr, selector AttrName) string { + fn, found := sockAddrAttrMap[selector] + if !found { + return "" + } + + return fn(sa) +} + +// String() for SockAddrType returns a string representation of the +// SockAddrType (e.g. "IPv4", "IPv6", "UNIX", "IP", or "unknown"). +func (sat SockAddrType) String() string { + switch sat { + case TypeIPv4: + return "IPv4" + case TypeIPv6: + return "IPv6" + // There is no concrete "IP" type. Leaving here as a reminder. + // case TypeIP: + // return "IP" + case TypeUnix: + return "UNIX" + default: + panic("unsupported type") + } +} + +// sockAddrInit is called once at init() +func sockAddrInit() { + sockAddrAttrs = []AttrName{ + "type", // type should be first + "string", + } + + sockAddrAttrMap = map[AttrName]func(sa SockAddr) string{ + "string": func(sa SockAddr) string { + return sa.String() + }, + "type": func(sa SockAddr) string { + return sa.Type().String() + }, + } +} + +// UnixSockAttrs returns a list of attributes supported by the UnixSock type +func SockAddrAttrs() []AttrName { + return sockAddrAttrs +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/sockaddrs.go b/vendor/github.com/hashicorp/go-sockaddr/sockaddrs.go new file mode 100644 index 000000000..75fbffb1e --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/sockaddrs.go @@ -0,0 +1,193 @@ +package sockaddr + +import ( + "bytes" + "sort" +) + +// SockAddrs is a slice of SockAddrs +type SockAddrs []SockAddr + +func (s SockAddrs) Len() int { return len(s) } +func (s SockAddrs) Swap(i, j int) { s[i], s[j] = s[j], s[i] } + +// CmpAddrFunc is the function signature that must be met to be used in the +// OrderedAddrBy multiAddrSorter +type CmpAddrFunc func(p1, p2 *SockAddr) int + +// multiAddrSorter implements the Sort interface, sorting the SockAddrs within. +type multiAddrSorter struct { + addrs SockAddrs + cmp []CmpAddrFunc +} + +// Sort sorts the argument slice according to the Cmp functions passed to +// OrderedAddrBy. +func (ms *multiAddrSorter) Sort(sockAddrs SockAddrs) { + ms.addrs = sockAddrs + sort.Sort(ms) +} + +// OrderedAddrBy sorts SockAddr by the list of sort function pointers. +func OrderedAddrBy(cmpFuncs ...CmpAddrFunc) *multiAddrSorter { + return &multiAddrSorter{ + cmp: cmpFuncs, + } +} + +// Len is part of sort.Interface. +func (ms *multiAddrSorter) Len() int { + return len(ms.addrs) +} + +// Less is part of sort.Interface. It is implemented by looping along the +// Cmp() functions until it finds a comparison that is either less than, +// equal to, or greater than. +func (ms *multiAddrSorter) Less(i, j int) bool { + p, q := &ms.addrs[i], &ms.addrs[j] + // Try all but the last comparison. + var k int + for k = 0; k < len(ms.cmp)-1; k++ { + cmp := ms.cmp[k] + x := cmp(p, q) + switch x { + case -1: + // p < q, so we have a decision. + return true + case 1: + // p > q, so we have a decision. + return false + } + // p == q; try the next comparison. + } + // All comparisons to here said "equal", so just return whatever the + // final comparison reports. + switch ms.cmp[k](p, q) { + case -1: + return true + case 1: + return false + default: + // Still a tie! Now what? + return false + } +} + +// Swap is part of sort.Interface. +func (ms *multiAddrSorter) Swap(i, j int) { + ms.addrs[i], ms.addrs[j] = ms.addrs[j], ms.addrs[i] +} + +const ( + // NOTE (sean@): These constants are here for code readability only and + // are sprucing up the code for readability purposes. Some of the + // Cmp*() variants have confusing logic (especially when dealing with + // mixed-type comparisons) and this, I think, has made it easier to grok + // the code faster. + sortReceiverBeforeArg = -1 + sortDeferDecision = 0 + sortArgBeforeReceiver = 1 +) + +// AscAddress is a sorting function to sort SockAddrs by their respective +// address type. Non-equal types are deferred in the sort. +func AscAddress(p1Ptr, p2Ptr *SockAddr) int { + p1 := *p1Ptr + p2 := *p2Ptr + + switch v := p1.(type) { + case IPv4Addr: + return v.CmpAddress(p2) + case IPv6Addr: + return v.CmpAddress(p2) + case UnixSock: + return v.CmpAddress(p2) + default: + return sortDeferDecision + } +} + +// AscPort is a sorting function to sort SockAddrs by their respective address +// type. Non-equal types are deferred in the sort. +func AscPort(p1Ptr, p2Ptr *SockAddr) int { + p1 := *p1Ptr + p2 := *p2Ptr + + switch v := p1.(type) { + case IPv4Addr: + return v.CmpPort(p2) + case IPv6Addr: + return v.CmpPort(p2) + default: + return sortDeferDecision + } +} + +// AscPrivate is a sorting function to sort "more secure" private values before +// "more public" values. Both IPv4 and IPv6 are compared against RFC6890 +// (RFC6890 includes, and is not limited to, RFC1918 and RFC6598 for IPv4, and +// IPv6 includes RFC4193). +func AscPrivate(p1Ptr, p2Ptr *SockAddr) int { + p1 := *p1Ptr + p2 := *p2Ptr + + switch v := p1.(type) { + case IPv4Addr, IPv6Addr: + return v.CmpRFC(6890, p2) + default: + return sortDeferDecision + } +} + +// AscNetworkSize is a sorting function to sort SockAddrs based on their network +// size. Non-equal types are deferred in the sort. +func AscNetworkSize(p1Ptr, p2Ptr *SockAddr) int { + p1 := *p1Ptr + p2 := *p2Ptr + p1Type := p1.Type() + p2Type := p2.Type() + + // Network size operations on non-IP types make no sense + if p1Type != p2Type && p1Type != TypeIP { + return sortDeferDecision + } + + ipA := p1.(IPAddr) + ipB := p2.(IPAddr) + + return bytes.Compare([]byte(*ipA.NetIPMask()), []byte(*ipB.NetIPMask())) +} + +// AscType is a sorting function to sort "more secure" types before +// "less-secure" types. +func AscType(p1Ptr, p2Ptr *SockAddr) int { + p1 := *p1Ptr + p2 := *p2Ptr + p1Type := p1.Type() + p2Type := p2.Type() + switch { + case p1Type < p2Type: + return sortReceiverBeforeArg + case p1Type == p2Type: + return sortDeferDecision + case p1Type > p2Type: + return sortArgBeforeReceiver + default: + return sortDeferDecision + } +} + +// FilterByType returns two lists: a list of matched and unmatched SockAddrs +func (sas SockAddrs) FilterByType(type_ SockAddrType) (matched, excluded SockAddrs) { + matched = make(SockAddrs, 0, len(sas)) + excluded = make(SockAddrs, 0, len(sas)) + + for _, sa := range sas { + if sa.Type()&type_ != 0 { + matched = append(matched, sa) + } else { + excluded = append(excluded, sa) + } + } + return matched, excluded +} diff --git a/vendor/github.com/hashicorp/go-sockaddr/unixsock.go b/vendor/github.com/hashicorp/go-sockaddr/unixsock.go new file mode 100644 index 000000000..f3be3f67e --- /dev/null +++ b/vendor/github.com/hashicorp/go-sockaddr/unixsock.go @@ -0,0 +1,135 @@ +package sockaddr + +import ( + "fmt" + "strings" +) + +type UnixSock struct { + SockAddr + path string +} +type UnixSocks []*UnixSock + +// unixAttrMap is a map of the UnixSockAddr type-specific attributes. +var unixAttrMap map[AttrName]func(UnixSock) string +var unixAttrs []AttrName + +func init() { + unixAttrInit() +} + +// NewUnixSock creates an UnixSock from a string path. String can be in the +// form of either URI-based string (e.g. `file:///etc/passwd`), an absolute +// path (e.g. `/etc/passwd`), or a relative path (e.g. `./foo`). +func NewUnixSock(s string) (ret UnixSock, err error) { + ret.path = s + return ret, nil +} + +// CmpAddress follows the Cmp() standard protocol and returns: +// +// - -1 If the receiver should sort first because its name lexically sorts before arg +// - 0 if the SockAddr arg is not a UnixSock, or is a UnixSock with the same path. +// - 1 If the argument should sort first. +func (us UnixSock) CmpAddress(sa SockAddr) int { + usb, ok := sa.(UnixSock) + if !ok { + return sortDeferDecision + } + + return strings.Compare(us.Path(), usb.Path()) +} + +// DialPacketArgs returns the arguments required to be passed to net.DialUnix() +// with the `unixgram` network type. +func (us UnixSock) DialPacketArgs() (network, dialArgs string) { + return "unixgram", us.path +} + +// DialStreamArgs returns the arguments required to be passed to net.DialUnix() +// with the `unix` network type. +func (us UnixSock) DialStreamArgs() (network, dialArgs string) { + return "unix", us.path +} + +// Equal returns true if a SockAddr is equal to the receiving UnixSock. +func (us UnixSock) Equal(sa SockAddr) bool { + usb, ok := sa.(UnixSock) + if !ok { + return false + } + + if us.Path() != usb.Path() { + return false + } + + return true +} + +// ListenPacketArgs returns the arguments required to be passed to +// net.ListenUnixgram() with the `unixgram` network type. +func (us UnixSock) ListenPacketArgs() (network, dialArgs string) { + return "unixgram", us.path +} + +// ListenStreamArgs returns the arguments required to be passed to +// net.ListenUnix() with the `unix` network type. +func (us UnixSock) ListenStreamArgs() (network, dialArgs string) { + return "unix", us.path +} + +// MustUnixSock is a helper method that must return an UnixSock or panic on +// invalid input. +func MustUnixSock(addr string) UnixSock { + us, err := NewUnixSock(addr) + if err != nil { + panic(fmt.Sprintf("Unable to create a UnixSock from %+q: %v", addr, err)) + } + return us +} + +// Path returns the given path of the UnixSock +func (us UnixSock) Path() string { + return us.path +} + +// String returns the path of the UnixSock +func (us UnixSock) String() string { + return fmt.Sprintf("%+q", us.path) +} + +// Type is used as a type switch and returns TypeUnix +func (UnixSock) Type() SockAddrType { + return TypeUnix +} + +// UnixSockAttrs returns a list of attributes supported by the UnixSockAddr type +func UnixSockAttrs() []AttrName { + return unixAttrs +} + +// UnixSockAttr returns a string representation of an attribute for the given +// UnixSock. +func UnixSockAttr(us UnixSock, attrName AttrName) string { + fn, found := unixAttrMap[attrName] + if !found { + return "" + } + + return fn(us) +} + +// unixAttrInit is called once at init() +func unixAttrInit() { + // Sorted for human readability + unixAttrs = []AttrName{ + "path", + } + + unixAttrMap = map[AttrName]func(us UnixSock) string{ + "path": func(us UnixSock) string { + return us.Path() + }, + } +} diff --git a/vendor/github.com/hashicorp/golang-lru/2q.go b/vendor/github.com/hashicorp/golang-lru/2q.go new file mode 100644 index 000000000..337d96329 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/2q.go @@ -0,0 +1,212 @@ +package lru + +import ( + "fmt" + "sync" + + "github.com/hashicorp/golang-lru/simplelru" +) + +const ( + // Default2QRecentRatio is the ratio of the 2Q cache dedicated + // to recently added entries that have only been accessed once. + Default2QRecentRatio = 0.25 + + // Default2QGhostEntries is the default ratio of ghost + // entries kept to track entries recently evicted + Default2QGhostEntries = 0.50 +) + +// TwoQueueCache is a thread-safe fixed size 2Q cache. +// 2Q is an enhancement over the standard LRU cache +// in that it tracks both frequently and recently used +// entries separately. This avoids a burst in access to new +// entries from evicting frequently used entries. It adds some +// additional tracking overhead to the standard LRU cache, and is +// computationally about 2x the cost, and adds some metadata over +// head. The ARCCache is similar, but does not require setting any +// parameters. +type TwoQueueCache struct { + size int + recentSize int + + recent *simplelru.LRU + frequent *simplelru.LRU + recentEvict *simplelru.LRU + lock sync.RWMutex +} + +// New2Q creates a new TwoQueueCache using the default +// values for the parameters. +func New2Q(size int) (*TwoQueueCache, error) { + return New2QParams(size, Default2QRecentRatio, Default2QGhostEntries) +} + +// New2QParams creates a new TwoQueueCache using the provided +// parameter values. +func New2QParams(size int, recentRatio float64, ghostRatio float64) (*TwoQueueCache, error) { + if size <= 0 { + return nil, fmt.Errorf("invalid size") + } + if recentRatio < 0.0 || recentRatio > 1.0 { + return nil, fmt.Errorf("invalid recent ratio") + } + if ghostRatio < 0.0 || ghostRatio > 1.0 { + return nil, fmt.Errorf("invalid ghost ratio") + } + + // Determine the sub-sizes + recentSize := int(float64(size) * recentRatio) + evictSize := int(float64(size) * ghostRatio) + + // Allocate the LRUs + recent, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + frequent, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + recentEvict, err := simplelru.NewLRU(evictSize, nil) + if err != nil { + return nil, err + } + + // Initialize the cache + c := &TwoQueueCache{ + size: size, + recentSize: recentSize, + recent: recent, + frequent: frequent, + recentEvict: recentEvict, + } + return c, nil +} + +func (c *TwoQueueCache) Get(key interface{}) (interface{}, bool) { + c.lock.Lock() + defer c.lock.Unlock() + + // Check if this is a frequent value + if val, ok := c.frequent.Get(key); ok { + return val, ok + } + + // If the value is contained in recent, then we + // promote it to frequent + if val, ok := c.recent.Peek(key); ok { + c.recent.Remove(key) + c.frequent.Add(key, val) + return val, ok + } + + // No hit + return nil, false +} + +func (c *TwoQueueCache) Add(key, value interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + + // Check if the value is frequently used already, + // and just update the value + if c.frequent.Contains(key) { + c.frequent.Add(key, value) + return + } + + // Check if the value is recently used, and promote + // the value into the frequent list + if c.recent.Contains(key) { + c.recent.Remove(key) + c.frequent.Add(key, value) + return + } + + // If the value was recently evicted, add it to the + // frequently used list + if c.recentEvict.Contains(key) { + c.ensureSpace(true) + c.recentEvict.Remove(key) + c.frequent.Add(key, value) + return + } + + // Add to the recently seen list + c.ensureSpace(false) + c.recent.Add(key, value) + return +} + +// ensureSpace is used to ensure we have space in the cache +func (c *TwoQueueCache) ensureSpace(recentEvict bool) { + // If we have space, nothing to do + recentLen := c.recent.Len() + freqLen := c.frequent.Len() + if recentLen+freqLen < c.size { + return + } + + // If the recent buffer is larger than + // the target, evict from there + if recentLen > 0 && (recentLen > c.recentSize || (recentLen == c.recentSize && !recentEvict)) { + k, _, _ := c.recent.RemoveOldest() + c.recentEvict.Add(k, nil) + return + } + + // Remove from the frequent list otherwise + c.frequent.RemoveOldest() +} + +func (c *TwoQueueCache) Len() int { + c.lock.RLock() + defer c.lock.RUnlock() + return c.recent.Len() + c.frequent.Len() +} + +func (c *TwoQueueCache) Keys() []interface{} { + c.lock.RLock() + defer c.lock.RUnlock() + k1 := c.frequent.Keys() + k2 := c.recent.Keys() + return append(k1, k2...) +} + +func (c *TwoQueueCache) Remove(key interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + if c.frequent.Remove(key) { + return + } + if c.recent.Remove(key) { + return + } + if c.recentEvict.Remove(key) { + return + } +} + +func (c *TwoQueueCache) Purge() { + c.lock.Lock() + defer c.lock.Unlock() + c.recent.Purge() + c.frequent.Purge() + c.recentEvict.Purge() +} + +func (c *TwoQueueCache) Contains(key interface{}) bool { + c.lock.RLock() + defer c.lock.RUnlock() + return c.frequent.Contains(key) || c.recent.Contains(key) +} + +func (c *TwoQueueCache) Peek(key interface{}) (interface{}, bool) { + c.lock.RLock() + defer c.lock.RUnlock() + if val, ok := c.frequent.Peek(key); ok { + return val, ok + } + return c.recent.Peek(key) +} diff --git a/vendor/github.com/hashicorp/golang-lru/README.md b/vendor/github.com/hashicorp/golang-lru/README.md new file mode 100644 index 000000000..33e58cfaf --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/README.md @@ -0,0 +1,25 @@ +golang-lru +========== + +This provides the `lru` package which implements a fixed-size +thread safe LRU cache. It is based on the cache in Groupcache. + +Documentation +============= + +Full docs are available on [Godoc](http://godoc.org/github.com/hashicorp/golang-lru) + +Example +======= + +Using the LRU is very simple: + +```go +l, _ := New(128) +for i := 0; i < 256; i++ { + l.Add(i, nil) +} +if l.Len() != 128 { + panic(fmt.Sprintf("bad len: %v", l.Len())) +} +``` diff --git a/vendor/github.com/hashicorp/golang-lru/arc.go b/vendor/github.com/hashicorp/golang-lru/arc.go new file mode 100644 index 000000000..a2a252817 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/arc.go @@ -0,0 +1,257 @@ +package lru + +import ( + "sync" + + "github.com/hashicorp/golang-lru/simplelru" +) + +// ARCCache is a thread-safe fixed size Adaptive Replacement Cache (ARC). +// ARC is an enhancement over the standard LRU cache in that tracks both +// frequency and recency of use. This avoids a burst in access to new +// entries from evicting the frequently used older entries. It adds some +// additional tracking overhead to a standard LRU cache, computationally +// it is roughly 2x the cost, and the extra memory overhead is linear +// with the size of the cache. ARC has been patented by IBM, but is +// similar to the TwoQueueCache (2Q) which requires setting parameters. +type ARCCache struct { + size int // Size is the total capacity of the cache + p int // P is the dynamic preference towards T1 or T2 + + t1 *simplelru.LRU // T1 is the LRU for recently accessed items + b1 *simplelru.LRU // B1 is the LRU for evictions from t1 + + t2 *simplelru.LRU // T2 is the LRU for frequently accessed items + b2 *simplelru.LRU // B2 is the LRU for evictions from t2 + + lock sync.RWMutex +} + +// NewARC creates an ARC of the given size +func NewARC(size int) (*ARCCache, error) { + // Create the sub LRUs + b1, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + b2, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + t1, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + t2, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + + // Initialize the ARC + c := &ARCCache{ + size: size, + p: 0, + t1: t1, + b1: b1, + t2: t2, + b2: b2, + } + return c, nil +} + +// Get looks up a key's value from the cache. +func (c *ARCCache) Get(key interface{}) (interface{}, bool) { + c.lock.Lock() + defer c.lock.Unlock() + + // Ff the value is contained in T1 (recent), then + // promote it to T2 (frequent) + if val, ok := c.t1.Peek(key); ok { + c.t1.Remove(key) + c.t2.Add(key, val) + return val, ok + } + + // Check if the value is contained in T2 (frequent) + if val, ok := c.t2.Get(key); ok { + return val, ok + } + + // No hit + return nil, false +} + +// Add adds a value to the cache. +func (c *ARCCache) Add(key, value interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + + // Check if the value is contained in T1 (recent), and potentially + // promote it to frequent T2 + if c.t1.Contains(key) { + c.t1.Remove(key) + c.t2.Add(key, value) + return + } + + // Check if the value is already in T2 (frequent) and update it + if c.t2.Contains(key) { + c.t2.Add(key, value) + return + } + + // Check if this value was recently evicted as part of the + // recently used list + if c.b1.Contains(key) { + // T1 set is too small, increase P appropriately + delta := 1 + b1Len := c.b1.Len() + b2Len := c.b2.Len() + if b2Len > b1Len { + delta = b2Len / b1Len + } + if c.p+delta >= c.size { + c.p = c.size + } else { + c.p += delta + } + + // Potentially need to make room in the cache + if c.t1.Len()+c.t2.Len() >= c.size { + c.replace(false) + } + + // Remove from B1 + c.b1.Remove(key) + + // Add the key to the frequently used list + c.t2.Add(key, value) + return + } + + // Check if this value was recently evicted as part of the + // frequently used list + if c.b2.Contains(key) { + // T2 set is too small, decrease P appropriately + delta := 1 + b1Len := c.b1.Len() + b2Len := c.b2.Len() + if b1Len > b2Len { + delta = b1Len / b2Len + } + if delta >= c.p { + c.p = 0 + } else { + c.p -= delta + } + + // Potentially need to make room in the cache + if c.t1.Len()+c.t2.Len() >= c.size { + c.replace(true) + } + + // Remove from B2 + c.b2.Remove(key) + + // Add the key to the frequntly used list + c.t2.Add(key, value) + return + } + + // Potentially need to make room in the cache + if c.t1.Len()+c.t2.Len() >= c.size { + c.replace(false) + } + + // Keep the size of the ghost buffers trim + if c.b1.Len() > c.size-c.p { + c.b1.RemoveOldest() + } + if c.b2.Len() > c.p { + c.b2.RemoveOldest() + } + + // Add to the recently seen list + c.t1.Add(key, value) + return +} + +// replace is used to adaptively evict from either T1 or T2 +// based on the current learned value of P +func (c *ARCCache) replace(b2ContainsKey bool) { + t1Len := c.t1.Len() + if t1Len > 0 && (t1Len > c.p || (t1Len == c.p && b2ContainsKey)) { + k, _, ok := c.t1.RemoveOldest() + if ok { + c.b1.Add(k, nil) + } + } else { + k, _, ok := c.t2.RemoveOldest() + if ok { + c.b2.Add(k, nil) + } + } +} + +// Len returns the number of cached entries +func (c *ARCCache) Len() int { + c.lock.RLock() + defer c.lock.RUnlock() + return c.t1.Len() + c.t2.Len() +} + +// Keys returns all the cached keys +func (c *ARCCache) Keys() []interface{} { + c.lock.RLock() + defer c.lock.RUnlock() + k1 := c.t1.Keys() + k2 := c.t2.Keys() + return append(k1, k2...) +} + +// Remove is used to purge a key from the cache +func (c *ARCCache) Remove(key interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + if c.t1.Remove(key) { + return + } + if c.t2.Remove(key) { + return + } + if c.b1.Remove(key) { + return + } + if c.b2.Remove(key) { + return + } +} + +// Purge is used to clear the cache +func (c *ARCCache) Purge() { + c.lock.Lock() + defer c.lock.Unlock() + c.t1.Purge() + c.t2.Purge() + c.b1.Purge() + c.b2.Purge() +} + +// Contains is used to check if the cache contains a key +// without updating recency or frequency. +func (c *ARCCache) Contains(key interface{}) bool { + c.lock.RLock() + defer c.lock.RUnlock() + return c.t1.Contains(key) || c.t2.Contains(key) +} + +// Peek is used to inspect the cache value of a key +// without updating recency or frequency. +func (c *ARCCache) Peek(key interface{}) (interface{}, bool) { + c.lock.RLock() + defer c.lock.RUnlock() + if val, ok := c.t1.Peek(key); ok { + return val, ok + } + return c.t2.Peek(key) +} diff --git a/vendor/github.com/hashicorp/golang-lru/lru.go b/vendor/github.com/hashicorp/golang-lru/lru.go new file mode 100644 index 000000000..a6285f989 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/lru.go @@ -0,0 +1,114 @@ +// This package provides a simple LRU cache. It is based on the +// LRU implementation in groupcache: +// https://github.com/golang/groupcache/tree/master/lru +package lru + +import ( + "sync" + + "github.com/hashicorp/golang-lru/simplelru" +) + +// Cache is a thread-safe fixed size LRU cache. +type Cache struct { + lru *simplelru.LRU + lock sync.RWMutex +} + +// New creates an LRU of the given size +func New(size int) (*Cache, error) { + return NewWithEvict(size, nil) +} + +// NewWithEvict constructs a fixed size cache with the given eviction +// callback. +func NewWithEvict(size int, onEvicted func(key interface{}, value interface{})) (*Cache, error) { + lru, err := simplelru.NewLRU(size, simplelru.EvictCallback(onEvicted)) + if err != nil { + return nil, err + } + c := &Cache{ + lru: lru, + } + return c, nil +} + +// Purge is used to completely clear the cache +func (c *Cache) Purge() { + c.lock.Lock() + c.lru.Purge() + c.lock.Unlock() +} + +// Add adds a value to the cache. Returns true if an eviction occurred. +func (c *Cache) Add(key, value interface{}) bool { + c.lock.Lock() + defer c.lock.Unlock() + return c.lru.Add(key, value) +} + +// Get looks up a key's value from the cache. +func (c *Cache) Get(key interface{}) (interface{}, bool) { + c.lock.Lock() + defer c.lock.Unlock() + return c.lru.Get(key) +} + +// Check if a key is in the cache, without updating the recent-ness +// or deleting it for being stale. +func (c *Cache) Contains(key interface{}) bool { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Contains(key) +} + +// Returns the key value (or undefined if not found) without updating +// the "recently used"-ness of the key. +func (c *Cache) Peek(key interface{}) (interface{}, bool) { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Peek(key) +} + +// ContainsOrAdd checks if a key is in the cache without updating the +// recent-ness or deleting it for being stale, and if not, adds the value. +// Returns whether found and whether an eviction occurred. +func (c *Cache) ContainsOrAdd(key, value interface{}) (ok, evict bool) { + c.lock.Lock() + defer c.lock.Unlock() + + if c.lru.Contains(key) { + return true, false + } else { + evict := c.lru.Add(key, value) + return false, evict + } +} + +// Remove removes the provided key from the cache. +func (c *Cache) Remove(key interface{}) { + c.lock.Lock() + c.lru.Remove(key) + c.lock.Unlock() +} + +// RemoveOldest removes the oldest item from the cache. +func (c *Cache) RemoveOldest() { + c.lock.Lock() + c.lru.RemoveOldest() + c.lock.Unlock() +} + +// Keys returns a slice of the keys in the cache, from oldest to newest. +func (c *Cache) Keys() []interface{} { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Keys() +} + +// Len returns the number of items in the cache. +func (c *Cache) Len() int { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Len() +} diff --git a/vendor/github.com/hashicorp/golang-lru/simplelru/lru.go b/vendor/github.com/hashicorp/golang-lru/simplelru/lru.go new file mode 100644 index 000000000..cb416b394 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/simplelru/lru.go @@ -0,0 +1,160 @@ +package simplelru + +import ( + "container/list" + "errors" +) + +// EvictCallback is used to get a callback when a cache entry is evicted +type EvictCallback func(key interface{}, value interface{}) + +// LRU implements a non-thread safe fixed size LRU cache +type LRU struct { + size int + evictList *list.List + items map[interface{}]*list.Element + onEvict EvictCallback +} + +// entry is used to hold a value in the evictList +type entry struct { + key interface{} + value interface{} +} + +// NewLRU constructs an LRU of the given size +func NewLRU(size int, onEvict EvictCallback) (*LRU, error) { + if size <= 0 { + return nil, errors.New("Must provide a positive size") + } + c := &LRU{ + size: size, + evictList: list.New(), + items: make(map[interface{}]*list.Element), + onEvict: onEvict, + } + return c, nil +} + +// Purge is used to completely clear the cache +func (c *LRU) Purge() { + for k, v := range c.items { + if c.onEvict != nil { + c.onEvict(k, v.Value.(*entry).value) + } + delete(c.items, k) + } + c.evictList.Init() +} + +// Add adds a value to the cache. Returns true if an eviction occurred. +func (c *LRU) Add(key, value interface{}) bool { + // Check for existing item + if ent, ok := c.items[key]; ok { + c.evictList.MoveToFront(ent) + ent.Value.(*entry).value = value + return false + } + + // Add new item + ent := &entry{key, value} + entry := c.evictList.PushFront(ent) + c.items[key] = entry + + evict := c.evictList.Len() > c.size + // Verify size not exceeded + if evict { + c.removeOldest() + } + return evict +} + +// Get looks up a key's value from the cache. +func (c *LRU) Get(key interface{}) (value interface{}, ok bool) { + if ent, ok := c.items[key]; ok { + c.evictList.MoveToFront(ent) + return ent.Value.(*entry).value, true + } + return +} + +// Check if a key is in the cache, without updating the recent-ness +// or deleting it for being stale. +func (c *LRU) Contains(key interface{}) (ok bool) { + _, ok = c.items[key] + return ok +} + +// Returns the key value (or undefined if not found) without updating +// the "recently used"-ness of the key. +func (c *LRU) Peek(key interface{}) (value interface{}, ok bool) { + if ent, ok := c.items[key]; ok { + return ent.Value.(*entry).value, true + } + return nil, ok +} + +// Remove removes the provided key from the cache, returning if the +// key was contained. +func (c *LRU) Remove(key interface{}) bool { + if ent, ok := c.items[key]; ok { + c.removeElement(ent) + return true + } + return false +} + +// RemoveOldest removes the oldest item from the cache. +func (c *LRU) RemoveOldest() (interface{}, interface{}, bool) { + ent := c.evictList.Back() + if ent != nil { + c.removeElement(ent) + kv := ent.Value.(*entry) + return kv.key, kv.value, true + } + return nil, nil, false +} + +// GetOldest returns the oldest entry +func (c *LRU) GetOldest() (interface{}, interface{}, bool) { + ent := c.evictList.Back() + if ent != nil { + kv := ent.Value.(*entry) + return kv.key, kv.value, true + } + return nil, nil, false +} + +// Keys returns a slice of the keys in the cache, from oldest to newest. +func (c *LRU) Keys() []interface{} { + keys := make([]interface{}, len(c.items)) + i := 0 + for ent := c.evictList.Back(); ent != nil; ent = ent.Prev() { + keys[i] = ent.Value.(*entry).key + i++ + } + return keys +} + +// Len returns the number of items in the cache. +func (c *LRU) Len() int { + return c.evictList.Len() +} + +// removeOldest removes the oldest item from the cache. +func (c *LRU) removeOldest() { + ent := c.evictList.Back() + if ent != nil { + c.removeElement(ent) + } +} + +// removeElement is used to remove a given list element from the cache +func (c *LRU) removeElement(e *list.Element) { + c.evictList.Remove(e) + kv := e.Value.(*entry) + delete(c.items, kv.key) + if c.onEvict != nil { + c.onEvict(kv.key, kv.value) + } +} diff --git a/vendor/github.com/hashicorp/memberlist/LICENSE b/vendor/github.com/hashicorp/memberlist/LICENSE new file mode 100644 index 000000000..c33dcc7c9 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/vendor/github.com/hashicorp/memberlist/Makefile b/vendor/github.com/hashicorp/memberlist/Makefile new file mode 100644 index 000000000..56ef6c28c --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/Makefile @@ -0,0 +1,14 @@ +test: subnet + go test ./... + +integ: subnet + INTEG_TESTS=yes go test ./... + +subnet: + ./test/setup_subnet.sh + +cov: + gocov test github.com/hashicorp/memberlist | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PNONY: test cov integ diff --git a/vendor/github.com/hashicorp/memberlist/README.md b/vendor/github.com/hashicorp/memberlist/README.md new file mode 100644 index 000000000..fc605a59b --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/README.md @@ -0,0 +1,144 @@ +# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) + +memberlist is a [Go](http://www.golang.org) library that manages cluster +membership and member failure detection using a gossip based protocol. + +The use cases for such a library are far-reaching: all distributed systems +require membership, and memberlist is a re-usable solution to managing +cluster membership and node failure detection. + +memberlist is eventually consistent but converges quickly on average. +The speed at which it converges can be heavily tuned via various knobs +on the protocol. Node failures are detected and network partitions are partially +tolerated by attempting to communicate to potentially dead nodes through +multiple routes. + +## Building + +If you wish to build memberlist you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Usage + +Memberlist is surprisingly simple to use. An example is shown below: + +```go +/* Create the initial memberlist from a safe configuration. + Please reference the godoc for other default config types. + http://godoc.org/github.com/hashicorp/memberlist#Config +*/ +list, err := memberlist.Create(memberlist.DefaultLocalConfig()) +if err != nil { + panic("Failed to create memberlist: " + err.Error()) +} + +// Join an existing cluster by specifying at least one known member. +n, err := list.Join([]string{"1.2.3.4"}) +if err != nil { + panic("Failed to join cluster: " + err.Error()) +} + +// Ask for members of the cluster +for _, member := range list.Members() { + fmt.Printf("Member: %s %s\n", member.Name, member.Addr) +} + +// Continue doing whatever you need, memberlist will maintain membership +// information in the background. Delegates can be used for receiving +// events when members join or leave. +``` + +The most difficult part of memberlist is configuring it since it has many +available knobs in order to tune state propagation delay and convergence times. +Memberlist provides a default configuration that offers a good starting point, +but errs on the side of caution, choosing values that are optimized for +higher convergence at the cost of higher bandwidth usage. + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/memberlist). + +## Protocol + +memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf), +with a few minor adaptations, mostly to increase propagation speed and +convergence rate. + +A high level overview of the memberlist protocol (based on SWIM) is +described below, but for details please read the full +[SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) +followed by the memberlist source. We welcome any questions related +to the protocol on our issue tracker. + +### Protocol Description + +memberlist begins by joining an existing cluster or starting a new +cluster. If starting a new cluster, additional nodes are expected to join +it. New nodes in an existing cluster must be given the address of at +least one existing member in order to join the cluster. The new member +does a full state sync with the existing member over TCP and begins gossiping its +existence to the cluster. + +Gossip is done over UDP with a configurable but fixed fanout and interval. +This ensures that network usage is constant with regards to number of nodes, as opposed to +exponential growth that can occur with traditional heartbeat mechanisms. +Complete state exchanges with a random node are done periodically over +TCP, but much less often than gossip messages. This increases the likelihood +that the membership list converges properly since the full state is exchanged +and merged. The interval between full state exchanges is configurable or can +be disabled entirely. + +Failure detection is done by periodic random probing using a configurable interval. +If the node fails to ack within a reasonable time (typically some multiple +of RTT), then an indirect probe as well as a direct TCP probe are attempted. An +indirect probe asks a configurable number of random nodes to probe the same node, +in case there are network issues causing our own node to fail the probe. The direct +TCP probe is used to help identify the common situation where networking is +misconfigured to allow TCP but not UDP. Without the TCP probe, a UDP-isolated node +would think all other nodes were suspect and could cause churn in the cluster when +it attempts a TCP-based state exchange with another node. It is not desirable to +operate with only TCP connectivity because convergence will be much slower, but it +is enabled so that memberlist can detect this situation and alert operators. + +If both our probe, the indirect probes, and the direct TCP probe fail within a +configurable time, then the node is marked "suspicious" and this knowledge is +gossiped to the cluster. A suspicious node is still considered a member of +cluster. If the suspect member of the cluster does not dispute the suspicion +within a configurable period of time, the node is finally considered dead, +and this state is then gossiped to the cluster. + +This is a brief and incomplete description of the protocol. For a better idea, +please read the +[SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) +in its entirety, along with the memberlist source code. + +### Changes from SWIM + +As mentioned earlier, the memberlist protocol is based on SWIM but includes +minor changes, mostly to increase propagation speed and convergence rates. + +The changes from SWIM are noted here: + +* memberlist does a full state sync over TCP periodically. SWIM only propagates + changes over gossip. While both eventually reach convergence, the full state + sync increases the likelihood that nodes are fully converged more quickly, + at the expense of more bandwidth usage. This feature can be totally disabled + if you wish. + +* memberlist has a dedicated gossip layer separate from the failure detection + protocol. SWIM only piggybacks gossip messages on top of probe/ack messages. + memberlist also piggybacks gossip messages on top of probe/ack messages, but + also will periodically send out dedicated gossip messages on their own. This + feature lets you have a higher gossip rate (for example once per 200ms) + and a slower failure detection rate (such as once per second), resulting + in overall faster convergence rates and data propagation speeds. This feature + can be totally disabed as well, if you wish. + +* memberlist stores around the state of dead nodes for a set amount of time, + so that when full syncs are requested, the requester also receives information + about dead nodes. Because SWIM doesn't do full syncs, SWIM deletes dead node + state immediately upon learning that the node is dead. This change again helps + the cluster converge more quickly. diff --git a/vendor/github.com/hashicorp/memberlist/alive_delegate.go b/vendor/github.com/hashicorp/memberlist/alive_delegate.go new file mode 100644 index 000000000..51a0ba905 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/alive_delegate.go @@ -0,0 +1,14 @@ +package memberlist + +// AliveDelegate is used to involve a client in processing +// a node "alive" message. When a node joins, either through +// a UDP gossip or TCP push/pull, we update the state of +// that node via an alive message. This can be used to filter +// a node out and prevent it from being considered a peer +// using application specific logic. +type AliveDelegate interface { + // NotifyMerge is invoked when a merge could take place. + // Provides a list of the nodes known by the peer. If + // the return value is non-nil, the merge is canceled. + NotifyAlive(peer *Node) error +} diff --git a/vendor/github.com/hashicorp/memberlist/awareness.go b/vendor/github.com/hashicorp/memberlist/awareness.go new file mode 100644 index 000000000..ea95c7538 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/awareness.go @@ -0,0 +1,69 @@ +package memberlist + +import ( + "sync" + "time" + + "github.com/armon/go-metrics" +) + +// awareness manages a simple metric for tracking the estimated health of the +// local node. Health is primary the node's ability to respond in the soft +// real-time manner required for correct health checking of other nodes in the +// cluster. +type awareness struct { + sync.RWMutex + + // max is the upper threshold for the timeout scale (the score will be + // constrained to be from 0 <= score < max). + max int + + // score is the current awareness score. Lower values are healthier and + // zero is the minimum value. + score int +} + +// newAwareness returns a new awareness object. +func newAwareness(max int) *awareness { + return &awareness{ + max: max, + score: 0, + } +} + +// ApplyDelta takes the given delta and applies it to the score in a thread-safe +// manner. It also enforces a floor of zero and a max of max, so deltas may not +// change the overall score if it's railed at one of the extremes. +func (a *awareness) ApplyDelta(delta int) { + a.Lock() + initial := a.score + a.score += delta + if a.score < 0 { + a.score = 0 + } else if a.score > (a.max - 1) { + a.score = (a.max - 1) + } + final := a.score + a.Unlock() + + if initial != final { + metrics.SetGauge([]string{"memberlist", "health", "score"}, float32(final)) + } +} + +// GetHealthScore returns the raw health score. +func (a *awareness) GetHealthScore() int { + a.RLock() + score := a.score + a.RUnlock() + return score +} + +// ScaleTimeout takes the given duration and scales it based on the current +// score. Less healthyness will lead to longer timeouts. +func (a *awareness) ScaleTimeout(timeout time.Duration) time.Duration { + a.RLock() + score := a.score + a.RUnlock() + return timeout * (time.Duration(score) + 1) +} diff --git a/vendor/github.com/hashicorp/memberlist/broadcast.go b/vendor/github.com/hashicorp/memberlist/broadcast.go new file mode 100644 index 000000000..f7e85a119 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/broadcast.go @@ -0,0 +1,100 @@ +package memberlist + +/* +The broadcast mechanism works by maintaining a sorted list of messages to be +sent out. When a message is to be broadcast, the retransmit count +is set to zero and appended to the queue. The retransmit count serves +as the "priority", ensuring that newer messages get sent first. Once +a message hits the retransmit limit, it is removed from the queue. + +Additionally, older entries can be invalidated by new messages that +are contradictory. For example, if we send "{suspect M1 inc: 1}, +then a following {alive M1 inc: 2} will invalidate that message +*/ + +type memberlistBroadcast struct { + node string + msg []byte + notify chan struct{} +} + +func (b *memberlistBroadcast) Invalidates(other Broadcast) bool { + // Check if that broadcast is a memberlist type + mb, ok := other.(*memberlistBroadcast) + if !ok { + return false + } + + // Invalidates any message about the same node + return b.node == mb.node +} + +func (b *memberlistBroadcast) Message() []byte { + return b.msg +} + +func (b *memberlistBroadcast) Finished() { + select { + case b.notify <- struct{}{}: + default: + } +} + +// encodeAndBroadcast encodes a message and enqueues it for broadcast. Fails +// silently if there is an encoding error. +func (m *Memberlist) encodeAndBroadcast(node string, msgType messageType, msg interface{}) { + m.encodeBroadcastNotify(node, msgType, msg, nil) +} + +// encodeBroadcastNotify encodes a message and enqueues it for broadcast +// and notifies the given channel when transmission is finished. Fails +// silently if there is an encoding error. +func (m *Memberlist) encodeBroadcastNotify(node string, msgType messageType, msg interface{}, notify chan struct{}) { + buf, err := encode(msgType, msg) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode message for broadcast: %s", err) + } else { + m.queueBroadcast(node, buf.Bytes(), notify) + } +} + +// queueBroadcast is used to start dissemination of a message. It will be +// sent up to a configured number of times. The message could potentially +// be invalidated by a future message about the same node +func (m *Memberlist) queueBroadcast(node string, msg []byte, notify chan struct{}) { + b := &memberlistBroadcast{node, msg, notify} + m.broadcasts.QueueBroadcast(b) +} + +// getBroadcasts is used to return a slice of broadcasts to send up to +// a maximum byte size, while imposing a per-broadcast overhead. This is used +// to fill a UDP packet with piggybacked data +func (m *Memberlist) getBroadcasts(overhead, limit int) [][]byte { + // Get memberlist messages first + toSend := m.broadcasts.GetBroadcasts(overhead, limit) + + // Check if the user has anything to broadcast + d := m.config.Delegate + if d != nil { + // Determine the bytes used already + bytesUsed := 0 + for _, msg := range toSend { + bytesUsed += len(msg) + overhead + } + + // Check space remaining for user messages + avail := limit - bytesUsed + if avail > overhead+userMsgOverhead { + userMsgs := d.GetBroadcasts(overhead+userMsgOverhead, avail) + + // Frame each user message + for _, msg := range userMsgs { + buf := make([]byte, 1, len(msg)+1) + buf[0] = byte(userMsg) + buf = append(buf, msg...) + toSend = append(toSend, buf) + } + } + } + return toSend +} diff --git a/vendor/github.com/hashicorp/memberlist/config.go b/vendor/github.com/hashicorp/memberlist/config.go new file mode 100644 index 000000000..1c13bfcd3 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/config.go @@ -0,0 +1,277 @@ +package memberlist + +import ( + "io" + "log" + "os" + "time" +) + +type Config struct { + // The name of this node. This must be unique in the cluster. + Name string + + // Configuration related to what address to bind to and ports to + // listen on. The port is used for both UDP and TCP gossip. + // It is assumed other nodes are running on this port, but they + // do not need to. + BindAddr string + BindPort int + + // Configuration related to what address to advertise to other + // cluster members. Used for nat traversal. + AdvertiseAddr string + AdvertisePort int + + // ProtocolVersion is the configured protocol version that we + // will _speak_. This must be between ProtocolVersionMin and + // ProtocolVersionMax. + ProtocolVersion uint8 + + // TCPTimeout is the timeout for establishing a TCP connection with + // a remote node for a full state sync. + TCPTimeout time.Duration + + // IndirectChecks is the number of nodes that will be asked to perform + // an indirect probe of a node in the case a direct probe fails. Memberlist + // waits for an ack from any single indirect node, so increasing this + // number will increase the likelihood that an indirect probe will succeed + // at the expense of bandwidth. + IndirectChecks int + + // RetransmitMult is the multiplier for the number of retransmissions + // that are attempted for messages broadcasted over gossip. The actual + // count of retransmissions is calculated using the formula: + // + // Retransmits = RetransmitMult * log(N+1) + // + // This allows the retransmits to scale properly with cluster size. The + // higher the multiplier, the more likely a failed broadcast is to converge + // at the expense of increased bandwidth. + RetransmitMult int + + // SuspicionMult is the multiplier for determining the time an + // inaccessible node is considered suspect before declaring it dead. + // The actual timeout is calculated using the formula: + // + // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval + // + // This allows the timeout to scale properly with expected propagation + // delay with a larger cluster size. The higher the multiplier, the longer + // an inaccessible node is considered part of the cluster before declaring + // it dead, giving that suspect node more time to refute if it is indeed + // still alive. + SuspicionMult int + + // SuspicionMaxTimeoutMult is the multiplier applied to the + // SuspicionTimeout used as an upper bound on detection time. This max + // timeout is calculated using the formula: + // + // SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout + // + // If everything is working properly, confirmations from other nodes will + // accelerate suspicion timers in a manner which will cause the timeout + // to reach the base SuspicionTimeout before that elapses, so this value + // will typically only come into play if a node is experiencing issues + // communicating with other nodes. It should be set to a something fairly + // large so that a node having problems will have a lot of chances to + // recover before falsely declaring other nodes as failed, but short + // enough for a legitimately isolated node to still make progress marking + // nodes failed in a reasonable amount of time. + SuspicionMaxTimeoutMult int + + // PushPullInterval is the interval between complete state syncs. + // Complete state syncs are done with a single node over TCP and are + // quite expensive relative to standard gossiped messages. Setting this + // to zero will disable state push/pull syncs completely. + // + // Setting this interval lower (more frequent) will increase convergence + // speeds across larger clusters at the expense of increased bandwidth + // usage. + PushPullInterval time.Duration + + // ProbeInterval and ProbeTimeout are used to configure probing + // behavior for memberlist. + // + // ProbeInterval is the interval between random node probes. Setting + // this lower (more frequent) will cause the memberlist cluster to detect + // failed nodes more quickly at the expense of increased bandwidth usage. + // + // ProbeTimeout is the timeout to wait for an ack from a probed node + // before assuming it is unhealthy. This should be set to 99-percentile + // of RTT (round-trip time) on your network. + ProbeInterval time.Duration + ProbeTimeout time.Duration + + // DisableTcpPings will turn off the fallback TCP pings that are attempted + // if the direct UDP ping fails. These get pipelined along with the + // indirect UDP pings. + DisableTcpPings bool + + // AwarenessMaxMultiplier will increase the probe interval if the node + // becomes aware that it might be degraded and not meeting the soft real + // time requirements to reliably probe other nodes. + AwarenessMaxMultiplier int + + // GossipInterval and GossipNodes are used to configure the gossip + // behavior of memberlist. + // + // GossipInterval is the interval between sending messages that need + // to be gossiped that haven't been able to piggyback on probing messages. + // If this is set to zero, non-piggyback gossip is disabled. By lowering + // this value (more frequent) gossip messages are propagated across + // the cluster more quickly at the expense of increased bandwidth. + // + // GossipNodes is the number of random nodes to send gossip messages to + // per GossipInterval. Increasing this number causes the gossip messages + // to propagate across the cluster more quickly at the expense of + // increased bandwidth. + // + // GossipToTheDeadTime is the interval after which a node has died that + // we will still try to gossip to it. This gives it a chance to refute. + GossipInterval time.Duration + GossipNodes int + GossipToTheDeadTime time.Duration + + // EnableCompression is used to control message compression. This can + // be used to reduce bandwidth usage at the cost of slightly more CPU + // utilization. This is only available starting at protocol version 1. + EnableCompression bool + + // SecretKey is used to initialize the primary encryption key in a keyring. + // The primary encryption key is the only key used to encrypt messages and + // the first key used while attempting to decrypt messages. Providing a + // value for this primary key will enable message-level encryption and + // verification, and automatically install the key onto the keyring. + // The value should be either 16, 24, or 32 bytes to select AES-128, + // AES-192, or AES-256. + SecretKey []byte + + // The keyring holds all of the encryption keys used internally. It is + // automatically initialized using the SecretKey and SecretKeys values. + Keyring *Keyring + + // Delegate and Events are delegates for receiving and providing + // data to memberlist via callback mechanisms. For Delegate, see + // the Delegate interface. For Events, see the EventDelegate interface. + // + // The DelegateProtocolMin/Max are used to guarantee protocol-compatibility + // for any custom messages that the delegate might do (broadcasts, + // local/remote state, etc.). If you don't set these, then the protocol + // versions will just be zero, and version compliance won't be done. + Delegate Delegate + DelegateProtocolVersion uint8 + DelegateProtocolMin uint8 + DelegateProtocolMax uint8 + Events EventDelegate + Conflict ConflictDelegate + Merge MergeDelegate + Ping PingDelegate + Alive AliveDelegate + + // DNSConfigPath points to the system's DNS config file, usually located + // at /etc/resolv.conf. It can be overridden via config for easier testing. + DNSConfigPath string + + // LogOutput is the writer where logs should be sent. If this is not + // set, logging will go to stderr by default. You cannot specify both LogOutput + // and Logger at the same time. + LogOutput io.Writer + + // Logger is a custom logger which you provide. If Logger is set, it will use + // this for the internal logger. If Logger is not set, it will fall back to the + // behavior for using LogOutput. You cannot specify both LogOutput and Logger + // at the same time. + Logger *log.Logger + + // Size of Memberlist's internal channel which handles UDP messages. The + // size of this determines the size of the queue which Memberlist will keep + // while UDP messages are handled. + HandoffQueueDepth int + + // Maximum number of bytes that memberlist expects UDP messages to be. A safe + // value for this is typically 1400 bytes (which is the default.) However, + // depending on your network's MTU (Maximum Transmission Unit) you may be able + // to increase this. + UDPBufferSize int +} + +// DefaultLANConfig returns a sane set of configurations for Memberlist. +// It uses the hostname as the node name, and otherwise sets very conservative +// values that are sane for most LAN environments. The default configuration +// errs on the side of caution, choosing values that are optimized +// for higher convergence at the cost of higher bandwidth usage. Regardless, +// these values are a good starting point when getting started with memberlist. +func DefaultLANConfig() *Config { + hostname, _ := os.Hostname() + return &Config{ + Name: hostname, + BindAddr: "0.0.0.0", + BindPort: 7946, + AdvertiseAddr: "", + AdvertisePort: 7946, + ProtocolVersion: ProtocolVersion2Compatible, + TCPTimeout: 10 * time.Second, // Timeout after 10 seconds + IndirectChecks: 3, // Use 3 nodes for the indirect ping + RetransmitMult: 4, // Retransmit a message 4 * log(N+1) nodes + SuspicionMult: 5, // Suspect a node for 5 * log(N+1) * Interval + SuspicionMaxTimeoutMult: 6, // For 10k nodes this will give a max timeout of 120 seconds + PushPullInterval: 30 * time.Second, // Low frequency + ProbeTimeout: 500 * time.Millisecond, // Reasonable RTT time for LAN + ProbeInterval: 1 * time.Second, // Failure check every second + DisableTcpPings: false, // TCP pings are safe, even with mixed versions + AwarenessMaxMultiplier: 8, // Probe interval backs off to 8 seconds + + GossipNodes: 3, // Gossip to 3 nodes + GossipInterval: 200 * time.Millisecond, // Gossip more rapidly + GossipToTheDeadTime: 30 * time.Second, // Same as push/pull + + EnableCompression: true, // Enable compression by default + + SecretKey: nil, + Keyring: nil, + + DNSConfigPath: "/etc/resolv.conf", + + HandoffQueueDepth: 1024, + UDPBufferSize: 1400, + } +} + +// DefaultWANConfig works like DefaultConfig, however it returns a configuration +// that is optimized for most WAN environments. The default configuration is +// still very conservative and errs on the side of caution. +func DefaultWANConfig() *Config { + conf := DefaultLANConfig() + conf.TCPTimeout = 30 * time.Second + conf.SuspicionMult = 6 + conf.PushPullInterval = 60 * time.Second + conf.ProbeTimeout = 3 * time.Second + conf.ProbeInterval = 5 * time.Second + conf.GossipNodes = 4 // Gossip less frequently, but to an additional node + conf.GossipInterval = 500 * time.Millisecond + conf.GossipToTheDeadTime = 60 * time.Second + return conf +} + +// DefaultLocalConfig works like DefaultConfig, however it returns a configuration +// that is optimized for a local loopback environments. The default configuration is +// still very conservative and errs on the side of caution. +func DefaultLocalConfig() *Config { + conf := DefaultLANConfig() + conf.TCPTimeout = time.Second + conf.IndirectChecks = 1 + conf.RetransmitMult = 2 + conf.SuspicionMult = 3 + conf.PushPullInterval = 15 * time.Second + conf.ProbeTimeout = 200 * time.Millisecond + conf.ProbeInterval = time.Second + conf.GossipInterval = 100 * time.Millisecond + conf.GossipToTheDeadTime = 15 * time.Second + return conf +} + +// Returns whether or not encryption is enabled +func (c *Config) EncryptionEnabled() bool { + return c.Keyring != nil && len(c.Keyring.GetKeys()) > 0 +} diff --git a/vendor/github.com/hashicorp/memberlist/conflict_delegate.go b/vendor/github.com/hashicorp/memberlist/conflict_delegate.go new file mode 100644 index 000000000..f52b136eb --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/conflict_delegate.go @@ -0,0 +1,10 @@ +package memberlist + +// ConflictDelegate is a used to inform a client that +// a node has attempted to join which would result in a +// name conflict. This happens if two clients are configured +// with the same name but different addresses. +type ConflictDelegate interface { + // NotifyConflict is invoked when a name conflict is detected + NotifyConflict(existing, other *Node) +} diff --git a/vendor/github.com/hashicorp/memberlist/delegate.go b/vendor/github.com/hashicorp/memberlist/delegate.go new file mode 100644 index 000000000..66aa2da79 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/delegate.go @@ -0,0 +1,37 @@ +package memberlist + +// Delegate is the interface that clients must implement if they want to hook +// into the gossip layer of Memberlist. All the methods must be thread-safe, +// as they can and generally will be called concurrently. +type Delegate interface { + // NodeMeta is used to retrieve meta-data about the current node + // when broadcasting an alive message. It's length is limited to + // the given byte size. This metadata is available in the Node structure. + NodeMeta(limit int) []byte + + // NotifyMsg is called when a user-data message is received. + // Care should be taken that this method does not block, since doing + // so would block the entire UDP packet receive loop. Additionally, the byte + // slice may be modified after the call returns, so it should be copied if needed. + NotifyMsg([]byte) + + // GetBroadcasts is called when user data messages can be broadcast. + // It can return a list of buffers to send. Each buffer should assume an + // overhead as provided with a limit on the total byte size allowed. + // The total byte size of the resulting data to send must not exceed + // the limit. Care should be taken that this method does not block, + // since doing so would block the entire UDP packet receive loop. + GetBroadcasts(overhead, limit int) [][]byte + + // LocalState is used for a TCP Push/Pull. This is sent to + // the remote side in addition to the membership information. Any + // data can be sent here. See MergeRemoteState as well. The `join` + // boolean indicates this is for a join instead of a push/pull. + LocalState(join bool) []byte + + // MergeRemoteState is invoked after a TCP Push/Pull. This is the + // state received from the remote side and is the result of the + // remote side's LocalState call. The 'join' + // boolean indicates this is for a join instead of a push/pull. + MergeRemoteState(buf []byte, join bool) +} diff --git a/vendor/github.com/hashicorp/memberlist/event_delegate.go b/vendor/github.com/hashicorp/memberlist/event_delegate.go new file mode 100644 index 000000000..35e2a56fd --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/event_delegate.go @@ -0,0 +1,61 @@ +package memberlist + +// EventDelegate is a simpler delegate that is used only to receive +// notifications about members joining and leaving. The methods in this +// delegate may be called by multiple goroutines, but never concurrently. +// This allows you to reason about ordering. +type EventDelegate interface { + // NotifyJoin is invoked when a node is detected to have joined. + // The Node argument must not be modified. + NotifyJoin(*Node) + + // NotifyLeave is invoked when a node is detected to have left. + // The Node argument must not be modified. + NotifyLeave(*Node) + + // NotifyUpdate is invoked when a node is detected to have + // updated, usually involving the meta data. The Node argument + // must not be modified. + NotifyUpdate(*Node) +} + +// ChannelEventDelegate is used to enable an application to receive +// events about joins and leaves over a channel instead of a direct +// function call. +// +// Care must be taken that events are processed in a timely manner from +// the channel, since this delegate will block until an event can be sent. +type ChannelEventDelegate struct { + Ch chan<- NodeEvent +} + +// NodeEventType are the types of events that can be sent from the +// ChannelEventDelegate. +type NodeEventType int + +const ( + NodeJoin NodeEventType = iota + NodeLeave + NodeUpdate +) + +// NodeEvent is a single event related to node activity in the memberlist. +// The Node member of this struct must not be directly modified. It is passed +// as a pointer to avoid unnecessary copies. If you wish to modify the node, +// make a copy first. +type NodeEvent struct { + Event NodeEventType + Node *Node +} + +func (c *ChannelEventDelegate) NotifyJoin(n *Node) { + c.Ch <- NodeEvent{NodeJoin, n} +} + +func (c *ChannelEventDelegate) NotifyLeave(n *Node) { + c.Ch <- NodeEvent{NodeLeave, n} +} + +func (c *ChannelEventDelegate) NotifyUpdate(n *Node) { + c.Ch <- NodeEvent{NodeUpdate, n} +} diff --git a/vendor/github.com/hashicorp/memberlist/keyring.go b/vendor/github.com/hashicorp/memberlist/keyring.go new file mode 100644 index 000000000..a2774a0ce --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/keyring.go @@ -0,0 +1,160 @@ +package memberlist + +import ( + "bytes" + "fmt" + "sync" +) + +type Keyring struct { + // Keys stores the key data used during encryption and decryption. It is + // ordered in such a way where the first key (index 0) is the primary key, + // which is used for encrypting messages, and is the first key tried during + // message decryption. + keys [][]byte + + // The keyring lock is used while performing IO operations on the keyring. + l sync.Mutex +} + +// Init allocates substructures +func (k *Keyring) init() { + k.keys = make([][]byte, 0) +} + +// NewKeyring constructs a new container for a set of encryption keys. The +// keyring contains all key data used internally by memberlist. +// +// While creating a new keyring, you must do one of: +// - Omit keys and primary key, effectively disabling encryption +// - Pass a set of keys plus the primary key +// - Pass only a primary key +// +// If only a primary key is passed, then it will be automatically added to the +// keyring. If creating a keyring with multiple keys, one key must be designated +// primary by passing it as the primaryKey. If the primaryKey does not exist in +// the list of secondary keys, it will be automatically added at position 0. +// +// A key should be either 16, 24, or 32 bytes to select AES-128, +// AES-192, or AES-256. +func NewKeyring(keys [][]byte, primaryKey []byte) (*Keyring, error) { + keyring := &Keyring{} + keyring.init() + + if len(keys) > 0 || len(primaryKey) > 0 { + if len(primaryKey) == 0 { + return nil, fmt.Errorf("Empty primary key not allowed") + } + if err := keyring.AddKey(primaryKey); err != nil { + return nil, err + } + for _, key := range keys { + if err := keyring.AddKey(key); err != nil { + return nil, err + } + } + } + + return keyring, nil +} + +// ValidateKey will check to see if the key is valid and returns an error if not. +// +// key should be either 16, 24, or 32 bytes to select AES-128, +// AES-192, or AES-256. +func ValidateKey(key []byte) error { + if l := len(key); l != 16 && l != 24 && l != 32 { + return fmt.Errorf("key size must be 16, 24 or 32 bytes") + } + return nil +} + +// AddKey will install a new key on the ring. Adding a key to the ring will make +// it available for use in decryption. If the key already exists on the ring, +// this function will just return noop. +// +// key should be either 16, 24, or 32 bytes to select AES-128, +// AES-192, or AES-256. +func (k *Keyring) AddKey(key []byte) error { + if err := ValidateKey(key); err != nil { + return err + } + + // No-op if key is already installed + for _, installedKey := range k.keys { + if bytes.Equal(installedKey, key) { + return nil + } + } + + keys := append(k.keys, key) + primaryKey := k.GetPrimaryKey() + if primaryKey == nil { + primaryKey = key + } + k.installKeys(keys, primaryKey) + return nil +} + +// UseKey changes the key used to encrypt messages. This is the only key used to +// encrypt messages, so peers should know this key before this method is called. +func (k *Keyring) UseKey(key []byte) error { + for _, installedKey := range k.keys { + if bytes.Equal(key, installedKey) { + k.installKeys(k.keys, key) + return nil + } + } + return fmt.Errorf("Requested key is not in the keyring") +} + +// RemoveKey drops a key from the keyring. This will return an error if the key +// requested for removal is currently at position 0 (primary key). +func (k *Keyring) RemoveKey(key []byte) error { + if bytes.Equal(key, k.keys[0]) { + return fmt.Errorf("Removing the primary key is not allowed") + } + for i, installedKey := range k.keys { + if bytes.Equal(key, installedKey) { + keys := append(k.keys[:i], k.keys[i+1:]...) + k.installKeys(keys, k.keys[0]) + } + } + return nil +} + +// installKeys will take out a lock on the keyring, and replace the keys with a +// new set of keys. The key indicated by primaryKey will be installed as the new +// primary key. +func (k *Keyring) installKeys(keys [][]byte, primaryKey []byte) { + k.l.Lock() + defer k.l.Unlock() + + newKeys := [][]byte{primaryKey} + for _, key := range keys { + if !bytes.Equal(key, primaryKey) { + newKeys = append(newKeys, key) + } + } + k.keys = newKeys +} + +// GetKeys returns the current set of keys on the ring. +func (k *Keyring) GetKeys() [][]byte { + k.l.Lock() + defer k.l.Unlock() + + return k.keys +} + +// GetPrimaryKey returns the key on the ring at position 0. This is the key used +// for encrypting messages, and is the first key tried for decrypting messages. +func (k *Keyring) GetPrimaryKey() (key []byte) { + k.l.Lock() + defer k.l.Unlock() + + if len(k.keys) > 0 { + key = k.keys[0] + } + return +} diff --git a/vendor/github.com/hashicorp/memberlist/logging.go b/vendor/github.com/hashicorp/memberlist/logging.go new file mode 100644 index 000000000..f31acfb2f --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/logging.go @@ -0,0 +1,22 @@ +package memberlist + +import ( + "fmt" + "net" +) + +func LogAddress(addr net.Addr) string { + if addr == nil { + return "from=" + } + + return fmt.Sprintf("from=%s", addr.String()) +} + +func LogConn(conn net.Conn) string { + if conn == nil { + return LogAddress(nil) + } + + return LogAddress(conn.RemoteAddr()) +} diff --git a/vendor/github.com/hashicorp/memberlist/memberlist.go b/vendor/github.com/hashicorp/memberlist/memberlist.go new file mode 100644 index 000000000..371e3294b --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/memberlist.go @@ -0,0 +1,660 @@ +/* +memberlist is a library that manages cluster +membership and member failure detection using a gossip based protocol. + +The use cases for such a library are far-reaching: all distributed systems +require membership, and memberlist is a re-usable solution to managing +cluster membership and node failure detection. + +memberlist is eventually consistent but converges quickly on average. +The speed at which it converges can be heavily tuned via various knobs +on the protocol. Node failures are detected and network partitions are partially +tolerated by attempting to communicate to potentially dead nodes through +multiple routes. +*/ +package memberlist + +import ( + "fmt" + "log" + "net" + "os" + "strconv" + "strings" + "sync" + "time" + + "github.com/hashicorp/go-multierror" + sockaddr "github.com/hashicorp/go-sockaddr" + "github.com/miekg/dns" +) + +type Memberlist struct { + sequenceNum uint32 // Local sequence number + incarnation uint32 // Local incarnation number + numNodes uint32 // Number of known nodes (estimate) + + config *Config + shutdown bool + shutdownCh chan struct{} + leave bool + leaveBroadcast chan struct{} + + udpListener *net.UDPConn + tcpListener *net.TCPListener + handoff chan msgHandoff + + nodeLock sync.RWMutex + nodes []*nodeState // Known nodes + nodeMap map[string]*nodeState // Maps Addr.String() -> NodeState + nodeTimers map[string]*suspicion // Maps Addr.String() -> suspicion timer + awareness *awareness + + tickerLock sync.Mutex + tickers []*time.Ticker + stopTick chan struct{} + probeIndex int + + ackLock sync.Mutex + ackHandlers map[uint32]*ackHandler + + broadcasts *TransmitLimitedQueue + + logger *log.Logger +} + +// newMemberlist creates the network listeners. +// Does not schedule execution of background maintenance. +func newMemberlist(conf *Config) (*Memberlist, error) { + if conf.ProtocolVersion < ProtocolVersionMin { + return nil, fmt.Errorf("Protocol version '%d' too low. Must be in range: [%d, %d]", + conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) + } else if conf.ProtocolVersion > ProtocolVersionMax { + return nil, fmt.Errorf("Protocol version '%d' too high. Must be in range: [%d, %d]", + conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) + } + + if len(conf.SecretKey) > 0 { + if conf.Keyring == nil { + keyring, err := NewKeyring(nil, conf.SecretKey) + if err != nil { + return nil, err + } + conf.Keyring = keyring + } else { + if err := conf.Keyring.AddKey(conf.SecretKey); err != nil { + return nil, err + } + if err := conf.Keyring.UseKey(conf.SecretKey); err != nil { + return nil, err + } + } + } + + tcpAddr := &net.TCPAddr{IP: net.ParseIP(conf.BindAddr), Port: conf.BindPort} + tcpLn, err := net.ListenTCP("tcp", tcpAddr) + if err != nil { + return nil, fmt.Errorf("Failed to start TCP listener. Err: %s", err) + } + if conf.BindPort == 0 { + conf.BindPort = tcpLn.Addr().(*net.TCPAddr).Port + } + + udpAddr := &net.UDPAddr{IP: net.ParseIP(conf.BindAddr), Port: conf.BindPort} + udpLn, err := net.ListenUDP("udp", udpAddr) + if err != nil { + tcpLn.Close() + return nil, fmt.Errorf("Failed to start UDP listener. Err: %s", err) + } + + // Set the UDP receive window size + setUDPRecvBuf(udpLn) + + if conf.LogOutput != nil && conf.Logger != nil { + return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.") + } + + logDest := conf.LogOutput + if logDest == nil { + logDest = os.Stderr + } + + logger := conf.Logger + if logger == nil { + logger = log.New(logDest, "", log.LstdFlags) + } + + m := &Memberlist{ + config: conf, + shutdownCh: make(chan struct{}), + leaveBroadcast: make(chan struct{}, 1), + udpListener: udpLn, + tcpListener: tcpLn, + handoff: make(chan msgHandoff, conf.HandoffQueueDepth), + nodeMap: make(map[string]*nodeState), + nodeTimers: make(map[string]*suspicion), + awareness: newAwareness(conf.AwarenessMaxMultiplier), + ackHandlers: make(map[uint32]*ackHandler), + broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, + logger: logger, + } + m.broadcasts.NumNodes = func() int { + return m.estNumNodes() + } + go m.tcpListen() + go m.udpListen() + go m.udpHandler() + return m, nil +} + +// Create will create a new Memberlist using the given configuration. +// This will not connect to any other node (see Join) yet, but will start +// all the listeners to allow other nodes to join this memberlist. +// After creating a Memberlist, the configuration given should not be +// modified by the user anymore. +func Create(conf *Config) (*Memberlist, error) { + m, err := newMemberlist(conf) + if err != nil { + return nil, err + } + if err := m.setAlive(); err != nil { + m.Shutdown() + return nil, err + } + m.schedule() + return m, nil +} + +// Join is used to take an existing Memberlist and attempt to join a cluster +// by contacting all the given hosts and performing a state sync. Initially, +// the Memberlist only contains our own state, so doing this will cause +// remote nodes to become aware of the existence of this node, effectively +// joining the cluster. +// +// This returns the number of hosts successfully contacted and an error if +// none could be reached. If an error is returned, the node did not successfully +// join the cluster. +func (m *Memberlist) Join(existing []string) (int, error) { + numSuccess := 0 + var errs error + for _, exist := range existing { + addrs, err := m.resolveAddr(exist) + if err != nil { + err = fmt.Errorf("Failed to resolve %s: %v", exist, err) + errs = multierror.Append(errs, err) + m.logger.Printf("[WARN] memberlist: %v", err) + continue + } + + for _, addr := range addrs { + if err := m.pushPullNode(addr.ip, addr.port, true); err != nil { + err = fmt.Errorf("Failed to join %s: %v", addr.ip, err) + errs = multierror.Append(errs, err) + m.logger.Printf("[DEBUG] memberlist: %v", err) + continue + } + numSuccess++ + } + + } + if numSuccess > 0 { + errs = nil + } + return numSuccess, errs +} + +// ipPort holds information about a node we want to try to join. +type ipPort struct { + ip net.IP + port uint16 +} + +// tcpLookupIP is a helper to initiate a TCP-based DNS lookup for the given host. +// The built-in Go resolver will do a UDP lookup first, and will only use TCP if +// the response has the truncate bit set, which isn't common on DNS servers like +// Consul's. By doing the TCP lookup directly, we get the best chance for the +// largest list of hosts to join. Since joins are relatively rare events, it's ok +// to do this rather expensive operation. +func (m *Memberlist) tcpLookupIP(host string, defaultPort uint16) ([]ipPort, error) { + // Don't attempt any TCP lookups against non-fully qualified domain + // names, since those will likely come from the resolv.conf file. + if !strings.Contains(host, ".") { + return nil, nil + } + + // Make sure the domain name is terminated with a dot (we know there's + // at least one character at this point). + dn := host + if dn[len(dn)-1] != '.' { + dn = dn + "." + } + + // See if we can find a server to try. + cc, err := dns.ClientConfigFromFile(m.config.DNSConfigPath) + if err != nil { + return nil, err + } + if len(cc.Servers) > 0 { + // We support host:port in the DNS config, but need to add the + // default port if one is not supplied. + server := cc.Servers[0] + if !hasPort(server) { + server = net.JoinHostPort(server, cc.Port) + } + + // Do the lookup. + c := new(dns.Client) + c.Net = "tcp" + msg := new(dns.Msg) + msg.SetQuestion(dn, dns.TypeANY) + in, _, err := c.Exchange(msg, server) + if err != nil { + return nil, err + } + + // Handle any IPs we get back that we can attempt to join. + var ips []ipPort + for _, r := range in.Answer { + switch rr := r.(type) { + case (*dns.A): + ips = append(ips, ipPort{rr.A, defaultPort}) + case (*dns.AAAA): + ips = append(ips, ipPort{rr.AAAA, defaultPort}) + case (*dns.CNAME): + m.logger.Printf("[DEBUG] memberlist: Ignoring CNAME RR in TCP-first answer for '%s'", host) + } + } + return ips, nil + } + + return nil, nil +} + +// resolveAddr is used to resolve the address into an address, +// port, and error. If no port is given, use the default +func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) { + // Normalize the incoming string to host:port so we can apply Go's + // parser to it. + port := uint16(0) + if !hasPort(hostStr) { + hostStr += ":" + strconv.Itoa(m.config.BindPort) + } + host, sport, err := net.SplitHostPort(hostStr) + if err != nil { + return nil, err + } + + // This will capture the supplied port, or the default one added above. + lport, err := strconv.ParseUint(sport, 10, 16) + if err != nil { + return nil, err + } + port = uint16(lport) + + // If it looks like an IP address we are done. The SplitHostPort() above + // will make sure the host part is in good shape for parsing, even for + // IPv6 addresses. + if ip := net.ParseIP(host); ip != nil { + return []ipPort{ipPort{ip, port}}, nil + } + + // First try TCP so we have the best chance for the largest list of + // hosts to join. If this fails it's not fatal since this isn't a standard + // way to query DNS, and we have a fallback below. + ips, err := m.tcpLookupIP(host, port) + if err != nil { + m.logger.Printf("[DEBUG] memberlist: TCP-first lookup failed for '%s', falling back to UDP: %s", hostStr, err) + } + if len(ips) > 0 { + return ips, nil + } + + // If TCP didn't yield anything then use the normal Go resolver which + // will try UDP, then might possibly try TCP again if the UDP response + // indicates it was truncated. + ans, err := net.LookupIP(host) + if err != nil { + return nil, err + } + ips = make([]ipPort, 0, len(ans)) + for _, ip := range ans { + ips = append(ips, ipPort{ip, port}) + } + return ips, nil +} + +// setAlive is used to mark this node as being alive. This is the same +// as if we received an alive notification our own network channel for +// ourself. +func (m *Memberlist) setAlive() error { + var advertiseAddr net.IP + var advertisePort int + if m.config.AdvertiseAddr != "" { + // If AdvertiseAddr is not empty, then advertise + // the given address and port. + ip := net.ParseIP(m.config.AdvertiseAddr) + if ip == nil { + return fmt.Errorf("Failed to parse advertise address!") + } + + // Ensure IPv4 conversion if necessary + if ip4 := ip.To4(); ip4 != nil { + ip = ip4 + } + + advertiseAddr = ip + advertisePort = m.config.AdvertisePort + } else { + if m.config.BindAddr == "0.0.0.0" { + // Otherwise, if we're not bound to a specific IP, let's use a suitable + // private IP address. + var err error + m.config.AdvertiseAddr, err = sockaddr.GetPrivateIP() + if err != nil { + return fmt.Errorf("Failed to get interface addresses: %v", err) + } + if m.config.AdvertiseAddr == "" { + return fmt.Errorf("No private IP address found, and explicit IP not provided") + } + + advertiseAddr = net.ParseIP(m.config.AdvertiseAddr) + if advertiseAddr == nil { + return fmt.Errorf("Failed to parse advertise address: %q", m.config.AdvertiseAddr) + } + } else { + // Use the IP that we're bound to. + addr := m.tcpListener.Addr().(*net.TCPAddr) + advertiseAddr = addr.IP + } + + // Use the port we are bound to. + advertisePort = m.tcpListener.Addr().(*net.TCPAddr).Port + } + + // Check if this is a public address without encryption + ipAddr, err := sockaddr.NewIPAddr(advertiseAddr.String()) + if err != nil { + return fmt.Errorf("Failed to parse interface addresses: %v", err) + } + + ifAddrs := []sockaddr.IfAddr{ + sockaddr.IfAddr{ + SockAddr: ipAddr, + }, + } + + _, publicIfs, err := sockaddr.IfByRFC("6890", ifAddrs) + if len(publicIfs) > 0 && !m.config.EncryptionEnabled() { + m.logger.Printf("[WARN] memberlist: Binding to public address without encryption!") + } + + // Get the node meta data + var meta []byte + if m.config.Delegate != nil { + meta = m.config.Delegate.NodeMeta(MetaMaxSize) + if len(meta) > MetaMaxSize { + panic("Node meta data provided is longer than the limit") + } + } + + a := alive{ + Incarnation: m.nextIncarnation(), + Node: m.config.Name, + Addr: advertiseAddr, + Port: uint16(advertisePort), + Meta: meta, + Vsn: []uint8{ + ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, + m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, + m.config.DelegateProtocolVersion, + }, + } + m.aliveNode(&a, nil, true) + + return nil +} + +// LocalNode is used to return the local Node +func (m *Memberlist) LocalNode() *Node { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + state := m.nodeMap[m.config.Name] + return &state.Node +} + +// UpdateNode is used to trigger re-advertising the local node. This is +// primarily used with a Delegate to support dynamic updates to the local +// meta data. This will block until the update message is successfully +// broadcasted to a member of the cluster, if any exist or until a specified +// timeout is reached. +func (m *Memberlist) UpdateNode(timeout time.Duration) error { + // Get the node meta data + var meta []byte + if m.config.Delegate != nil { + meta = m.config.Delegate.NodeMeta(MetaMaxSize) + if len(meta) > MetaMaxSize { + panic("Node meta data provided is longer than the limit") + } + } + + // Get the existing node + m.nodeLock.RLock() + state := m.nodeMap[m.config.Name] + m.nodeLock.RUnlock() + + // Format a new alive message + a := alive{ + Incarnation: m.nextIncarnation(), + Node: m.config.Name, + Addr: state.Addr, + Port: state.Port, + Meta: meta, + Vsn: []uint8{ + ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, + m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, + m.config.DelegateProtocolVersion, + }, + } + notifyCh := make(chan struct{}) + m.aliveNode(&a, notifyCh, true) + + // Wait for the broadcast or a timeout + if m.anyAlive() { + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + select { + case <-notifyCh: + case <-timeoutCh: + return fmt.Errorf("timeout waiting for update broadcast") + } + } + return nil +} + +// SendTo is used to directly send a message to another node, without +// the use of the gossip mechanism. This will encode the message as a +// user-data message, which a delegate will receive through NotifyMsg +// The actual data is transmitted over UDP, which means this is a +// best-effort transmission mechanism, and the maximum size of the +// message is the size of a single UDP datagram, after compression. +// This method is DEPRECATED in favor or SendToUDP +func (m *Memberlist) SendTo(to net.Addr, msg []byte) error { + // Encode as a user message + buf := make([]byte, 1, len(msg)+1) + buf[0] = byte(userMsg) + buf = append(buf, msg...) + + // Send the message + return m.rawSendMsgUDP(to, nil, buf) +} + +// SendToUDP is used to directly send a message to another node, without +// the use of the gossip mechanism. This will encode the message as a +// user-data message, which a delegate will receive through NotifyMsg +// The actual data is transmitted over UDP, which means this is a +// best-effort transmission mechanism, and the maximum size of the +// message is the size of a single UDP datagram, after compression +func (m *Memberlist) SendToUDP(to *Node, msg []byte) error { + // Encode as a user message + buf := make([]byte, 1, len(msg)+1) + buf[0] = byte(userMsg) + buf = append(buf, msg...) + + // Send the message + destAddr := &net.UDPAddr{IP: to.Addr, Port: int(to.Port)} + return m.rawSendMsgUDP(destAddr, to, buf) +} + +// SendToTCP is used to directly send a message to another node, without +// the use of the gossip mechanism. This will encode the message as a +// user-data message, which a delegate will receive through NotifyMsg +// The actual data is transmitted over TCP, which means delivery +// is guaranteed if no error is returned. There is no limit +// to the size of the message +func (m *Memberlist) SendToTCP(to *Node, msg []byte) error { + // Send the message + destAddr := &net.TCPAddr{IP: to.Addr, Port: int(to.Port)} + return m.sendTCPUserMsg(destAddr, msg) +} + +// Members returns a list of all known live nodes. The node structures +// returned must not be modified. If you wish to modify a Node, make a +// copy first. +func (m *Memberlist) Members() []*Node { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + nodes := make([]*Node, 0, len(m.nodes)) + for _, n := range m.nodes { + if n.State != stateDead { + nodes = append(nodes, &n.Node) + } + } + + return nodes +} + +// NumMembers returns the number of alive nodes currently known. Between +// the time of calling this and calling Members, the number of alive nodes +// may have changed, so this shouldn't be used to determine how many +// members will be returned by Members. +func (m *Memberlist) NumMembers() (alive int) { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + for _, n := range m.nodes { + if n.State != stateDead { + alive++ + } + } + + return +} + +// Leave will broadcast a leave message but will not shutdown the background +// listeners, meaning the node will continue participating in gossip and state +// updates. +// +// This will block until the leave message is successfully broadcasted to +// a member of the cluster, if any exist or until a specified timeout +// is reached. +// +// This method is safe to call multiple times, but must not be called +// after the cluster is already shut down. +func (m *Memberlist) Leave(timeout time.Duration) error { + m.nodeLock.Lock() + // We can't defer m.nodeLock.Unlock() because m.deadNode will also try to + // acquire a lock so we need to Unlock before that. + + if m.shutdown { + m.nodeLock.Unlock() + panic("leave after shutdown") + } + + if !m.leave { + m.leave = true + + state, ok := m.nodeMap[m.config.Name] + m.nodeLock.Unlock() + if !ok { + m.logger.Printf("[WARN] memberlist: Leave but we're not in the node map.") + return nil + } + + d := dead{ + Incarnation: state.Incarnation, + Node: state.Name, + } + m.deadNode(&d) + + // Block until the broadcast goes out + if m.anyAlive() { + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + select { + case <-m.leaveBroadcast: + case <-timeoutCh: + return fmt.Errorf("timeout waiting for leave broadcast") + } + } + } else { + m.nodeLock.Unlock() + } + + return nil +} + +// Check for any other alive node. +func (m *Memberlist) anyAlive() bool { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + for _, n := range m.nodes { + if n.State != stateDead && n.Name != m.config.Name { + return true + } + } + return false +} + +// GetHealthScore gives this instance's idea of how well it is meeting the soft +// real-time requirements of the protocol. Lower numbers are better, and zero +// means "totally healthy". +func (m *Memberlist) GetHealthScore() int { + return m.awareness.GetHealthScore() +} + +// ProtocolVersion returns the protocol version currently in use by +// this memberlist. +func (m *Memberlist) ProtocolVersion() uint8 { + // NOTE: This method exists so that in the future we can control + // any locking if necessary, if we change the protocol version at + // runtime, etc. + return m.config.ProtocolVersion +} + +// Shutdown will stop any background maintanence of network activity +// for this memberlist, causing it to appear "dead". A leave message +// will not be broadcasted prior, so the cluster being left will have +// to detect this node's shutdown using probing. If you wish to more +// gracefully exit the cluster, call Leave prior to shutting down. +// +// This method is safe to call multiple times. +func (m *Memberlist) Shutdown() error { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + + if m.shutdown { + return nil + } + + m.shutdown = true + close(m.shutdownCh) + m.deschedule() + m.udpListener.Close() + m.tcpListener.Close() + return nil +} diff --git a/vendor/github.com/hashicorp/memberlist/merge_delegate.go b/vendor/github.com/hashicorp/memberlist/merge_delegate.go new file mode 100644 index 000000000..89afb59f2 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/merge_delegate.go @@ -0,0 +1,14 @@ +package memberlist + +// MergeDelegate is used to involve a client in +// a potential cluster merge operation. Namely, when +// a node does a TCP push/pull (as part of a join), +// the delegate is involved and allowed to cancel the join +// based on custom logic. The merge delegate is NOT invoked +// as part of the push-pull anti-entropy. +type MergeDelegate interface { + // NotifyMerge is invoked when a merge could take place. + // Provides a list of the nodes known by the peer. If + // the return value is non-nil, the merge is canceled. + NotifyMerge(peers []*Node) error +} diff --git a/vendor/github.com/hashicorp/memberlist/net.go b/vendor/github.com/hashicorp/memberlist/net.go new file mode 100644 index 000000000..e47da411e --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/net.go @@ -0,0 +1,1123 @@ +package memberlist + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "net" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-msgpack/codec" +) + +// This is the minimum and maximum protocol version that we can +// _understand_. We're allowed to speak at any version within this +// range. This range is inclusive. +const ( + ProtocolVersionMin uint8 = 1 + + // Version 3 added support for TCP pings but we kept the default + // protocol version at 2 to ease transition to this new feature. + // A memberlist speaking version 2 of the protocol will attempt + // to TCP ping another memberlist who understands version 3 or + // greater. + // + // Version 4 added support for nacks as part of indirect probes. + // A memberlist speaking version 2 of the protocol will expect + // nacks from another memberlist who understands version 4 or + // greater, and likewise nacks will be sent to memberlists who + // understand version 4 or greater. + ProtocolVersion2Compatible = 2 + + ProtocolVersionMax = 5 +) + +// messageType is an integer ID of a type of message that can be received +// on network channels from other members. +type messageType uint8 + +// The list of available message types. +const ( + pingMsg messageType = iota + indirectPingMsg + ackRespMsg + suspectMsg + aliveMsg + deadMsg + pushPullMsg + compoundMsg + userMsg // User mesg, not handled by us + compressMsg + encryptMsg + nackRespMsg + hasCrcMsg +) + +// compressionType is used to specify the compression algorithm +type compressionType uint8 + +const ( + lzwAlgo compressionType = iota +) + +const ( + MetaMaxSize = 512 // Maximum size for node meta data + compoundHeaderOverhead = 2 // Assumed header overhead + compoundOverhead = 2 // Assumed overhead per entry in compoundHeader + udpBufSize = 65536 + udpRecvBuf = 2 * 1024 * 1024 + userMsgOverhead = 1 + blockingWarning = 10 * time.Millisecond // Warn if a UDP packet takes this long to process + maxPushStateBytes = 10 * 1024 * 1024 +) + +// ping request sent directly to node +type ping struct { + SeqNo uint32 + + // Node is sent so the target can verify they are + // the intended recipient. This is to protect again an agent + // restart with a new name. + Node string +} + +// indirect ping sent to an indirect ndoe +type indirectPingReq struct { + SeqNo uint32 + Target []byte + Port uint16 + Node string + Nack bool // true if we'd like a nack back +} + +// ack response is sent for a ping +type ackResp struct { + SeqNo uint32 + Payload []byte +} + +// nack response is sent for an indirect ping when the pinger doesn't hear from +// the ping-ee within the configured timeout. This lets the original node know +// that the indirect ping attempt happened but didn't succeed. +type nackResp struct { + SeqNo uint32 +} + +// suspect is broadcast when we suspect a node is dead +type suspect struct { + Incarnation uint32 + Node string + From string // Include who is suspecting +} + +// alive is broadcast when we know a node is alive. +// Overloaded for nodes joining +type alive struct { + Incarnation uint32 + Node string + Addr []byte + Port uint16 + Meta []byte + + // The versions of the protocol/delegate that are being spoken, order: + // pmin, pmax, pcur, dmin, dmax, dcur + Vsn []uint8 +} + +// dead is broadcast when we confirm a node is dead +// Overloaded for nodes leaving +type dead struct { + Incarnation uint32 + Node string + From string // Include who is suspecting +} + +// pushPullHeader is used to inform the +// otherside how many states we are transferring +type pushPullHeader struct { + Nodes int + UserStateLen int // Encodes the byte lengh of user state + Join bool // Is this a join request or a anti-entropy run +} + +// userMsgHeader is used to encapsulate a userMsg +type userMsgHeader struct { + UserMsgLen int // Encodes the byte lengh of user state +} + +// pushNodeState is used for pushPullReq when we are +// transferring out node states +type pushNodeState struct { + Name string + Addr []byte + Port uint16 + Meta []byte + Incarnation uint32 + State nodeStateType + Vsn []uint8 // Protocol versions +} + +// compress is used to wrap an underlying payload +// using a specified compression algorithm +type compress struct { + Algo compressionType + Buf []byte +} + +// msgHandoff is used to transfer a message between goroutines +type msgHandoff struct { + msgType messageType + buf []byte + from net.Addr +} + +// encryptionVersion returns the encryption version to use +func (m *Memberlist) encryptionVersion() encryptionVersion { + switch m.ProtocolVersion() { + case 1: + return 0 + default: + return 1 + } +} + +// setUDPRecvBuf is used to resize the UDP receive window. The function +// attempts to set the read buffer to `udpRecvBuf` but backs off until +// the read buffer can be set. +func setUDPRecvBuf(c *net.UDPConn) { + size := udpRecvBuf + for { + if err := c.SetReadBuffer(size); err == nil { + break + } + size = size / 2 + } +} + +// tcpListen listens for and handles incoming connections +func (m *Memberlist) tcpListen() { + for { + conn, err := m.tcpListener.AcceptTCP() + if err != nil { + if m.shutdown { + break + } + m.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %s", err) + continue + } + go m.handleConn(conn) + } +} + +// handleConn handles a single incoming TCP connection +func (m *Memberlist) handleConn(conn *net.TCPConn) { + m.logger.Printf("[DEBUG] memberlist: TCP connection %s", LogConn(conn)) + + defer conn.Close() + metrics.IncrCounter([]string{"memberlist", "tcp", "accept"}, 1) + + conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) + msgType, bufConn, dec, err := m.readTCP(conn) + if err != nil { + if err != io.EOF { + m.logger.Printf("[ERR] memberlist: failed to receive: %s %s", err, LogConn(conn)) + } + return + } + + switch msgType { + case userMsg: + if err := m.readUserMsg(bufConn, dec); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s %s", err, LogConn(conn)) + } + case pushPullMsg: + join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn)) + return + } + + if err := m.sendLocalState(conn, join); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to push local state: %s %s", err, LogConn(conn)) + return + } + + if err := m.mergeRemoteState(join, remoteNodes, userState); err != nil { + m.logger.Printf("[ERR] memberlist: Failed push/pull merge: %s %s", err, LogConn(conn)) + return + } + case pingMsg: + var p ping + if err := dec.Decode(&p); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode TCP ping: %s %s", err, LogConn(conn)) + return + } + + if p.Node != "" && p.Node != m.config.Name { + m.logger.Printf("[WARN] memberlist: Got ping for unexpected node %s %s", p.Node, LogConn(conn)) + return + } + + ack := ackResp{p.SeqNo, nil} + out, err := encode(ackRespMsg, &ack) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode TCP ack: %s", err) + return + } + + err = m.rawSendMsgTCP(conn, out.Bytes()) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send TCP ack: %s %s", err, LogConn(conn)) + return + } + default: + m.logger.Printf("[ERR] memberlist: Received invalid msgType (%d) %s", msgType, LogConn(conn)) + } +} + +// udpListen listens for and handles incoming UDP packets +func (m *Memberlist) udpListen() { + var n int + var addr net.Addr + var err error + var lastPacket time.Time + for { + // Do a check for potentially blocking operations + if !lastPacket.IsZero() && time.Now().Sub(lastPacket) > blockingWarning { + diff := time.Now().Sub(lastPacket) + m.logger.Printf( + "[DEBUG] memberlist: Potential blocking operation. Last command took %v", + diff) + } + + // Create a new buffer + // TODO: Use Sync.Pool eventually + buf := make([]byte, udpBufSize) + + // Read a packet + n, addr, err = m.udpListener.ReadFrom(buf) + if err != nil { + if m.shutdown { + break + } + m.logger.Printf("[ERR] memberlist: Error reading UDP packet: %s", err) + continue + } + + // Capture the reception time of the packet as close to the + // system calls as possible. + lastPacket = time.Now() + + // Check the length + if n < 1 { + m.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s", + len(buf), LogAddress(addr)) + continue + } + + // Ingest this packet + metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n)) + m.ingestPacket(buf[:n], addr, lastPacket) + } +} + +func (m *Memberlist) ingestPacket(buf []byte, from net.Addr, timestamp time.Time) { + // Check if encryption is enabled + if m.config.EncryptionEnabled() { + // Decrypt the payload + plain, err := decryptPayload(m.config.Keyring.GetKeys(), buf, nil) + if err != nil { + m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from)) + return + } + + // Continue processing the plaintext buffer + buf = plain + } + + // See if there's a checksum included to verify the contents of the message + if len(buf) >= 5 && messageType(buf[0]) == hasCrcMsg { + crc := crc32.ChecksumIEEE(buf[5:]) + expected := binary.BigEndian.Uint32(buf[1:5]) + if crc != expected { + m.logger.Printf("[WARN] memberlist: Got invalid checksum for UDP packet: %x, %x", crc, expected) + return + } + m.handleCommand(buf[5:], from, timestamp) + } else { + m.handleCommand(buf, from, timestamp) + } +} + +func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Time) { + // Decode the message type + msgType := messageType(buf[0]) + buf = buf[1:] + + // Switch on the msgType + switch msgType { + case compoundMsg: + m.handleCompound(buf, from, timestamp) + case compressMsg: + m.handleCompressed(buf, from, timestamp) + + case pingMsg: + m.handlePing(buf, from) + case indirectPingMsg: + m.handleIndirectPing(buf, from) + case ackRespMsg: + m.handleAck(buf, from, timestamp) + case nackRespMsg: + m.handleNack(buf, from) + + case suspectMsg: + fallthrough + case aliveMsg: + fallthrough + case deadMsg: + fallthrough + case userMsg: + select { + case m.handoff <- msgHandoff{msgType, buf, from}: + default: + m.logger.Printf("[WARN] memberlist: UDP handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) + } + + default: + m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported %s", msgType, LogAddress(from)) + } +} + +// udpHandler processes messages received over UDP, but is decoupled +// from the listener to avoid blocking the listener which may cause +// ping/ack messages to be delayed. +func (m *Memberlist) udpHandler() { + for { + select { + case msg := <-m.handoff: + msgType := msg.msgType + buf := msg.buf + from := msg.from + + switch msgType { + case suspectMsg: + m.handleSuspect(buf, from) + case aliveMsg: + m.handleAlive(buf, from) + case deadMsg: + m.handleDead(buf, from) + case userMsg: + m.handleUser(buf, from) + default: + m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported %s (handler)", msgType, LogAddress(from)) + } + + case <-m.shutdownCh: + return + } + } +} + +func (m *Memberlist) handleCompound(buf []byte, from net.Addr, timestamp time.Time) { + // Decode the parts + trunc, parts, err := decodeCompoundMessage(buf) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode compound request: %s %s", err, LogAddress(from)) + return + } + + // Log any truncation + if trunc > 0 { + m.logger.Printf("[WARN] memberlist: Compound request had %d truncated messages %s", trunc, LogAddress(from)) + } + + // Handle each message + for _, part := range parts { + m.handleCommand(part, from, timestamp) + } +} + +func (m *Memberlist) handlePing(buf []byte, from net.Addr) { + var p ping + if err := decode(buf, &p); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode ping request: %s %s", err, LogAddress(from)) + return + } + // If node is provided, verify that it is for us + if p.Node != "" && p.Node != m.config.Name { + m.logger.Printf("[WARN] memberlist: Got ping for unexpected node '%s' %s", p.Node, LogAddress(from)) + return + } + var ack ackResp + ack.SeqNo = p.SeqNo + if m.config.Ping != nil { + ack.Payload = m.config.Ping.AckPayload() + } + if err := m.encodeAndSendMsg(from, ackRespMsg, &ack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogAddress(from)) + } +} + +func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { + var ind indirectPingReq + if err := decode(buf, &ind); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode indirect ping request: %s %s", err, LogAddress(from)) + return + } + + // For proto versions < 2, there is no port provided. Mask old + // behavior by using the configured port. + if m.ProtocolVersion() < 2 || ind.Port == 0 { + ind.Port = uint16(m.config.BindPort) + } + + // Send a ping to the correct host. + localSeqNo := m.nextSeqNo() + ping := ping{SeqNo: localSeqNo, Node: ind.Node} + destAddr := &net.UDPAddr{IP: ind.Target, Port: int(ind.Port)} + + // Setup a response handler to relay the ack + cancelCh := make(chan struct{}) + respHandler := func(payload []byte, timestamp time.Time) { + // Try to prevent the nack if we've caught it in time. + close(cancelCh) + + // Forward the ack back to the requestor. + ack := ackResp{ind.SeqNo, nil} + if err := m.encodeAndSendMsg(from, ackRespMsg, &ack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s %s", err, LogAddress(from)) + } + } + m.setAckHandler(localSeqNo, respHandler, m.config.ProbeTimeout) + + // Send the ping. + if err := m.encodeAndSendMsg(destAddr, pingMsg, &ping); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send ping: %s %s", err, LogAddress(from)) + } + + // Setup a timer to fire off a nack if no ack is seen in time. + if ind.Nack { + go func() { + select { + case <-cancelCh: + return + case <-time.After(m.config.ProbeTimeout): + nack := nackResp{ind.SeqNo} + if err := m.encodeAndSendMsg(from, nackRespMsg, &nack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send nack: %s %s", err, LogAddress(from)) + } + } + }() + } +} + +func (m *Memberlist) handleAck(buf []byte, from net.Addr, timestamp time.Time) { + var ack ackResp + if err := decode(buf, &ack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode ack response: %s %s", err, LogAddress(from)) + return + } + m.invokeAckHandler(ack, timestamp) +} + +func (m *Memberlist) handleNack(buf []byte, from net.Addr) { + var nack nackResp + if err := decode(buf, &nack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode nack response: %s %s", err, LogAddress(from)) + return + } + m.invokeNackHandler(nack) +} + +func (m *Memberlist) handleSuspect(buf []byte, from net.Addr) { + var sus suspect + if err := decode(buf, &sus); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode suspect message: %s %s", err, LogAddress(from)) + return + } + m.suspectNode(&sus) +} + +func (m *Memberlist) handleAlive(buf []byte, from net.Addr) { + var live alive + if err := decode(buf, &live); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode alive message: %s %s", err, LogAddress(from)) + return + } + + // For proto versions < 2, there is no port provided. Mask old + // behavior by using the configured port + if m.ProtocolVersion() < 2 || live.Port == 0 { + live.Port = uint16(m.config.BindPort) + } + + m.aliveNode(&live, nil, false) +} + +func (m *Memberlist) handleDead(buf []byte, from net.Addr) { + var d dead + if err := decode(buf, &d); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode dead message: %s %s", err, LogAddress(from)) + return + } + m.deadNode(&d) +} + +// handleUser is used to notify channels of incoming user data +func (m *Memberlist) handleUser(buf []byte, from net.Addr) { + d := m.config.Delegate + if d != nil { + d.NotifyMsg(buf) + } +} + +// handleCompressed is used to unpack a compressed message +func (m *Memberlist) handleCompressed(buf []byte, from net.Addr, timestamp time.Time) { + // Try to decode the payload + payload, err := decompressPayload(buf) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decompress payload: %v %s", err, LogAddress(from)) + return + } + + // Recursively handle the payload + m.handleCommand(payload, from, timestamp) +} + +// encodeAndSendMsg is used to combine the encoding and sending steps +func (m *Memberlist) encodeAndSendMsg(to net.Addr, msgType messageType, msg interface{}) error { + out, err := encode(msgType, msg) + if err != nil { + return err + } + if err := m.sendMsg(to, out.Bytes()); err != nil { + return err + } + return nil +} + +// sendMsg is used to send a UDP message to another host. It will opportunistically +// create a compoundMsg and piggy back other broadcasts +func (m *Memberlist) sendMsg(to net.Addr, msg []byte) error { + // Check if we can piggy back any messages + bytesAvail := m.config.UDPBufferSize - len(msg) - compoundHeaderOverhead + if m.config.EncryptionEnabled() { + bytesAvail -= encryptOverhead(m.encryptionVersion()) + } + extra := m.getBroadcasts(compoundOverhead, bytesAvail) + + // Fast path if nothing to piggypack + if len(extra) == 0 { + return m.rawSendMsgUDP(to, nil, msg) + } + + // Join all the messages + msgs := make([][]byte, 0, 1+len(extra)) + msgs = append(msgs, msg) + msgs = append(msgs, extra...) + + // Create a compound message + compound := makeCompoundMessage(msgs) + + // Send the message + return m.rawSendMsgUDP(to, nil, compound.Bytes()) +} + +// rawSendMsgUDP is used to send a UDP message to another host without modification +func (m *Memberlist) rawSendMsgUDP(addr net.Addr, node *Node, msg []byte) error { + // Check if we have compression enabled + if m.config.EnableCompression { + buf, err := compressPayload(msg) + if err != nil { + m.logger.Printf("[WARN] memberlist: Failed to compress payload: %v", err) + } else { + // Only use compression if it reduced the size + if buf.Len() < len(msg) { + msg = buf.Bytes() + } + } + } + + // Try to look up the destination node + if node == nil { + toAddr, _, err := net.SplitHostPort(addr.String()) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to parse address %q: %v", addr.String(), err) + return err + } + m.nodeLock.RLock() + nodeState, ok := m.nodeMap[toAddr] + m.nodeLock.RUnlock() + if ok { + node = &nodeState.Node + } + } + + // Add a CRC to the end of the payload if the recipient understands + // ProtocolVersion >= 5 + if node != nil && node.PMax >= 5 { + crc := crc32.ChecksumIEEE(msg) + header := make([]byte, 5, 5+len(msg)) + header[0] = byte(hasCrcMsg) + binary.BigEndian.PutUint32(header[1:], crc) + msg = append(header, msg...) + } + + // Check if we have encryption enabled + if m.config.EncryptionEnabled() { + // Encrypt the payload + var buf bytes.Buffer + primaryKey := m.config.Keyring.GetPrimaryKey() + err := encryptPayload(m.encryptionVersion(), primaryKey, msg, nil, &buf) + if err != nil { + m.logger.Printf("[ERR] memberlist: Encryption of message failed: %v", err) + return err + } + msg = buf.Bytes() + } + + metrics.IncrCounter([]string{"memberlist", "udp", "sent"}, float32(len(msg))) + _, err := m.udpListener.WriteTo(msg, addr) + return err +} + +// rawSendMsgTCP is used to send a TCP message to another host without modification +func (m *Memberlist) rawSendMsgTCP(conn net.Conn, sendBuf []byte) error { + // Check if compresion is enabled + if m.config.EnableCompression { + compBuf, err := compressPayload(sendBuf) + if err != nil { + m.logger.Printf("[ERROR] memberlist: Failed to compress payload: %v", err) + } else { + sendBuf = compBuf.Bytes() + } + } + + // Check if encryption is enabled + if m.config.EncryptionEnabled() { + crypt, err := m.encryptLocalState(sendBuf) + if err != nil { + m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err) + return err + } + sendBuf = crypt + } + + // Write out the entire send buffer + metrics.IncrCounter([]string{"memberlist", "tcp", "sent"}, float32(len(sendBuf))) + + if n, err := conn.Write(sendBuf); err != nil { + return err + } else if n != len(sendBuf) { + return fmt.Errorf("only %d of %d bytes written", n, len(sendBuf)) + } + + return nil +} + +// sendTCPUserMsg is used to send a TCP userMsg to another host +func (m *Memberlist) sendTCPUserMsg(to net.Addr, sendBuf []byte) error { + dialer := net.Dialer{Timeout: m.config.TCPTimeout} + conn, err := dialer.Dial("tcp", to.String()) + if err != nil { + return err + } + defer conn.Close() + + bufConn := bytes.NewBuffer(nil) + + if err := bufConn.WriteByte(byte(userMsg)); err != nil { + return err + } + + // Send our node state + header := userMsgHeader{UserMsgLen: len(sendBuf)} + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(bufConn, &hd) + + if err := enc.Encode(&header); err != nil { + return err + } + + if _, err := bufConn.Write(sendBuf); err != nil { + return err + } + + return m.rawSendMsgTCP(conn, bufConn.Bytes()) +} + +// sendAndReceiveState is used to initiate a push/pull over TCP with a remote node +func (m *Memberlist) sendAndReceiveState(addr []byte, port uint16, join bool) ([]pushNodeState, []byte, error) { + // Attempt to connect + dialer := net.Dialer{Timeout: m.config.TCPTimeout} + dest := net.TCPAddr{IP: addr, Port: int(port)} + conn, err := dialer.Dial("tcp", dest.String()) + if err != nil { + return nil, nil, err + } + defer conn.Close() + m.logger.Printf("[DEBUG] memberlist: Initiating push/pull sync with: %s", conn.RemoteAddr()) + metrics.IncrCounter([]string{"memberlist", "tcp", "connect"}, 1) + + // Send our state + if err := m.sendLocalState(conn, join); err != nil { + return nil, nil, err + } + + conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) + msgType, bufConn, dec, err := m.readTCP(conn) + if err != nil { + return nil, nil, err + } + + // Quit if not push/pull + if msgType != pushPullMsg { + err := fmt.Errorf("received invalid msgType (%d), expected pushPullMsg (%d) %s", msgType, pushPullMsg, LogConn(conn)) + return nil, nil, err + } + + // Read remote state + _, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) + return remoteNodes, userState, err +} + +// sendLocalState is invoked to send our local state over a tcp connection +func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error { + // Setup a deadline + conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) + + // Prepare the local node state + m.nodeLock.RLock() + localNodes := make([]pushNodeState, len(m.nodes)) + for idx, n := range m.nodes { + localNodes[idx].Name = n.Name + localNodes[idx].Addr = n.Addr + localNodes[idx].Port = n.Port + localNodes[idx].Incarnation = n.Incarnation + localNodes[idx].State = n.State + localNodes[idx].Meta = n.Meta + localNodes[idx].Vsn = []uint8{ + n.PMin, n.PMax, n.PCur, + n.DMin, n.DMax, n.DCur, + } + } + m.nodeLock.RUnlock() + + // Get the delegate state + var userData []byte + if m.config.Delegate != nil { + userData = m.config.Delegate.LocalState(join) + } + + // Create a bytes buffer writer + bufConn := bytes.NewBuffer(nil) + + // Send our node state + header := pushPullHeader{Nodes: len(localNodes), UserStateLen: len(userData), Join: join} + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(bufConn, &hd) + + // Begin state push + if _, err := bufConn.Write([]byte{byte(pushPullMsg)}); err != nil { + return err + } + + if err := enc.Encode(&header); err != nil { + return err + } + for i := 0; i < header.Nodes; i++ { + if err := enc.Encode(&localNodes[i]); err != nil { + return err + } + } + + // Write the user state as well + if userData != nil { + if _, err := bufConn.Write(userData); err != nil { + return err + } + } + + // Get the send buffer + return m.rawSendMsgTCP(conn, bufConn.Bytes()) +} + +// encryptLocalState is used to help encrypt local state before sending +func (m *Memberlist) encryptLocalState(sendBuf []byte) ([]byte, error) { + var buf bytes.Buffer + + // Write the encryptMsg byte + buf.WriteByte(byte(encryptMsg)) + + // Write the size of the message + sizeBuf := make([]byte, 4) + encVsn := m.encryptionVersion() + encLen := encryptedLength(encVsn, len(sendBuf)) + binary.BigEndian.PutUint32(sizeBuf, uint32(encLen)) + buf.Write(sizeBuf) + + // Write the encrypted cipher text to the buffer + key := m.config.Keyring.GetPrimaryKey() + err := encryptPayload(encVsn, key, sendBuf, buf.Bytes()[:5], &buf) + if err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// decryptRemoteState is used to help decrypt the remote state +func (m *Memberlist) decryptRemoteState(bufConn io.Reader) ([]byte, error) { + // Read in enough to determine message length + cipherText := bytes.NewBuffer(nil) + cipherText.WriteByte(byte(encryptMsg)) + _, err := io.CopyN(cipherText, bufConn, 4) + if err != nil { + return nil, err + } + + // Ensure we aren't asked to download too much. This is to guard against + // an attack vector where a huge amount of state is sent + moreBytes := binary.BigEndian.Uint32(cipherText.Bytes()[1:5]) + if moreBytes > maxPushStateBytes { + return nil, fmt.Errorf("Remote node state is larger than limit (%d)", moreBytes) + } + + // Read in the rest of the payload + _, err = io.CopyN(cipherText, bufConn, int64(moreBytes)) + if err != nil { + return nil, err + } + + // Decrypt the cipherText + dataBytes := cipherText.Bytes()[:5] + cipherBytes := cipherText.Bytes()[5:] + + // Decrypt the payload + keys := m.config.Keyring.GetKeys() + return decryptPayload(keys, cipherBytes, dataBytes) +} + +// readTCP is used to read the start of a TCP stream. +// it decrypts and decompresses the stream if necessary +func (m *Memberlist) readTCP(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) { + // Created a buffered reader + var bufConn io.Reader = bufio.NewReader(conn) + + // Read the message type + buf := [1]byte{0} + if _, err := bufConn.Read(buf[:]); err != nil { + return 0, nil, nil, err + } + msgType := messageType(buf[0]) + + // Check if the message is encrypted + if msgType == encryptMsg { + if !m.config.EncryptionEnabled() { + return 0, nil, nil, + fmt.Errorf("Remote state is encrypted and encryption is not configured") + } + + plain, err := m.decryptRemoteState(bufConn) + if err != nil { + return 0, nil, nil, err + } + + // Reset message type and bufConn + msgType = messageType(plain[0]) + bufConn = bytes.NewReader(plain[1:]) + } else if m.config.EncryptionEnabled() { + return 0, nil, nil, + fmt.Errorf("Encryption is configured but remote state is not encrypted") + } + + // Get the msgPack decoders + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(bufConn, &hd) + + // Check if we have a compressed message + if msgType == compressMsg { + var c compress + if err := dec.Decode(&c); err != nil { + return 0, nil, nil, err + } + decomp, err := decompressBuffer(&c) + if err != nil { + return 0, nil, nil, err + } + + // Reset the message type + msgType = messageType(decomp[0]) + + // Create a new bufConn + bufConn = bytes.NewReader(decomp[1:]) + + // Create a new decoder + dec = codec.NewDecoder(bufConn, &hd) + } + + return msgType, bufConn, dec, nil +} + +// readRemoteState is used to read the remote state from a connection +func (m *Memberlist) readRemoteState(bufConn io.Reader, dec *codec.Decoder) (bool, []pushNodeState, []byte, error) { + // Read the push/pull header + var header pushPullHeader + if err := dec.Decode(&header); err != nil { + return false, nil, nil, err + } + + // Allocate space for the transfer + remoteNodes := make([]pushNodeState, header.Nodes) + + // Try to decode all the states + for i := 0; i < header.Nodes; i++ { + if err := dec.Decode(&remoteNodes[i]); err != nil { + return false, nil, nil, err + } + } + + // Read the remote user state into a buffer + var userBuf []byte + if header.UserStateLen > 0 { + userBuf = make([]byte, header.UserStateLen) + bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserStateLen) + if err == nil && bytes != header.UserStateLen { + err = fmt.Errorf( + "Failed to read full user state (%d / %d)", + bytes, header.UserStateLen) + } + if err != nil { + return false, nil, nil, err + } + } + + // For proto versions < 2, there is no port provided. Mask old + // behavior by using the configured port + for idx := range remoteNodes { + if m.ProtocolVersion() < 2 || remoteNodes[idx].Port == 0 { + remoteNodes[idx].Port = uint16(m.config.BindPort) + } + } + + return header.Join, remoteNodes, userBuf, nil +} + +// mergeRemoteState is used to merge the remote state with our local state +func (m *Memberlist) mergeRemoteState(join bool, remoteNodes []pushNodeState, userBuf []byte) error { + if err := m.verifyProtocol(remoteNodes); err != nil { + return err + } + + // Invoke the merge delegate if any + if join && m.config.Merge != nil { + nodes := make([]*Node, len(remoteNodes)) + for idx, n := range remoteNodes { + nodes[idx] = &Node{ + Name: n.Name, + Addr: n.Addr, + Port: n.Port, + Meta: n.Meta, + PMin: n.Vsn[0], + PMax: n.Vsn[1], + PCur: n.Vsn[2], + DMin: n.Vsn[3], + DMax: n.Vsn[4], + DCur: n.Vsn[5], + } + } + if err := m.config.Merge.NotifyMerge(nodes); err != nil { + return err + } + } + + // Merge the membership state + m.mergeState(remoteNodes) + + // Invoke the delegate for user state + if userBuf != nil && m.config.Delegate != nil { + m.config.Delegate.MergeRemoteState(userBuf, join) + } + return nil +} + +// readUserMsg is used to decode a userMsg from a TCP stream +func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error { + // Read the user message header + var header userMsgHeader + if err := dec.Decode(&header); err != nil { + return err + } + + // Read the user message into a buffer + var userBuf []byte + if header.UserMsgLen > 0 { + userBuf = make([]byte, header.UserMsgLen) + bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserMsgLen) + if err == nil && bytes != header.UserMsgLen { + err = fmt.Errorf( + "Failed to read full user message (%d / %d)", + bytes, header.UserMsgLen) + } + if err != nil { + return err + } + + d := m.config.Delegate + if d != nil { + d.NotifyMsg(userBuf) + } + } + + return nil +} + +// sendPingAndWaitForAck makes a TCP connection to the given address, sends +// a ping, and waits for an ack. All of this is done as a series of blocking +// operations, given the deadline. The bool return parameter is true if we +// we able to round trip a ping to the other node. +func (m *Memberlist) sendPingAndWaitForAck(destAddr net.Addr, ping ping, deadline time.Time) (bool, error) { + dialer := net.Dialer{Deadline: deadline} + conn, err := dialer.Dial("tcp", destAddr.String()) + if err != nil { + // If the node is actually dead we expect this to fail, so we + // shouldn't spam the logs with it. After this point, errors + // with the connection are real, unexpected errors and should + // get propagated up. + return false, nil + } + defer conn.Close() + conn.SetDeadline(deadline) + + out, err := encode(pingMsg, &ping) + if err != nil { + return false, err + } + + if err = m.rawSendMsgTCP(conn, out.Bytes()); err != nil { + return false, err + } + + msgType, _, dec, err := m.readTCP(conn) + if err != nil { + return false, err + } + + if msgType != ackRespMsg { + return false, fmt.Errorf("Unexpected msgType (%d) from TCP ping %s", msgType, LogConn(conn)) + } + + var ack ackResp + if err = dec.Decode(&ack); err != nil { + return false, err + } + + if ack.SeqNo != ping.SeqNo { + return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d) from TCP ping %s", ack.SeqNo, ping.SeqNo, LogConn(conn)) + } + + return true, nil +} diff --git a/vendor/github.com/hashicorp/memberlist/ping_delegate.go b/vendor/github.com/hashicorp/memberlist/ping_delegate.go new file mode 100644 index 000000000..1566c8b3d --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/ping_delegate.go @@ -0,0 +1,14 @@ +package memberlist + +import "time" + +// PingDelegate is used to notify an observer how long it took for a ping message to +// complete a round trip. It can also be used for writing arbitrary byte slices +// into ack messages. Note that in order to be meaningful for RTT estimates, this +// delegate does not apply to indirect pings, nor fallback pings sent over TCP. +type PingDelegate interface { + // AckPayload is invoked when an ack is being sent; the returned bytes will be appended to the ack + AckPayload() []byte + // NotifyPing is invoked when an ack for a ping is received + NotifyPingComplete(other *Node, rtt time.Duration, payload []byte) +} diff --git a/vendor/github.com/hashicorp/memberlist/queue.go b/vendor/github.com/hashicorp/memberlist/queue.go new file mode 100644 index 000000000..994b90ff1 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/queue.go @@ -0,0 +1,167 @@ +package memberlist + +import ( + "sort" + "sync" +) + +// TransmitLimitedQueue is used to queue messages to broadcast to +// the cluster (via gossip) but limits the number of transmits per +// message. It also prioritizes messages with lower transmit counts +// (hence newer messages). +type TransmitLimitedQueue struct { + // NumNodes returns the number of nodes in the cluster. This is + // used to determine the retransmit count, which is calculated + // based on the log of this. + NumNodes func() int + + // RetransmitMult is the multiplier used to determine the maximum + // number of retransmissions attempted. + RetransmitMult int + + sync.Mutex + bcQueue limitedBroadcasts +} + +type limitedBroadcast struct { + transmits int // Number of transmissions attempted. + b Broadcast +} +type limitedBroadcasts []*limitedBroadcast + +// Broadcast is something that can be broadcasted via gossip to +// the memberlist cluster. +type Broadcast interface { + // Invalidates checks if enqueuing the current broadcast + // invalidates a previous broadcast + Invalidates(b Broadcast) bool + + // Returns a byte form of the message + Message() []byte + + // Finished is invoked when the message will no longer + // be broadcast, either due to invalidation or to the + // transmit limit being reached + Finished() +} + +// QueueBroadcast is used to enqueue a broadcast +func (q *TransmitLimitedQueue) QueueBroadcast(b Broadcast) { + q.Lock() + defer q.Unlock() + + // Check if this message invalidates another + n := len(q.bcQueue) + for i := 0; i < n; i++ { + if b.Invalidates(q.bcQueue[i].b) { + q.bcQueue[i].b.Finished() + copy(q.bcQueue[i:], q.bcQueue[i+1:]) + q.bcQueue[n-1] = nil + q.bcQueue = q.bcQueue[:n-1] + n-- + } + } + + // Append to the queue + q.bcQueue = append(q.bcQueue, &limitedBroadcast{0, b}) +} + +// GetBroadcasts is used to get a number of broadcasts, up to a byte limit +// and applying a per-message overhead as provided. +func (q *TransmitLimitedQueue) GetBroadcasts(overhead, limit int) [][]byte { + q.Lock() + defer q.Unlock() + + // Fast path the default case + if len(q.bcQueue) == 0 { + return nil + } + + transmitLimit := retransmitLimit(q.RetransmitMult, q.NumNodes()) + bytesUsed := 0 + var toSend [][]byte + + for i := len(q.bcQueue) - 1; i >= 0; i-- { + // Check if this is within our limits + b := q.bcQueue[i] + msg := b.b.Message() + if bytesUsed+overhead+len(msg) > limit { + continue + } + + // Add to slice to send + bytesUsed += overhead + len(msg) + toSend = append(toSend, msg) + + // Check if we should stop transmission + b.transmits++ + if b.transmits >= transmitLimit { + b.b.Finished() + n := len(q.bcQueue) + q.bcQueue[i], q.bcQueue[n-1] = q.bcQueue[n-1], nil + q.bcQueue = q.bcQueue[:n-1] + } + } + + // If we are sending anything, we need to re-sort to deal + // with adjusted transmit counts + if len(toSend) > 0 { + q.bcQueue.Sort() + } + return toSend +} + +// NumQueued returns the number of queued messages +func (q *TransmitLimitedQueue) NumQueued() int { + q.Lock() + defer q.Unlock() + return len(q.bcQueue) +} + +// Reset clears all the queued messages +func (q *TransmitLimitedQueue) Reset() { + q.Lock() + defer q.Unlock() + for _, b := range q.bcQueue { + b.b.Finished() + } + q.bcQueue = nil +} + +// Prune will retain the maxRetain latest messages, and the rest +// will be discarded. This can be used to prevent unbounded queue sizes +func (q *TransmitLimitedQueue) Prune(maxRetain int) { + q.Lock() + defer q.Unlock() + + // Do nothing if queue size is less than the limit + n := len(q.bcQueue) + if n < maxRetain { + return + } + + // Invalidate the messages we will be removing + for i := 0; i < n-maxRetain; i++ { + q.bcQueue[i].b.Finished() + } + + // Move the messages, and retain only the last maxRetain + copy(q.bcQueue[0:], q.bcQueue[n-maxRetain:]) + q.bcQueue = q.bcQueue[:maxRetain] +} + +func (b limitedBroadcasts) Len() int { + return len(b) +} + +func (b limitedBroadcasts) Less(i, j int) bool { + return b[i].transmits < b[j].transmits +} + +func (b limitedBroadcasts) Swap(i, j int) { + b[i], b[j] = b[j], b[i] +} + +func (b limitedBroadcasts) Sort() { + sort.Sort(sort.Reverse(b)) +} diff --git a/vendor/github.com/hashicorp/memberlist/security.go b/vendor/github.com/hashicorp/memberlist/security.go new file mode 100644 index 000000000..d90114eb0 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/security.go @@ -0,0 +1,198 @@ +package memberlist + +import ( + "bytes" + "crypto/aes" + "crypto/cipher" + "crypto/rand" + "fmt" + "io" +) + +/* + +Encrypted messages are prefixed with an encryptionVersion byte +that is used for us to be able to properly encode/decode. We +currently support the following versions: + + 0 - AES-GCM 128, using PKCS7 padding + 1 - AES-GCM 128, no padding. Padding not needed, caused bloat. + +*/ +type encryptionVersion uint8 + +const ( + minEncryptionVersion encryptionVersion = 0 + maxEncryptionVersion encryptionVersion = 1 +) + +const ( + versionSize = 1 + nonceSize = 12 + tagSize = 16 + maxPadOverhead = 16 + blockSize = aes.BlockSize +) + +// pkcs7encode is used to pad a byte buffer to a specific block size using +// the PKCS7 algorithm. "Ignores" some bytes to compensate for IV +func pkcs7encode(buf *bytes.Buffer, ignore, blockSize int) { + n := buf.Len() - ignore + more := blockSize - (n % blockSize) + for i := 0; i < more; i++ { + buf.WriteByte(byte(more)) + } +} + +// pkcs7decode is used to decode a buffer that has been padded +func pkcs7decode(buf []byte, blockSize int) []byte { + if len(buf) == 0 { + panic("Cannot decode a PKCS7 buffer of zero length") + } + n := len(buf) + last := buf[n-1] + n -= int(last) + return buf[:n] +} + +// encryptOverhead returns the maximum possible overhead of encryption by version +func encryptOverhead(vsn encryptionVersion) int { + switch vsn { + case 0: + return 45 // Version: 1, IV: 12, Padding: 16, Tag: 16 + case 1: + return 29 // Version: 1, IV: 12, Tag: 16 + default: + panic("unsupported version") + } +} + +// encryptedLength is used to compute the buffer size needed +// for a message of given length +func encryptedLength(vsn encryptionVersion, inp int) int { + // If we are on version 1, there is no padding + if vsn >= 1 { + return versionSize + nonceSize + inp + tagSize + } + + // Determine the padding size + padding := blockSize - (inp % blockSize) + + // Sum the extra parts to get total size + return versionSize + nonceSize + inp + padding + tagSize +} + +// encryptPayload is used to encrypt a message with a given key. +// We make use of AES-128 in GCM mode. New byte buffer is the version, +// nonce, ciphertext and tag +func encryptPayload(vsn encryptionVersion, key []byte, msg []byte, data []byte, dst *bytes.Buffer) error { + // Get the AES block cipher + aesBlock, err := aes.NewCipher(key) + if err != nil { + return err + } + + // Get the GCM cipher mode + gcm, err := cipher.NewGCM(aesBlock) + if err != nil { + return err + } + + // Grow the buffer to make room for everything + offset := dst.Len() + dst.Grow(encryptedLength(vsn, len(msg))) + + // Write the encryption version + dst.WriteByte(byte(vsn)) + + // Add a random nonce + io.CopyN(dst, rand.Reader, nonceSize) + afterNonce := dst.Len() + + // Ensure we are correctly padded (only version 0) + if vsn == 0 { + io.Copy(dst, bytes.NewReader(msg)) + pkcs7encode(dst, offset+versionSize+nonceSize, aes.BlockSize) + } + + // Encrypt message using GCM + slice := dst.Bytes()[offset:] + nonce := slice[versionSize : versionSize+nonceSize] + + // Message source depends on the encryption version. + // Version 0 uses padding, version 1 does not + var src []byte + if vsn == 0 { + src = slice[versionSize+nonceSize:] + } else { + src = msg + } + out := gcm.Seal(nil, nonce, src, data) + + // Truncate the plaintext, and write the cipher text + dst.Truncate(afterNonce) + dst.Write(out) + return nil +} + +// decryptMessage performs the actual decryption of ciphertext. This is in its +// own function to allow it to be called on all keys easily. +func decryptMessage(key, msg []byte, data []byte) ([]byte, error) { + // Get the AES block cipher + aesBlock, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + + // Get the GCM cipher mode + gcm, err := cipher.NewGCM(aesBlock) + if err != nil { + return nil, err + } + + // Decrypt the message + nonce := msg[versionSize : versionSize+nonceSize] + ciphertext := msg[versionSize+nonceSize:] + plain, err := gcm.Open(nil, nonce, ciphertext, data) + if err != nil { + return nil, err + } + + // Success! + return plain, nil +} + +// decryptPayload is used to decrypt a message with a given key, +// and verify it's contents. Any padding will be removed, and a +// slice to the plaintext is returned. Decryption is done IN PLACE! +func decryptPayload(keys [][]byte, msg []byte, data []byte) ([]byte, error) { + // Ensure we have at least one byte + if len(msg) == 0 { + return nil, fmt.Errorf("Cannot decrypt empty payload") + } + + // Verify the version + vsn := encryptionVersion(msg[0]) + if vsn > maxEncryptionVersion { + return nil, fmt.Errorf("Unsupported encryption version %d", msg[0]) + } + + // Ensure the length is sane + if len(msg) < encryptedLength(vsn, 0) { + return nil, fmt.Errorf("Payload is too small to decrypt: %d", len(msg)) + } + + for _, key := range keys { + plain, err := decryptMessage(key, msg, data) + if err == nil { + // Remove the PKCS7 padding for vsn 0 + if vsn == 0 { + return pkcs7decode(plain, aes.BlockSize), nil + } else { + return plain, nil + } + } + } + + return nil, fmt.Errorf("No installed keys could decrypt the message") +} diff --git a/vendor/github.com/hashicorp/memberlist/state.go b/vendor/github.com/hashicorp/memberlist/state.go new file mode 100644 index 000000000..6b9122f08 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/state.go @@ -0,0 +1,1151 @@ +package memberlist + +import ( + "bytes" + "fmt" + "math" + "math/rand" + "net" + "sync/atomic" + "time" + + "github.com/armon/go-metrics" +) + +type nodeStateType int + +const ( + stateAlive nodeStateType = iota + stateSuspect + stateDead +) + +// Node represents a node in the cluster. +type Node struct { + Name string + Addr net.IP + Port uint16 + Meta []byte // Metadata from the delegate for this node. + PMin uint8 // Minimum protocol version this understands + PMax uint8 // Maximum protocol version this understands + PCur uint8 // Current version node is speaking + DMin uint8 // Min protocol version for the delegate to understand + DMax uint8 // Max protocol version for the delegate to understand + DCur uint8 // Current version delegate is speaking +} + +// NodeState is used to manage our state view of another node +type nodeState struct { + Node + Incarnation uint32 // Last known incarnation number + State nodeStateType // Current state + StateChange time.Time // Time last state change happened +} + +// ackHandler is used to register handlers for incoming acks and nacks. +type ackHandler struct { + ackFn func([]byte, time.Time) + nackFn func() + timer *time.Timer +} + +// NoPingResponseError is used to indicate a 'ping' packet was +// successfully issued but no response was received +type NoPingResponseError struct { + node string +} + +func (f NoPingResponseError) Error() string { + return fmt.Sprintf("No response from node %s", f.node) +} + +// Schedule is used to ensure the Tick is performed periodically. This +// function is safe to call multiple times. If the memberlist is already +// scheduled, then it won't do anything. +func (m *Memberlist) schedule() { + m.tickerLock.Lock() + defer m.tickerLock.Unlock() + + // If we already have tickers, then don't do anything, since we're + // scheduled + if len(m.tickers) > 0 { + return + } + + // Create the stop tick channel, a blocking channel. We close this + // when we should stop the tickers. + stopCh := make(chan struct{}) + + // Create a new probeTicker + if m.config.ProbeInterval > 0 { + t := time.NewTicker(m.config.ProbeInterval) + go m.triggerFunc(m.config.ProbeInterval, t.C, stopCh, m.probe) + m.tickers = append(m.tickers, t) + } + + // Create a push pull ticker if needed + if m.config.PushPullInterval > 0 { + go m.pushPullTrigger(stopCh) + } + + // Create a gossip ticker if needed + if m.config.GossipInterval > 0 && m.config.GossipNodes > 0 { + t := time.NewTicker(m.config.GossipInterval) + go m.triggerFunc(m.config.GossipInterval, t.C, stopCh, m.gossip) + m.tickers = append(m.tickers, t) + } + + // If we made any tickers, then record the stopTick channel for + // later. + if len(m.tickers) > 0 { + m.stopTick = stopCh + } +} + +// triggerFunc is used to trigger a function call each time a +// message is received until a stop tick arrives. +func (m *Memberlist) triggerFunc(stagger time.Duration, C <-chan time.Time, stop <-chan struct{}, f func()) { + // Use a random stagger to avoid syncronizing + randStagger := time.Duration(uint64(rand.Int63()) % uint64(stagger)) + select { + case <-time.After(randStagger): + case <-stop: + return + } + for { + select { + case <-C: + f() + case <-stop: + return + } + } +} + +// pushPullTrigger is used to periodically trigger a push/pull until +// a stop tick arrives. We don't use triggerFunc since the push/pull +// timer is dynamically scaled based on cluster size to avoid network +// saturation +func (m *Memberlist) pushPullTrigger(stop <-chan struct{}) { + interval := m.config.PushPullInterval + + // Use a random stagger to avoid syncronizing + randStagger := time.Duration(uint64(rand.Int63()) % uint64(interval)) + select { + case <-time.After(randStagger): + case <-stop: + return + } + + // Tick using a dynamic timer + for { + tickTime := pushPullScale(interval, m.estNumNodes()) + select { + case <-time.After(tickTime): + m.pushPull() + case <-stop: + return + } + } +} + +// Deschedule is used to stop the background maintenance. This is safe +// to call multiple times. +func (m *Memberlist) deschedule() { + m.tickerLock.Lock() + defer m.tickerLock.Unlock() + + // If we have no tickers, then we aren't scheduled. + if len(m.tickers) == 0 { + return + } + + // Close the stop channel so all the ticker listeners stop. + close(m.stopTick) + + // Explicitly stop all the tickers themselves so they don't take + // up any more resources, and get rid of the list. + for _, t := range m.tickers { + t.Stop() + } + m.tickers = nil +} + +// Tick is used to perform a single round of failure detection and gossip +func (m *Memberlist) probe() { + // Track the number of indexes we've considered probing + numCheck := 0 +START: + m.nodeLock.RLock() + + // Make sure we don't wrap around infinitely + if numCheck >= len(m.nodes) { + m.nodeLock.RUnlock() + return + } + + // Handle the wrap around case + if m.probeIndex >= len(m.nodes) { + m.nodeLock.RUnlock() + m.resetNodes() + m.probeIndex = 0 + numCheck++ + goto START + } + + // Determine if we should probe this node + skip := false + var node nodeState + + node = *m.nodes[m.probeIndex] + if node.Name == m.config.Name { + skip = true + } else if node.State == stateDead { + skip = true + } + + // Potentially skip + m.nodeLock.RUnlock() + m.probeIndex++ + if skip { + numCheck++ + goto START + } + + // Probe the specific node + m.probeNode(&node) +} + +// probeNode handles a single round of failure checking on a node. +func (m *Memberlist) probeNode(node *nodeState) { + defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now()) + + // We use our health awareness to scale the overall probe interval, so we + // slow down if we detect problems. The ticker that calls us can handle + // us running over the base interval, and will skip missed ticks. + probeInterval := m.awareness.ScaleTimeout(m.config.ProbeInterval) + if probeInterval > m.config.ProbeInterval { + metrics.IncrCounter([]string{"memberlist", "degraded", "probe"}, 1) + } + + // Prepare a ping message and setup an ack handler. + ping := ping{SeqNo: m.nextSeqNo(), Node: node.Name} + ackCh := make(chan ackMessage, m.config.IndirectChecks+1) + nackCh := make(chan struct{}, m.config.IndirectChecks+1) + m.setProbeChannels(ping.SeqNo, ackCh, nackCh, probeInterval) + + // Send a ping to the node. If this node looks like it's suspect or dead, + // also tack on a suspect message so that it has a chance to refute as + // soon as possible. + deadline := time.Now().Add(probeInterval) + destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)} + if node.State == stateAlive { + if err := m.encodeAndSendMsg(destAddr, pingMsg, &ping); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err) + return + } + } else { + var msgs [][]byte + if buf, err := encode(pingMsg, &ping); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode ping message: %s", err) + return + } else { + msgs = append(msgs, buf.Bytes()) + } + s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name} + if buf, err := encode(suspectMsg, &s); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode suspect message: %s", err) + return + } else { + msgs = append(msgs, buf.Bytes()) + } + + compound := makeCompoundMessage(msgs) + if err := m.rawSendMsgUDP(destAddr, &node.Node, compound.Bytes()); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s", destAddr, err) + return + } + } + + // Mark the sent time here, which should be after any pre-processing and + // system calls to do the actual send. This probably under-reports a bit, + // but it's the best we can do. + sent := time.Now() + + // Arrange for our self-awareness to get updated. At this point we've + // sent the ping, so any return statement means the probe succeeded + // which will improve our health until we get to the failure scenarios + // at the end of this function, which will alter this delta variable + // accordingly. + awarenessDelta := -1 + defer func() { + m.awareness.ApplyDelta(awarenessDelta) + }() + + // Wait for response or round-trip-time. + select { + case v := <-ackCh: + if v.Complete == true { + if m.config.Ping != nil { + rtt := v.Timestamp.Sub(sent) + m.config.Ping.NotifyPingComplete(&node.Node, rtt, v.Payload) + } + return + } + + // As an edge case, if we get a timeout, we need to re-enqueue it + // here to break out of the select below. + if v.Complete == false { + ackCh <- v + } + case <-time.After(m.config.ProbeTimeout): + // Note that we don't scale this timeout based on awareness and + // the health score. That's because we don't really expect waiting + // longer to help get UDP through. Since health does extend the + // probe interval it will give the TCP fallback more time, which + // is more active in dealing with lost packets, and it gives more + // time to wait for indirect acks/nacks. + m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node.Name) + } + + // Get some random live nodes. + m.nodeLock.RLock() + kNodes := kRandomNodes(m.config.IndirectChecks, m.nodes, func(n *nodeState) bool { + return n.Name == m.config.Name || + n.Name == node.Name || + n.State != stateAlive + }) + m.nodeLock.RUnlock() + + // Attempt an indirect ping. + expectedNacks := 0 + ind := indirectPingReq{SeqNo: ping.SeqNo, Target: node.Addr, Port: node.Port, Node: node.Name} + for _, peer := range kNodes { + // We only expect nack to be sent from peers who understand + // version 4 of the protocol. + if ind.Nack = peer.PMax >= 4; ind.Nack { + expectedNacks++ + } + + destAddr := &net.UDPAddr{IP: peer.Addr, Port: int(peer.Port)} + if err := m.encodeAndSendMsg(destAddr, indirectPingMsg, &ind); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send indirect ping: %s", err) + } + } + + // Also make an attempt to contact the node directly over TCP. This + // helps prevent confused clients who get isolated from UDP traffic + // but can still speak TCP (which also means they can possibly report + // misinformation to other nodes via anti-entropy), avoiding flapping in + // the cluster. + // + // This is a little unusual because we will attempt a TCP ping to any + // member who understands version 3 of the protocol, regardless of + // which protocol version we are speaking. That's why we've included a + // config option to turn this off if desired. + fallbackCh := make(chan bool, 1) + if (!m.config.DisableTcpPings) && (node.PMax >= 3) { + destAddr := &net.TCPAddr{IP: node.Addr, Port: int(node.Port)} + go func() { + defer close(fallbackCh) + didContact, err := m.sendPingAndWaitForAck(destAddr, ping, deadline) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed TCP fallback ping: %s", err) + } else { + fallbackCh <- didContact + } + }() + } else { + close(fallbackCh) + } + + // Wait for the acks or timeout. Note that we don't check the fallback + // channel here because we want to issue a warning below if that's the + // *only* way we hear back from the peer, so we have to let this time + // out first to allow the normal UDP-based acks to come in. + select { + case v := <-ackCh: + if v.Complete == true { + return + } + } + + // Finally, poll the fallback channel. The timeouts are set such that + // the channel will have something or be closed without having to wait + // any additional time here. + for didContact := range fallbackCh { + if didContact { + m.logger.Printf("[WARN] memberlist: Was able to reach %s via TCP but not UDP, network may be misconfigured and not allowing bidirectional UDP", node.Name) + return + } + } + + // Update our self-awareness based on the results of this failed probe. + // If we don't have peers who will send nacks then we penalize for any + // failed probe as a simple health metric. If we do have peers to nack + // verify, then we can use that as a more sophisticated measure of self- + // health because we assume them to be working, and they can help us + // decide if the probed node was really dead or if it was something wrong + // with ourselves. + awarenessDelta = 0 + if expectedNacks > 0 { + if nackCount := len(nackCh); nackCount < expectedNacks { + awarenessDelta += 2 * (expectedNacks - nackCount) + } + } else { + awarenessDelta += 1 + } + + // No acks received from target, suspect it as failed. + m.logger.Printf("[INFO] memberlist: Suspect %s has failed, no acks received", node.Name) + s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name} + m.suspectNode(&s) +} + +// Ping initiates a ping to the node with the specified name. +func (m *Memberlist) Ping(node string, addr net.Addr) (time.Duration, error) { + // Prepare a ping message and setup an ack handler. + ping := ping{SeqNo: m.nextSeqNo(), Node: node} + ackCh := make(chan ackMessage, m.config.IndirectChecks+1) + m.setProbeChannels(ping.SeqNo, ackCh, nil, m.config.ProbeInterval) + + // Send a ping to the node. + if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { + return 0, err + } + + // Mark the sent time here, which should be after any pre-processing and + // system calls to do the actual send. This probably under-reports a bit, + // but it's the best we can do. + sent := time.Now() + + // Wait for response or timeout. + select { + case v := <-ackCh: + if v.Complete == true { + return v.Timestamp.Sub(sent), nil + } + case <-time.After(m.config.ProbeTimeout): + // Timeout, return an error below. + } + + m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node) + return 0, NoPingResponseError{ping.Node} +} + +// resetNodes is used when the tick wraps around. It will reap the +// dead nodes and shuffle the node list. +func (m *Memberlist) resetNodes() { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + + // Move dead nodes, but respect gossip to the dead interval + deadIdx := moveDeadNodes(m.nodes, m.config.GossipToTheDeadTime) + + // Deregister the dead nodes + for i := deadIdx; i < len(m.nodes); i++ { + delete(m.nodeMap, m.nodes[i].Name) + m.nodes[i] = nil + } + + // Trim the nodes to exclude the dead nodes + m.nodes = m.nodes[0:deadIdx] + + // Update numNodes after we've trimmed the dead nodes + atomic.StoreUint32(&m.numNodes, uint32(deadIdx)) + + // Shuffle live nodes + shuffleNodes(m.nodes) +} + +// gossip is invoked every GossipInterval period to broadcast our gossip +// messages to a few random nodes. +func (m *Memberlist) gossip() { + defer metrics.MeasureSince([]string{"memberlist", "gossip"}, time.Now()) + + // Get some random live, suspect, or recently dead nodes + m.nodeLock.RLock() + kNodes := kRandomNodes(m.config.GossipNodes, m.nodes, func(n *nodeState) bool { + if n.Name == m.config.Name { + return true + } + + switch n.State { + case stateAlive, stateSuspect: + return false + + case stateDead: + return time.Since(n.StateChange) > m.config.GossipToTheDeadTime + + default: + return true + } + }) + m.nodeLock.RUnlock() + + // Compute the bytes available + bytesAvail := m.config.UDPBufferSize - compoundHeaderOverhead + if m.config.EncryptionEnabled() { + bytesAvail -= encryptOverhead(m.encryptionVersion()) + } + + for _, node := range kNodes { + // Get any pending broadcasts + msgs := m.getBroadcasts(compoundOverhead, bytesAvail) + if len(msgs) == 0 { + return + } + + destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)} + + if len(msgs) == 1 { + // Send single message as is + if err := m.rawSendMsgUDP(destAddr, &node.Node, msgs[0]); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", destAddr, err) + } + } else { + // Otherwise create and send a compound message + compound := makeCompoundMessage(msgs) + if err := m.rawSendMsgUDP(destAddr, &node.Node, compound.Bytes()); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", destAddr, err) + } + } + } +} + +// pushPull is invoked periodically to randomly perform a complete state +// exchange. Used to ensure a high level of convergence, but is also +// reasonably expensive as the entire state of this node is exchanged +// with the other node. +func (m *Memberlist) pushPull() { + // Get a random live node + m.nodeLock.RLock() + nodes := kRandomNodes(1, m.nodes, func(n *nodeState) bool { + return n.Name == m.config.Name || + n.State != stateAlive + }) + m.nodeLock.RUnlock() + + // If no nodes, bail + if len(nodes) == 0 { + return + } + node := nodes[0] + + // Attempt a push pull + if err := m.pushPullNode(node.Addr, node.Port, false); err != nil { + m.logger.Printf("[ERR] memberlist: Push/Pull with %s failed: %s", node.Name, err) + } +} + +// pushPullNode does a complete state exchange with a specific node. +func (m *Memberlist) pushPullNode(addr []byte, port uint16, join bool) error { + defer metrics.MeasureSince([]string{"memberlist", "pushPullNode"}, time.Now()) + + // Attempt to send and receive with the node + remote, userState, err := m.sendAndReceiveState(addr, port, join) + if err != nil { + return err + } + + if err := m.mergeRemoteState(join, remote, userState); err != nil { + return err + } + return nil +} + +// verifyProtocol verifies that all the remote nodes can speak with our +// nodes and vice versa on both the core protocol as well as the +// delegate protocol level. +// +// The verification works by finding the maximum minimum and +// minimum maximum understood protocol and delegate versions. In other words, +// it finds the common denominator of protocol and delegate version ranges +// for the entire cluster. +// +// After this, it goes through the entire cluster (local and remote) and +// verifies that everyone's speaking protocol versions satisfy this range. +// If this passes, it means that every node can understand each other. +func (m *Memberlist) verifyProtocol(remote []pushNodeState) error { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + // Maximum minimum understood and minimum maximum understood for both + // the protocol and delegate versions. We use this to verify everyone + // can be understood. + var maxpmin, minpmax uint8 + var maxdmin, mindmax uint8 + minpmax = math.MaxUint8 + mindmax = math.MaxUint8 + + for _, rn := range remote { + // If the node isn't alive, then skip it + if rn.State != stateAlive { + continue + } + + // Skip nodes that don't have versions set, it just means + // their version is zero. + if len(rn.Vsn) == 0 { + continue + } + + if rn.Vsn[0] > maxpmin { + maxpmin = rn.Vsn[0] + } + + if rn.Vsn[1] < minpmax { + minpmax = rn.Vsn[1] + } + + if rn.Vsn[3] > maxdmin { + maxdmin = rn.Vsn[3] + } + + if rn.Vsn[4] < mindmax { + mindmax = rn.Vsn[4] + } + } + + for _, n := range m.nodes { + // Ignore non-alive nodes + if n.State != stateAlive { + continue + } + + if n.PMin > maxpmin { + maxpmin = n.PMin + } + + if n.PMax < minpmax { + minpmax = n.PMax + } + + if n.DMin > maxdmin { + maxdmin = n.DMin + } + + if n.DMax < mindmax { + mindmax = n.DMax + } + } + + // Now that we definitively know the minimum and maximum understood + // version that satisfies the whole cluster, we verify that every + // node in the cluster satisifies this. + for _, n := range remote { + var nPCur, nDCur uint8 + if len(n.Vsn) > 0 { + nPCur = n.Vsn[2] + nDCur = n.Vsn[5] + } + + if nPCur < maxpmin || nPCur > minpmax { + return fmt.Errorf( + "Node '%s' protocol version (%d) is incompatible: [%d, %d]", + n.Name, nPCur, maxpmin, minpmax) + } + + if nDCur < maxdmin || nDCur > mindmax { + return fmt.Errorf( + "Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]", + n.Name, nDCur, maxdmin, mindmax) + } + } + + for _, n := range m.nodes { + nPCur := n.PCur + nDCur := n.DCur + + if nPCur < maxpmin || nPCur > minpmax { + return fmt.Errorf( + "Node '%s' protocol version (%d) is incompatible: [%d, %d]", + n.Name, nPCur, maxpmin, minpmax) + } + + if nDCur < maxdmin || nDCur > mindmax { + return fmt.Errorf( + "Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]", + n.Name, nDCur, maxdmin, mindmax) + } + } + + return nil +} + +// nextSeqNo returns a usable sequence number in a thread safe way +func (m *Memberlist) nextSeqNo() uint32 { + return atomic.AddUint32(&m.sequenceNum, 1) +} + +// nextIncarnation returns the next incarnation number in a thread safe way +func (m *Memberlist) nextIncarnation() uint32 { + return atomic.AddUint32(&m.incarnation, 1) +} + +// skipIncarnation adds the positive offset to the incarnation number. +func (m *Memberlist) skipIncarnation(offset uint32) uint32 { + return atomic.AddUint32(&m.incarnation, offset) +} + +// estNumNodes is used to get the current estimate of the number of nodes +func (m *Memberlist) estNumNodes() int { + return int(atomic.LoadUint32(&m.numNodes)) +} + +type ackMessage struct { + Complete bool + Payload []byte + Timestamp time.Time +} + +// setProbeChannels is used to attach the ackCh to receive a message when an ack +// with a given sequence number is received. The `complete` field of the message +// will be false on timeout. Any nack messages will cause an empty struct to be +// passed to the nackCh, which can be nil if not needed. +func (m *Memberlist) setProbeChannels(seqNo uint32, ackCh chan ackMessage, nackCh chan struct{}, timeout time.Duration) { + // Create handler functions for acks and nacks + ackFn := func(payload []byte, timestamp time.Time) { + select { + case ackCh <- ackMessage{true, payload, timestamp}: + default: + } + } + nackFn := func() { + select { + case nackCh <- struct{}{}: + default: + } + } + + // Add the handlers + ah := &ackHandler{ackFn, nackFn, nil} + m.ackLock.Lock() + m.ackHandlers[seqNo] = ah + m.ackLock.Unlock() + + // Setup a reaping routing + ah.timer = time.AfterFunc(timeout, func() { + m.ackLock.Lock() + delete(m.ackHandlers, seqNo) + m.ackLock.Unlock() + select { + case ackCh <- ackMessage{false, nil, time.Now()}: + default: + } + }) +} + +// setAckHandler is used to attach a handler to be invoked when an ack with a +// given sequence number is received. If a timeout is reached, the handler is +// deleted. This is used for indirect pings so does not configure a function +// for nacks. +func (m *Memberlist) setAckHandler(seqNo uint32, ackFn func([]byte, time.Time), timeout time.Duration) { + // Add the handler + ah := &ackHandler{ackFn, nil, nil} + m.ackLock.Lock() + m.ackHandlers[seqNo] = ah + m.ackLock.Unlock() + + // Setup a reaping routing + ah.timer = time.AfterFunc(timeout, func() { + m.ackLock.Lock() + delete(m.ackHandlers, seqNo) + m.ackLock.Unlock() + }) +} + +// Invokes an ack handler if any is associated, and reaps the handler immediately +func (m *Memberlist) invokeAckHandler(ack ackResp, timestamp time.Time) { + m.ackLock.Lock() + ah, ok := m.ackHandlers[ack.SeqNo] + delete(m.ackHandlers, ack.SeqNo) + m.ackLock.Unlock() + if !ok { + return + } + ah.timer.Stop() + ah.ackFn(ack.Payload, timestamp) +} + +// Invokes nack handler if any is associated. +func (m *Memberlist) invokeNackHandler(nack nackResp) { + m.ackLock.Lock() + ah, ok := m.ackHandlers[nack.SeqNo] + m.ackLock.Unlock() + if !ok || ah.nackFn == nil { + return + } + ah.nackFn() +} + +// refute gossips an alive message in response to incoming information that we +// are suspect or dead. It will make sure the incarnation number beats the given +// accusedInc value, or you can supply 0 to just get the next incarnation number. +// This alters the node state that's passed in so this MUST be called while the +// nodeLock is held. +func (m *Memberlist) refute(me *nodeState, accusedInc uint32) { + // Make sure the incarnation number beats the accusation. + inc := m.nextIncarnation() + if accusedInc >= inc { + inc = m.skipIncarnation(accusedInc - inc + 1) + } + me.Incarnation = inc + + // Decrease our health because we are being asked to refute a problem. + m.awareness.ApplyDelta(1) + + // Format and broadcast an alive message. + a := alive{ + Incarnation: inc, + Node: me.Name, + Addr: me.Addr, + Port: me.Port, + Meta: me.Meta, + Vsn: []uint8{ + me.PMin, me.PMax, me.PCur, + me.DMin, me.DMax, me.DCur, + }, + } + m.encodeAndBroadcast(me.Addr.String(), aliveMsg, a) +} + +// aliveNode is invoked by the network layer when we get a message about a +// live node. +func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + state, ok := m.nodeMap[a.Node] + + // It is possible that during a Leave(), there is already an aliveMsg + // in-queue to be processed but blocked by the locks above. If we let + // that aliveMsg process, it'll cause us to re-join the cluster. This + // ensures that we don't. + if m.leave && a.Node == m.config.Name { + return + } + + // Invoke the Alive delegate if any. This can be used to filter out + // alive messages based on custom logic. For example, using a cluster name. + // Using a merge delegate is not enough, as it is possible for passive + // cluster merging to still occur. + if m.config.Alive != nil { + node := &Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + PMin: a.Vsn[0], + PMax: a.Vsn[1], + PCur: a.Vsn[2], + DMin: a.Vsn[3], + DMax: a.Vsn[4], + DCur: a.Vsn[5], + } + if err := m.config.Alive.NotifyAlive(node); err != nil { + m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s': %s", + a.Node, err) + return + } + } + + // Check if we've never seen this node before, and if not, then + // store this node in our node map. + if !ok { + state = &nodeState{ + Node: Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + }, + State: stateDead, + } + + // Add to map + m.nodeMap[a.Node] = state + + // Get a random offset. This is important to ensure + // the failure detection bound is low on average. If all + // nodes did an append, failure detection bound would be + // very high. + n := len(m.nodes) + offset := randomOffset(n) + + // Add at the end and swap with the node at the offset + m.nodes = append(m.nodes, state) + m.nodes[offset], m.nodes[n] = m.nodes[n], m.nodes[offset] + + // Update numNodes after we've added a new node + atomic.AddUint32(&m.numNodes, 1) + } + + // Check if this address is different than the existing node + if !bytes.Equal([]byte(state.Addr), a.Addr) || state.Port != a.Port { + m.logger.Printf("[ERR] memberlist: Conflicting address for %s. Mine: %v:%d Theirs: %v:%d", + state.Name, state.Addr, state.Port, net.IP(a.Addr), a.Port) + + // Inform the conflict delegate if provided + if m.config.Conflict != nil { + other := Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + } + m.config.Conflict.NotifyConflict(&state.Node, &other) + } + return + } + + // Bail if the incarnation number is older, and this is not about us + isLocalNode := state.Name == m.config.Name + if a.Incarnation <= state.Incarnation && !isLocalNode { + return + } + + // Bail if strictly less and this is about us + if a.Incarnation < state.Incarnation && isLocalNode { + return + } + + // Clear out any suspicion timer that may be in effect. + delete(m.nodeTimers, a.Node) + + // Store the old state and meta data + oldState := state.State + oldMeta := state.Meta + + // If this is us we need to refute, otherwise re-broadcast + if !bootstrap && isLocalNode { + // Compute the version vector + versions := []uint8{ + state.PMin, state.PMax, state.PCur, + state.DMin, state.DMax, state.DCur, + } + + // If the Incarnation is the same, we need special handling, since it + // possible for the following situation to happen: + // 1) Start with configuration C, join cluster + // 2) Hard fail / Kill / Shutdown + // 3) Restart with configuration C', join cluster + // + // In this case, other nodes and the local node see the same incarnation, + // but the values may not be the same. For this reason, we always + // need to do an equality check for this Incarnation. In most cases, + // we just ignore, but we may need to refute. + // + if a.Incarnation == state.Incarnation && + bytes.Equal(a.Meta, state.Meta) && + bytes.Equal(a.Vsn, versions) { + return + } + + m.refute(state, a.Incarnation) + m.logger.Printf("[WARN] memberlist: Refuting an alive message") + } else { + m.encodeBroadcastNotify(a.Node, aliveMsg, a, notify) + + // Update protocol versions if it arrived + if len(a.Vsn) > 0 { + state.PMin = a.Vsn[0] + state.PMax = a.Vsn[1] + state.PCur = a.Vsn[2] + state.DMin = a.Vsn[3] + state.DMax = a.Vsn[4] + state.DCur = a.Vsn[5] + } + + // Update the state and incarnation number + state.Incarnation = a.Incarnation + state.Meta = a.Meta + if state.State != stateAlive { + state.State = stateAlive + state.StateChange = time.Now() + } + } + + // Update metrics + metrics.IncrCounter([]string{"memberlist", "msg", "alive"}, 1) + + // Notify the delegate of any relevant updates + if m.config.Events != nil { + if oldState == stateDead { + // if Dead -> Alive, notify of join + m.config.Events.NotifyJoin(&state.Node) + + } else if !bytes.Equal(oldMeta, state.Meta) { + // if Meta changed, trigger an update notification + m.config.Events.NotifyUpdate(&state.Node) + } + } +} + +// suspectNode is invoked by the network layer when we get a message +// about a suspect node +func (m *Memberlist) suspectNode(s *suspect) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + state, ok := m.nodeMap[s.Node] + + // If we've never heard about this node before, ignore it + if !ok { + return + } + + // Ignore old incarnation numbers + if s.Incarnation < state.Incarnation { + return + } + + // See if there's a suspicion timer we can confirm. If the info is new + // to us we will go ahead and re-gossip it. This allows for multiple + // independent confirmations to flow even when a node probes a node + // that's already suspect. + if timer, ok := m.nodeTimers[s.Node]; ok { + if timer.Confirm(s.From) { + m.encodeAndBroadcast(s.Node, suspectMsg, s) + } + return + } + + // Ignore non-alive nodes + if state.State != stateAlive { + return + } + + // If this is us we need to refute, otherwise re-broadcast + if state.Name == m.config.Name { + m.refute(state, s.Incarnation) + m.logger.Printf("[WARN] memberlist: Refuting a suspect message (from: %s)", s.From) + return // Do not mark ourself suspect + } else { + m.encodeAndBroadcast(s.Node, suspectMsg, s) + } + + // Update metrics + metrics.IncrCounter([]string{"memberlist", "msg", "suspect"}, 1) + + // Update the state + state.Incarnation = s.Incarnation + state.State = stateSuspect + changeTime := time.Now() + state.StateChange = changeTime + + // Setup a suspicion timer. Given that we don't have any known phase + // relationship with our peers, we set up k such that we hit the nominal + // timeout two probe intervals short of what we expect given the suspicion + // multiplier. + k := m.config.SuspicionMult - 2 + + // If there aren't enough nodes to give the expected confirmations, just + // set k to 0 to say that we don't expect any. Note we subtract 2 from n + // here to take out ourselves and the node being probed. + n := m.estNumNodes() + if n-2 < k { + k = 0 + } + + // Compute the timeouts based on the size of the cluster. + min := suspicionTimeout(m.config.SuspicionMult, n, m.config.ProbeInterval) + max := time.Duration(m.config.SuspicionMaxTimeoutMult) * min + fn := func(numConfirmations int) { + m.nodeLock.Lock() + state, ok := m.nodeMap[s.Node] + timeout := ok && state.State == stateSuspect && state.StateChange == changeTime + m.nodeLock.Unlock() + + if timeout { + if k > 0 && numConfirmations < k { + metrics.IncrCounter([]string{"memberlist", "degraded", "timeout"}, 1) + } + + m.logger.Printf("[INFO] memberlist: Marking %s as failed, suspect timeout reached (%d peer confirmations)", + state.Name, numConfirmations) + d := dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} + m.deadNode(&d) + } + } + m.nodeTimers[s.Node] = newSuspicion(s.From, k, min, max, fn) +} + +// deadNode is invoked by the network layer when we get a message +// about a dead node +func (m *Memberlist) deadNode(d *dead) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + state, ok := m.nodeMap[d.Node] + + // If we've never heard about this node before, ignore it + if !ok { + return + } + + // Ignore old incarnation numbers + if d.Incarnation < state.Incarnation { + return + } + + // Clear out any suspicion timer that may be in effect. + delete(m.nodeTimers, d.Node) + + // Ignore if node is already dead + if state.State == stateDead { + return + } + + // Check if this is us + if state.Name == m.config.Name { + // If we are not leaving we need to refute + if !m.leave { + m.refute(state, d.Incarnation) + m.logger.Printf("[WARN] memberlist: Refuting a dead message (from: %s)", d.From) + return // Do not mark ourself dead + } + + // If we are leaving, we broadcast and wait + m.encodeBroadcastNotify(d.Node, deadMsg, d, m.leaveBroadcast) + } else { + m.encodeAndBroadcast(d.Node, deadMsg, d) + } + + // Update metrics + metrics.IncrCounter([]string{"memberlist", "msg", "dead"}, 1) + + // Update the state + state.Incarnation = d.Incarnation + state.State = stateDead + state.StateChange = time.Now() + + // Notify of death + if m.config.Events != nil { + m.config.Events.NotifyLeave(&state.Node) + } +} + +// mergeState is invoked by the network layer when we get a Push/Pull +// state transfer +func (m *Memberlist) mergeState(remote []pushNodeState) { + for _, r := range remote { + switch r.State { + case stateAlive: + a := alive{ + Incarnation: r.Incarnation, + Node: r.Name, + Addr: r.Addr, + Port: r.Port, + Meta: r.Meta, + Vsn: r.Vsn, + } + m.aliveNode(&a, nil, false) + + case stateDead: + // If the remote node believes a node is dead, we prefer to + // suspect that node instead of declaring it dead instantly + fallthrough + case stateSuspect: + s := suspect{Incarnation: r.Incarnation, Node: r.Name, From: m.config.Name} + m.suspectNode(&s) + } + } +} diff --git a/vendor/github.com/hashicorp/memberlist/suspicion.go b/vendor/github.com/hashicorp/memberlist/suspicion.go new file mode 100644 index 000000000..5f573e1fc --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/suspicion.go @@ -0,0 +1,130 @@ +package memberlist + +import ( + "math" + "sync/atomic" + "time" +) + +// suspicion manages the suspect timer for a node and provides an interface +// to accelerate the timeout as we get more independent confirmations that +// a node is suspect. +type suspicion struct { + // n is the number of independent confirmations we've seen. This must + // be updated using atomic instructions to prevent contention with the + // timer callback. + n int32 + + // k is the number of independent confirmations we'd like to see in + // order to drive the timer to its minimum value. + k int32 + + // min is the minimum timer value. + min time.Duration + + // max is the maximum timer value. + max time.Duration + + // start captures the timestamp when we began the timer. This is used + // so we can calculate durations to feed the timer during updates in + // a way the achieves the overall time we'd like. + start time.Time + + // timer is the underlying timer that implements the timeout. + timer *time.Timer + + // f is the function to call when the timer expires. We hold on to this + // because there are cases where we call it directly. + timeoutFn func() + + // confirmations is a map of "from" nodes that have confirmed a given + // node is suspect. This prevents double counting. + confirmations map[string]struct{} +} + +// newSuspicion returns a timer started with the max time, and that will drive +// to the min time after seeing k or more confirmations. The from node will be +// excluded from confirmations since we might get our own suspicion message +// gossiped back to us. The minimum time will be used if no confirmations are +// called for (k <= 0). +func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion { + s := &suspicion{ + k: int32(k), + min: min, + max: max, + confirmations: make(map[string]struct{}), + } + + // Exclude the from node from any confirmations. + s.confirmations[from] = struct{}{} + + // Pass the number of confirmations into the timeout function for + // easy telemetry. + s.timeoutFn = func() { + fn(int(atomic.LoadInt32(&s.n))) + } + + // If there aren't any confirmations to be made then take the min + // time from the start. + timeout := max + if k < 1 { + timeout = min + } + s.timer = time.AfterFunc(timeout, s.timeoutFn) + + // Capture the start time right after starting the timer above so + // we should always err on the side of a little longer timeout if + // there's any preemption that separates this and the step above. + s.start = time.Now() + return s +} + +// remainingSuspicionTime takes the state variables of the suspicion timer and +// calculates the remaining time to wait before considering a node dead. The +// return value can be negative, so be prepared to fire the timer immediately in +// that case. +func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration { + frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0) + raw := max.Seconds() - frac*(max.Seconds()-min.Seconds()) + timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond + if timeout < min { + timeout = min + } + + // We have to take into account the amount of time that has passed so + // far, so we get the right overall timeout. + return timeout - elapsed +} + +// Confirm registers that a possibly new peer has also determined the given +// node is suspect. This returns true if this was new information, and false +// if it was a duplicate confirmation, or if we've got enough confirmations to +// hit the minimum. +func (s *suspicion) Confirm(from string) bool { + // If we've got enough confirmations then stop accepting them. + if atomic.LoadInt32(&s.n) >= s.k { + return false + } + + // Only allow one confirmation from each possible peer. + if _, ok := s.confirmations[from]; ok { + return false + } + s.confirmations[from] = struct{}{} + + // Compute the new timeout given the current number of confirmations and + // adjust the timer. If the timeout becomes negative *and* we can cleanly + // stop the timer then we will call the timeout function directly from + // here. + n := atomic.AddInt32(&s.n, 1) + elapsed := time.Now().Sub(s.start) + remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max) + if s.timer.Stop() { + if remaining > 0 { + s.timer.Reset(remaining) + } else { + go s.timeoutFn() + } + } + return true +} diff --git a/vendor/github.com/hashicorp/memberlist/todo.md b/vendor/github.com/hashicorp/memberlist/todo.md new file mode 100644 index 000000000..009c1d647 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/todo.md @@ -0,0 +1,6 @@ +# TODO +* Dynamic RTT discovery + * Compute 99th percentile for ping/ack + * Better lower bound for ping/ack, faster failure detection +* Dynamic MTU discovery + * Prevent lost updates, increases efficiency diff --git a/vendor/github.com/hashicorp/memberlist/util.go b/vendor/github.com/hashicorp/memberlist/util.go new file mode 100644 index 000000000..2ee58ba10 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/util.go @@ -0,0 +1,288 @@ +package memberlist + +import ( + "bytes" + "compress/lzw" + "encoding/binary" + "fmt" + "io" + "math" + "math/rand" + "strings" + "time" + + "github.com/hashicorp/go-msgpack/codec" + "github.com/sean-/seed" +) + +// pushPullScale is the minimum number of nodes +// before we start scaling the push/pull timing. The scale +// effect is the log2(Nodes) - log2(pushPullScale). This means +// that the 33rd node will cause us to double the interval, +// while the 65th will triple it. +const pushPullScaleThreshold = 32 + +const ( + // Constant litWidth 2-8 + lzwLitWidth = 8 +) + +func init() { + seed.Init() +} + +// Decode reverses the encode operation on a byte slice input +func decode(buf []byte, out interface{}) error { + r := bytes.NewReader(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer +func encode(msgType messageType, in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + buf.WriteByte(uint8(msgType)) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// Returns a random offset between 0 and n +func randomOffset(n int) int { + if n == 0 { + return 0 + } + return int(rand.Uint32() % uint32(n)) +} + +// suspicionTimeout computes the timeout that should be used when +// a node is suspected +func suspicionTimeout(suspicionMult, n int, interval time.Duration) time.Duration { + nodeScale := math.Max(1.0, math.Log10(math.Max(1.0, float64(n)))) + // multiply by 1000 to keep some precision because time.Duration is an int64 type + timeout := time.Duration(suspicionMult) * time.Duration(nodeScale*1000) * interval / 1000 + return timeout +} + +// retransmitLimit computes the limit of retransmissions +func retransmitLimit(retransmitMult, n int) int { + nodeScale := math.Ceil(math.Log10(float64(n + 1))) + limit := retransmitMult * int(nodeScale) + return limit +} + +// shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle +func shuffleNodes(nodes []*nodeState) { + n := len(nodes) + for i := n - 1; i > 0; i-- { + j := rand.Intn(i + 1) + nodes[i], nodes[j] = nodes[j], nodes[i] + } +} + +// pushPushScale is used to scale the time interval at which push/pull +// syncs take place. It is used to prevent network saturation as the +// cluster size grows +func pushPullScale(interval time.Duration, n int) time.Duration { + // Don't scale until we cross the threshold + if n <= pushPullScaleThreshold { + return interval + } + + multiplier := math.Ceil(math.Log2(float64(n))-math.Log2(pushPullScaleThreshold)) + 1.0 + return time.Duration(multiplier) * interval +} + +// moveDeadNodes moves nodes that are dead and beyond the gossip to the dead interval +// to the end of the slice and returns the index of the first moved node. +func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int { + numDead := 0 + n := len(nodes) + for i := 0; i < n-numDead; i++ { + if nodes[i].State != stateDead { + continue + } + + // Respect the gossip to the dead interval + if time.Since(nodes[i].StateChange) <= gossipToTheDeadTime { + continue + } + + // Move this node to the end + nodes[i], nodes[n-numDead-1] = nodes[n-numDead-1], nodes[i] + numDead++ + i-- + } + return n - numDead +} + +// kRandomNodes is used to select up to k random nodes, excluding any nodes where +// the filter function returns true. It is possible that less than k nodes are +// returned. +func kRandomNodes(k int, nodes []*nodeState, filterFn func(*nodeState) bool) []*nodeState { + n := len(nodes) + kNodes := make([]*nodeState, 0, k) +OUTER: + // Probe up to 3*n times, with large n this is not necessary + // since k << n, but with small n we want search to be + // exhaustive + for i := 0; i < 3*n && len(kNodes) < k; i++ { + // Get random node + idx := randomOffset(n) + node := nodes[idx] + + // Give the filter a shot at it. + if filterFn != nil && filterFn(node) { + continue OUTER + } + + // Check if we have this node already + for j := 0; j < len(kNodes); j++ { + if node == kNodes[j] { + continue OUTER + } + } + + // Append the node + kNodes = append(kNodes, node) + } + return kNodes +} + +// makeCompoundMessage takes a list of messages and generates +// a single compound message containing all of them +func makeCompoundMessage(msgs [][]byte) *bytes.Buffer { + // Create a local buffer + buf := bytes.NewBuffer(nil) + + // Write out the type + buf.WriteByte(uint8(compoundMsg)) + + // Write out the number of message + buf.WriteByte(uint8(len(msgs))) + + // Add the message lengths + for _, m := range msgs { + binary.Write(buf, binary.BigEndian, uint16(len(m))) + } + + // Append the messages + for _, m := range msgs { + buf.Write(m) + } + + return buf +} + +// decodeCompoundMessage splits a compound message and returns +// the slices of individual messages. Also returns the number +// of truncated messages and any potential error +func decodeCompoundMessage(buf []byte) (trunc int, parts [][]byte, err error) { + if len(buf) < 1 { + err = fmt.Errorf("missing compound length byte") + return + } + numParts := uint8(buf[0]) + buf = buf[1:] + + // Check we have enough bytes + if len(buf) < int(numParts*2) { + err = fmt.Errorf("truncated len slice") + return + } + + // Decode the lengths + lengths := make([]uint16, numParts) + for i := 0; i < int(numParts); i++ { + lengths[i] = binary.BigEndian.Uint16(buf[i*2 : i*2+2]) + } + buf = buf[numParts*2:] + + // Split each message + for idx, msgLen := range lengths { + if len(buf) < int(msgLen) { + trunc = int(numParts) - idx + return + } + + // Extract the slice, seek past on the buffer + slice := buf[:msgLen] + buf = buf[msgLen:] + parts = append(parts, slice) + } + return +} + +// Given a string of the form "host", "host:port", +// "ipv6::addr" or "[ipv6::address]:port", +// return true if the string includes a port. +func hasPort(s string) bool { + last := strings.LastIndex(s, ":") + if last == -1 { + return false + } + if s[0] == '[' { + return s[last-1] == ']' + } + return strings.Index(s, ":") == last +} + +// compressPayload takes an opaque input buffer, compresses it +// and wraps it in a compress{} message that is encoded. +func compressPayload(inp []byte) (*bytes.Buffer, error) { + var buf bytes.Buffer + compressor := lzw.NewWriter(&buf, lzw.LSB, lzwLitWidth) + + _, err := compressor.Write(inp) + if err != nil { + return nil, err + } + + // Ensure we flush everything out + if err := compressor.Close(); err != nil { + return nil, err + } + + // Create a compressed message + c := compress{ + Algo: lzwAlgo, + Buf: buf.Bytes(), + } + return encode(compressMsg, &c) +} + +// decompressPayload is used to unpack an encoded compress{} +// message and return its payload uncompressed +func decompressPayload(msg []byte) ([]byte, error) { + // Decode the message + var c compress + if err := decode(msg, &c); err != nil { + return nil, err + } + return decompressBuffer(&c) +} + +// decompressBuffer is used to decompress the buffer of +// a single compress message, handling multiple algorithms +func decompressBuffer(c *compress) ([]byte, error) { + // Verify the algorithm + if c.Algo != lzwAlgo { + return nil, fmt.Errorf("Cannot decompress unknown algorithm %d", c.Algo) + } + + // Create a uncompressor + uncomp := lzw.NewReader(bytes.NewReader(c.Buf), lzw.LSB, lzwLitWidth) + defer uncomp.Close() + + // Read all the data + var b bytes.Buffer + _, err := io.Copy(&b, uncomp) + if err != nil { + return nil, err + } + + // Return the uncompressed bytes + return b.Bytes(), nil +} diff --git a/vendor/github.com/hashicorp/raft/LICENSE b/vendor/github.com/hashicorp/raft/LICENSE new file mode 100644 index 000000000..c33dcc7c9 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/vendor/github.com/hashicorp/raft/Makefile b/vendor/github.com/hashicorp/raft/Makefile new file mode 100644 index 000000000..49f829923 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/Makefile @@ -0,0 +1,17 @@ +DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) + +test: + go test -timeout=60s ./... + +integ: test + INTEG_TESTS=yes go test -timeout=5s -run=Integ ./... + +deps: + go get -d -v ./... + echo $(DEPS) | xargs -n1 go get -d + +cov: + INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PHONY: test cov integ deps diff --git a/vendor/github.com/hashicorp/raft/README.md b/vendor/github.com/hashicorp/raft/README.md new file mode 100644 index 000000000..8778b13dc --- /dev/null +++ b/vendor/github.com/hashicorp/raft/README.md @@ -0,0 +1,89 @@ +raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft) +==== + +raft is a [Go](http://www.golang.org) library that manages a replicated +log and can be used with an FSM to manage replicated state machines. It +is library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)). + +The use cases for such a library are far-reaching as replicated state +machines are a key component of many distributed systems. They enable +building Consistent, Partition Tolerant (CP) systems, with limited +fault tolerance as well. + +## Building + +If you wish to build raft you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Documentation + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft). + +To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository, +called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation +for the `LogStore` and `StableStore`. + +A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called +[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore` +and `StableStore`. + +## Protocol + +raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) + +A high level overview of the Raft protocol is described below, but for details please read the full +[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) +followed by the raft source. Any questions about the raft protocol should be sent to the +[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev). + +### Protocol Description + +Raft nodes are always in one of three states: follower, candidate or leader. All +nodes initially start out as a follower. In this state, nodes can accept log entries +from a leader and cast votes. If no entries are received for some time, nodes +self-promote to the candidate state. In the candidate state nodes request votes from +their peers. If a candidate receives a quorum of votes, then it is promoted to a leader. +The leader must accept new log entries and replicate to all the other followers. +In addition, if stale reads are not acceptable, all queries must also be performed on +the leader. + +Once a cluster has a leader, it is able to accept new log entries. A client can +request that a leader append a new log entry, which is an opaque binary blob to +Raft. The leader then writes the entry to durable storage and attempts to replicate +to a quorum of followers. Once the log entry is considered *committed*, it can be +*applied* to a finite state machine. The finite state machine is application specific, +and is implemented using an interface. + +An obvious question relates to the unbounded nature of a replicated log. Raft provides +a mechanism by which the current state is snapshotted, and the log is compacted. Because +of the FSM abstraction, restoring the state of the FSM must result in the same state +as a replay of old logs. This allows Raft to capture the FSM state at a point in time, +and then remove all the logs that were used to reach that state. This is performed automatically +without user intervention, and prevents unbounded disk usage as well as minimizing +time spent replaying logs. + +Lastly, there is the issue of updating the peer set when new servers are joining +or existing servers are leaving. As long as a quorum of nodes is available, this +is not an issue as Raft provides mechanisms to dynamically update the peer set. +If a quorum of nodes is unavailable, then this becomes a very challenging issue. +For example, suppose there are only 2 peers, A and B. The quorum size is also +2, meaning both nodes must agree to commit a log entry. If either A or B fails, +it is now impossible to reach quorum. This means the cluster is unable to add, +or remove a node, or commit any additional log entries. This results in *unavailability*. +At this point, manual intervention would be required to remove either A or B, +and to restart the remaining node in bootstrap mode. + +A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster +of 5 can tolerate 2 node failures. The recommended configuration is to either +run 3 or 5 raft servers. This maximizes availability without +greatly sacrificing performance. + +In terms of performance, Raft is comparable to Paxos. Assuming stable leadership, +committing a log entry requires a single round trip to half of the cluster. +Thus performance is bound by disk I/O and network latency. + diff --git a/vendor/github.com/hashicorp/raft/api.go b/vendor/github.com/hashicorp/raft/api.go new file mode 100644 index 000000000..2fd78e784 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/api.go @@ -0,0 +1,1007 @@ +package raft + +import ( + "errors" + "fmt" + "io" + "log" + "os" + "strconv" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +var ( + // ErrLeader is returned when an operation can't be completed on a + // leader node. + ErrLeader = errors.New("node is the leader") + + // ErrNotLeader is returned when an operation can't be completed on a + // follower or candidate node. + ErrNotLeader = errors.New("node is not the leader") + + // ErrLeadershipLost is returned when a leader fails to commit a log entry + // because it's been deposed in the process. + ErrLeadershipLost = errors.New("leadership lost while committing log") + + // ErrAbortedByRestore is returned when a leader fails to commit a log + // entry because it's been superseded by a user snapshot restore. + ErrAbortedByRestore = errors.New("snapshot restored while committing log") + + // ErrRaftShutdown is returned when operations are requested against an + // inactive Raft. + ErrRaftShutdown = errors.New("raft is already shutdown") + + // ErrEnqueueTimeout is returned when a command fails due to a timeout. + ErrEnqueueTimeout = errors.New("timed out enqueuing operation") + + // ErrNothingNewToSnapshot is returned when trying to create a snapshot + // but there's nothing new commited to the FSM since we started. + ErrNothingNewToSnapshot = errors.New("nothing new to snapshot") + + // ErrUnsupportedProtocol is returned when an operation is attempted + // that's not supported by the current protocol version. + ErrUnsupportedProtocol = errors.New("operation not supported with current protocol version") + + // ErrCantBootstrap is returned when attempt is made to bootstrap a + // cluster that already has state present. + ErrCantBootstrap = errors.New("bootstrap only works on new clusters") +) + +// Raft implements a Raft node. +type Raft struct { + raftState + + // protocolVersion is used to inter-operate with Raft servers running + // different versions of the library. See comments in config.go for more + // details. + protocolVersion ProtocolVersion + + // applyCh is used to async send logs to the main thread to + // be committed and applied to the FSM. + applyCh chan *logFuture + + // Configuration provided at Raft initialization + conf Config + + // FSM is the client state machine to apply commands to + fsm FSM + + // fsmMutateCh is used to send state-changing updates to the FSM. This + // receives pointers to commitTuple structures when applying logs or + // pointers to restoreFuture structures when restoring a snapshot. We + // need control over the order of these operations when doing user + // restores so that we finish applying any old log applies before we + // take a user snapshot on the leader, otherwise we might restore the + // snapshot and apply old logs to it that were in the pipe. + fsmMutateCh chan interface{} + + // fsmSnapshotCh is used to trigger a new snapshot being taken + fsmSnapshotCh chan *reqSnapshotFuture + + // lastContact is the last time we had contact from the + // leader node. This can be used to gauge staleness. + lastContact time.Time + lastContactLock sync.RWMutex + + // Leader is the current cluster leader + leader ServerAddress + leaderLock sync.RWMutex + + // leaderCh is used to notify of leadership changes + leaderCh chan bool + + // leaderState used only while state is leader + leaderState leaderState + + // Stores our local server ID, used to avoid sending RPCs to ourself + localID ServerID + + // Stores our local addr + localAddr ServerAddress + + // Used for our logging + logger *log.Logger + + // LogStore provides durable storage for logs + logs LogStore + + // Used to request the leader to make configuration changes. + configurationChangeCh chan *configurationChangeFuture + + // Tracks the latest configuration and latest committed configuration from + // the log/snapshot. + configurations configurations + + // RPC chan comes from the transport layer + rpcCh <-chan RPC + + // Shutdown channel to exit, protected to prevent concurrent exits + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + // snapshots is used to store and retrieve snapshots + snapshots SnapshotStore + + // userSnapshotCh is used for user-triggered snapshots + userSnapshotCh chan *userSnapshotFuture + + // userRestoreCh is used for user-triggered restores of external + // snapshots + userRestoreCh chan *userRestoreFuture + + // stable is a StableStore implementation for durable state + // It provides stable storage for many fields in raftState + stable StableStore + + // The transport layer we use + trans Transport + + // verifyCh is used to async send verify futures to the main thread + // to verify we are still the leader + verifyCh chan *verifyFuture + + // configurationsCh is used to get the configuration data safely from + // outside of the main thread. + configurationsCh chan *configurationsFuture + + // bootstrapCh is used to attempt an initial bootstrap from outside of + // the main thread. + bootstrapCh chan *bootstrapFuture + + // List of observers and the mutex that protects them. The observers list + // is indexed by an artificial ID which is used for deregistration. + observersLock sync.RWMutex + observers map[uint64]*Observer +} + +// BootstrapCluster initializes a server's storage with the given cluster +// configuration. This should only be called at the beginning of time for the +// cluster, and you absolutely must make sure that you call it with the same +// configuration on all the Voter servers. There is no need to bootstrap +// Nonvoter and Staging servers. +// +// One sane approach is to boostrap a single server with a configuration +// listing just itself as a Voter, then invoke AddVoter() on it to add other +// servers to the cluster. +func BootstrapCluster(conf *Config, logs LogStore, stable StableStore, + snaps SnapshotStore, trans Transport, configuration Configuration) error { + // Validate the Raft server config. + if err := ValidateConfig(conf); err != nil { + return err + } + + // Sanity check the Raft peer configuration. + if err := checkConfiguration(configuration); err != nil { + return err + } + + // Make sure the cluster is in a clean state. + hasState, err := HasExistingState(logs, stable, snaps) + if err != nil { + return fmt.Errorf("failed to check for existing state: %v", err) + } + if hasState { + return ErrCantBootstrap + } + + // Set current term to 1. + if err := stable.SetUint64(keyCurrentTerm, 1); err != nil { + return fmt.Errorf("failed to save current term: %v", err) + } + + // Append configuration entry to log. + entry := &Log{ + Index: 1, + Term: 1, + } + if conf.ProtocolVersion < 3 { + entry.Type = LogRemovePeerDeprecated + entry.Data = encodePeers(configuration, trans) + } else { + entry.Type = LogConfiguration + entry.Data = encodeConfiguration(configuration) + } + if err := logs.StoreLog(entry); err != nil { + return fmt.Errorf("failed to append configuration entry to log: %v", err) + } + + return nil +} + +// RecoverCluster is used to manually force a new configuration in order to +// recover from a loss of quorum where the current configuration cannot be +// restored, such as when several servers die at the same time. This works by +// reading all the current state for this server, creating a snapshot with the +// supplied configuration, and then truncating the Raft log. This is the only +// safe way to force a given configuration without actually altering the log to +// insert any new entries, which could cause conflicts with other servers with +// different state. +// +// WARNING! This operation implicitly commits all entries in the Raft log, so +// in general this is an extremely unsafe operation. If you've lost your other +// servers and are performing a manual recovery, then you've also lost the +// commit information, so this is likely the best you can do, but you should be +// aware that calling this can cause Raft log entries that were in the process +// of being replicated but not yet be committed to be committed. +// +// Note the FSM passed here is used for the snapshot operations and will be +// left in a state that should not be used by the application. Be sure to +// discard this FSM and any associated state and provide a fresh one when +// calling NewRaft later. +// +// A typical way to recover the cluster is to shut down all servers and then +// run RecoverCluster on every server using an identical configuration. When +// the cluster is then restarted, and election should occur and then Raft will +// resume normal operation. If it's desired to make a particular server the +// leader, this can be used to inject a new configuration with that server as +// the sole voter, and then join up other new clean-state peer servers using +// the usual APIs in order to bring the cluster back into a known state. +func RecoverCluster(conf *Config, fsm FSM, logs LogStore, stable StableStore, + snaps SnapshotStore, trans Transport, configuration Configuration) error { + // Validate the Raft server config. + if err := ValidateConfig(conf); err != nil { + return err + } + + // Sanity check the Raft peer configuration. + if err := checkConfiguration(configuration); err != nil { + return err + } + + // Refuse to recover if there's no existing state. This would be safe to + // do, but it is likely an indication of an operator error where they + // expect data to be there and it's not. By refusing, we force them + // to show intent to start a cluster fresh by explicitly doing a + // bootstrap, rather than quietly fire up a fresh cluster here. + hasState, err := HasExistingState(logs, stable, snaps) + if err != nil { + return fmt.Errorf("failed to check for existing state: %v", err) + } + if !hasState { + return fmt.Errorf("refused to recover cluster with no initial state, this is probably an operator error") + } + + // Attempt to restore any snapshots we find, newest to oldest. + var snapshotIndex uint64 + var snapshotTerm uint64 + snapshots, err := snaps.List() + if err != nil { + return fmt.Errorf("failed to list snapshots: %v", err) + } + for _, snapshot := range snapshots { + _, source, err := snaps.Open(snapshot.ID) + if err != nil { + // Skip this one and try the next. We will detect if we + // couldn't open any snapshots. + continue + } + defer source.Close() + + if err := fsm.Restore(source); err != nil { + // Same here, skip and try the next one. + continue + } + + snapshotIndex = snapshot.Index + snapshotTerm = snapshot.Term + break + } + if len(snapshots) > 0 && (snapshotIndex == 0 || snapshotTerm == 0) { + return fmt.Errorf("failed to restore any of the available snapshots") + } + + // The snapshot information is the best known end point for the data + // until we play back the Raft log entries. + lastIndex := snapshotIndex + lastTerm := snapshotTerm + + // Apply any Raft log entries past the snapshot. + lastLogIndex, err := logs.LastIndex() + if err != nil { + return fmt.Errorf("failed to find last log: %v", err) + } + for index := snapshotIndex + 1; index <= lastLogIndex; index++ { + var entry Log + if err := logs.GetLog(index, &entry); err != nil { + return fmt.Errorf("failed to get log at index %d: %v", index, err) + } + if entry.Type == LogCommand { + _ = fsm.Apply(&entry) + } + lastIndex = entry.Index + lastTerm = entry.Term + } + + // Create a new snapshot, placing the configuration in as if it was + // committed at index 1. + snapshot, err := fsm.Snapshot() + if err != nil { + return fmt.Errorf("failed to snapshot FSM: %v", err) + } + version := getSnapshotVersion(conf.ProtocolVersion) + sink, err := snaps.Create(version, lastIndex, lastTerm, configuration, 1, trans) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + if err := snapshot.Persist(sink); err != nil { + return fmt.Errorf("failed to persist snapshot: %v", err) + } + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to finalize snapshot: %v", err) + } + + // Compact the log so that we don't get bad interference from any + // configuration change log entries that might be there. + firstLogIndex, err := logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + if err := logs.DeleteRange(firstLogIndex, lastLogIndex); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + + return nil +} + +// HasExistingState returns true if the server has any existing state (logs, +// knowledge of a current term, or any snapshots). +func HasExistingState(logs LogStore, stable StableStore, snaps SnapshotStore) (bool, error) { + // Make sure we don't have a current term. + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err == nil { + if currentTerm > 0 { + return true, nil + } + } else { + if err.Error() != "not found" { + return false, fmt.Errorf("failed to read current term: %v", err) + } + } + + // Make sure we have an empty log. + lastIndex, err := logs.LastIndex() + if err != nil { + return false, fmt.Errorf("failed to get last log index: %v", err) + } + if lastIndex > 0 { + return true, nil + } + + // Make sure we have no snapshots + snapshots, err := snaps.List() + if err != nil { + return false, fmt.Errorf("failed to list snapshots: %v", err) + } + if len(snapshots) > 0 { + return true, nil + } + + return false, nil +} + +// NewRaft is used to construct a new Raft node. It takes a configuration, as well +// as implementations of various interfaces that are required. If we have any +// old state, such as snapshots, logs, peers, etc, all those will be restored +// when creating the Raft node. +func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore, trans Transport) (*Raft, error) { + // Validate the configuration. + if err := ValidateConfig(conf); err != nil { + return nil, err + } + + // Ensure we have a LogOutput. + var logger *log.Logger + if conf.Logger != nil { + logger = conf.Logger + } else { + if conf.LogOutput == nil { + conf.LogOutput = os.Stderr + } + logger = log.New(conf.LogOutput, "", log.LstdFlags) + } + + // Try to restore the current term. + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err != nil && err.Error() != "not found" { + return nil, fmt.Errorf("failed to load current term: %v", err) + } + + // Read the index of the last log entry. + lastIndex, err := logs.LastIndex() + if err != nil { + return nil, fmt.Errorf("failed to find last log: %v", err) + } + + // Get the last log entry. + var lastLog Log + if lastIndex > 0 { + if err = logs.GetLog(lastIndex, &lastLog); err != nil { + return nil, fmt.Errorf("failed to get last log at index %d: %v", lastIndex, err) + } + } + + // Make sure we have a valid server address and ID. + protocolVersion := conf.ProtocolVersion + localAddr := ServerAddress(trans.LocalAddr()) + localID := conf.LocalID + + // TODO (slackpad) - When we deprecate protocol version 2, remove this + // along with the AddPeer() and RemovePeer() APIs. + if protocolVersion < 3 && string(localID) != string(localAddr) { + return nil, fmt.Errorf("when running with ProtocolVersion < 3, LocalID must be set to the network address") + } + + // Create Raft struct. + r := &Raft{ + protocolVersion: protocolVersion, + applyCh: make(chan *logFuture), + conf: *conf, + fsm: fsm, + fsmMutateCh: make(chan interface{}, 128), + fsmSnapshotCh: make(chan *reqSnapshotFuture), + leaderCh: make(chan bool), + localID: localID, + localAddr: localAddr, + logger: logger, + logs: logs, + configurationChangeCh: make(chan *configurationChangeFuture), + configurations: configurations{}, + rpcCh: trans.Consumer(), + snapshots: snaps, + userSnapshotCh: make(chan *userSnapshotFuture), + userRestoreCh: make(chan *userRestoreFuture), + shutdownCh: make(chan struct{}), + stable: stable, + trans: trans, + verifyCh: make(chan *verifyFuture, 64), + configurationsCh: make(chan *configurationsFuture, 8), + bootstrapCh: make(chan *bootstrapFuture), + observers: make(map[uint64]*Observer), + } + + // Initialize as a follower. + r.setState(Follower) + + // Start as leader if specified. This should only be used + // for testing purposes. + if conf.StartAsLeader { + r.setState(Leader) + r.setLeader(r.localAddr) + } + + // Restore the current term and the last log. + r.setCurrentTerm(currentTerm) + r.setLastLog(lastLog.Index, lastLog.Term) + + // Attempt to restore a snapshot if there are any. + if err := r.restoreSnapshot(); err != nil { + return nil, err + } + + // Scan through the log for any configuration change entries. + snapshotIndex, _ := r.getLastSnapshot() + for index := snapshotIndex + 1; index <= lastLog.Index; index++ { + var entry Log + if err := r.logs.GetLog(index, &entry); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", index, err) + panic(err) + } + r.processConfigurationLogEntry(&entry) + } + r.logger.Printf("[INFO] raft: Initial configuration (index=%d): %+v", + r.configurations.latestIndex, r.configurations.latest.Servers) + + // Setup a heartbeat fast-path to avoid head-of-line + // blocking where possible. It MUST be safe for this + // to be called concurrently with a blocking RPC. + trans.SetHeartbeatHandler(r.processHeartbeat) + + // Start the background work. + r.goFunc(r.run) + r.goFunc(r.runFSM) + r.goFunc(r.runSnapshots) + return r, nil +} + +// restoreSnapshot attempts to restore the latest snapshots, and fails if none +// of them can be restored. This is called at initialization time, and is +// completely unsafe to call at any other time. +func (r *Raft) restoreSnapshot() error { + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return err + } + + // Try to load in order of newest to oldest + for _, snapshot := range snapshots { + _, source, err := r.snapshots.Open(snapshot.ID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapshot.ID, err) + continue + } + defer source.Close() + + if err := r.fsm.Restore(source); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot %v: %v", snapshot.ID, err) + continue + } + + // Log success + r.logger.Printf("[INFO] raft: Restored from snapshot %v", snapshot.ID) + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(snapshot.Index) + + // Update the last stable snapshot info + r.setLastSnapshot(snapshot.Index, snapshot.Term) + + // Update the configuration + if snapshot.Version > 0 { + r.configurations.committed = snapshot.Configuration + r.configurations.committedIndex = snapshot.ConfigurationIndex + r.configurations.latest = snapshot.Configuration + r.configurations.latestIndex = snapshot.ConfigurationIndex + } else { + configuration := decodePeers(snapshot.Peers, r.trans) + r.configurations.committed = configuration + r.configurations.committedIndex = snapshot.Index + r.configurations.latest = configuration + r.configurations.latestIndex = snapshot.Index + } + + // Success! + return nil + } + + // If we had snapshots and failed to load them, its an error + if len(snapshots) > 0 { + return fmt.Errorf("failed to load any existing snapshots") + } + return nil +} + +// BootstrapCluster is equivalent to non-member BootstrapCluster but can be +// called on an un-bootstrapped Raft instance after it has been created. This +// should only be called at the beginning of time for the cluster, and you +// absolutely must make sure that you call it with the same configuration on all +// the Voter servers. There is no need to bootstrap Nonvoter and Staging +// servers. +func (r *Raft) BootstrapCluster(configuration Configuration) Future { + bootstrapReq := &bootstrapFuture{} + bootstrapReq.init() + bootstrapReq.configuration = configuration + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.bootstrapCh <- bootstrapReq: + return bootstrapReq + } +} + +// Leader is used to return the current leader of the cluster. +// It may return empty string if there is no current leader +// or the leader is unknown. +func (r *Raft) Leader() ServerAddress { + r.leaderLock.RLock() + leader := r.leader + r.leaderLock.RUnlock() + return leader +} + +// Apply is used to apply a command to the FSM in a highly consistent +// manner. This returns a future that can be used to wait on the application. +// An optional timeout can be provided to limit the amount of time we wait +// for the command to be started. This must be run on the leader or it +// will fail. +func (r *Raft) Apply(cmd []byte, timeout time.Duration) ApplyFuture { + metrics.IncrCounter([]string{"raft", "apply"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogCommand, + Data: cmd, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// Barrier is used to issue a command that blocks until all preceeding +// operations have been applied to the FSM. It can be used to ensure the +// FSM reflects all queued writes. An optional timeout can be provided to +// limit the amount of time we wait for the command to be started. This +// must be run on the leader or it will fail. +func (r *Raft) Barrier(timeout time.Duration) Future { + metrics.IncrCounter([]string{"raft", "barrier"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogBarrier, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// VerifyLeader is used to ensure the current node is still +// the leader. This can be done to prevent stale reads when a +// new leader has potentially been elected. +func (r *Raft) VerifyLeader() Future { + metrics.IncrCounter([]string{"raft", "verify_leader"}, 1) + verifyFuture := &verifyFuture{} + verifyFuture.init() + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.verifyCh <- verifyFuture: + return verifyFuture + } +} + +// GetConfiguration returns the latest configuration and its associated index +// currently in use. This may not yet be committed. This must not be called on +// the main thread (which can access the information directly). +func (r *Raft) GetConfiguration() ConfigurationFuture { + configReq := &configurationsFuture{} + configReq.init() + select { + case <-r.shutdownCh: + configReq.respond(ErrRaftShutdown) + return configReq + case r.configurationsCh <- configReq: + return configReq + } +} + +// AddPeer (deprecated) is used to add a new peer into the cluster. This must be +// run on the leader or it will fail. Use AddVoter/AddNonvoter instead. +func (r *Raft) AddPeer(peer ServerAddress) Future { + if r.protocolVersion > 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: AddStaging, + serverID: ServerID(peer), + serverAddress: peer, + prevIndex: 0, + }, 0) +} + +// RemovePeer (deprecated) is used to remove a peer from the cluster. If the +// current leader is being removed, it will cause a new election +// to occur. This must be run on the leader or it will fail. +// Use RemoveServer instead. +func (r *Raft) RemovePeer(peer ServerAddress) Future { + if r.protocolVersion > 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: RemoveServer, + serverID: ServerID(peer), + prevIndex: 0, + }, 0) +} + +// AddVoter will add the given server to the cluster as a staging server. If the +// server is already in the cluster as a voter, this does nothing. This must be +// run on the leader or it will fail. The leader will promote the staging server +// to a voter once that server is ready. If nonzero, prevIndex is the index of +// the only configuration upon which this change may be applied; if another +// configuration entry has been added in the meantime, this request will fail. +// If nonzero, timeout is how long this server should wait before the +// configuration change log entry is appended. +func (r *Raft) AddVoter(id ServerID, address ServerAddress, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: AddStaging, + serverID: id, + serverAddress: address, + prevIndex: prevIndex, + }, timeout) +} + +// AddNonvoter will add the given server to the cluster but won't assign it a +// vote. The server will receive log entries, but it won't participate in +// elections or log entry commitment. If the server is already in the cluster as +// a staging server or voter, this does nothing. This must be run on the leader +// or it will fail. For prevIndex and timeout, see AddVoter. +func (r *Raft) AddNonvoter(id ServerID, address ServerAddress, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 3 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: AddNonvoter, + serverID: id, + serverAddress: address, + prevIndex: prevIndex, + }, timeout) +} + +// RemoveServer will remove the given server from the cluster. If the current +// leader is being removed, it will cause a new election to occur. This must be +// run on the leader or it will fail. For prevIndex and timeout, see AddVoter. +func (r *Raft) RemoveServer(id ServerID, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: RemoveServer, + serverID: id, + prevIndex: prevIndex, + }, timeout) +} + +// DemoteVoter will take away a server's vote, if it has one. If present, the +// server will continue to receive log entries, but it won't participate in +// elections or log entry commitment. If the server is not in the cluster, this +// does nothing. This must be run on the leader or it will fail. For prevIndex +// and timeout, see AddVoter. +func (r *Raft) DemoteVoter(id ServerID, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 3 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: DemoteVoter, + serverID: id, + prevIndex: prevIndex, + }, timeout) +} + +// Shutdown is used to stop the Raft background routines. +// This is not a graceful operation. Provides a future that +// can be used to block until all background routines have exited. +func (r *Raft) Shutdown() Future { + r.shutdownLock.Lock() + defer r.shutdownLock.Unlock() + + if !r.shutdown { + close(r.shutdownCh) + r.shutdown = true + r.setState(Shutdown) + return &shutdownFuture{r} + } + + // avoid closing transport twice + return &shutdownFuture{nil} +} + +// Snapshot is used to manually force Raft to take a snapshot. Returns a future +// that can be used to block until complete, and that contains a function that +// can be used to open the snapshot. +func (r *Raft) Snapshot() SnapshotFuture { + future := &userSnapshotFuture{} + future.init() + select { + case r.userSnapshotCh <- future: + return future + case <-r.shutdownCh: + future.respond(ErrRaftShutdown) + return future + } +} + +// Restore is used to manually force Raft to consume an external snapshot, such +// as if restoring from a backup. We will use the current Raft configuration, +// not the one from the snapshot, so that we can restore into a new cluster. We +// will also use the higher of the index of the snapshot, or the current index, +// and then add 1 to that, so we force a new state with a hole in the Raft log, +// so that the snapshot will be sent to followers and used for any new joiners. +// This can only be run on the leader, and blocks until the restore is complete +// or an error occurs. +// +// WARNING! This operation has the leader take on the state of the snapshot and +// then sets itself up so that it replicates that to its followers though the +// install snapshot process. This involves a potentially dangerous period where +// the leader commits ahead of its followers, so should only be used for disaster +// recovery into a fresh cluster, and should not be used in normal operations. +func (r *Raft) Restore(meta *SnapshotMeta, reader io.Reader, timeout time.Duration) error { + metrics.IncrCounter([]string{"raft", "restore"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Perform the restore. + restore := &userRestoreFuture{ + meta: meta, + reader: reader, + } + restore.init() + select { + case <-timer: + return ErrEnqueueTimeout + case <-r.shutdownCh: + return ErrRaftShutdown + case r.userRestoreCh <- restore: + // If the restore is ingested then wait for it to complete. + if err := restore.Error(); err != nil { + return err + } + } + + // Apply a no-op log entry. Waiting for this allows us to wait until the + // followers have gotten the restore and replicated at least this new + // entry, which shows that we've also faulted and installed the + // snapshot with the contents of the restore. + noop := &logFuture{ + log: Log{ + Type: LogNoop, + }, + } + noop.init() + select { + case <-timer: + return ErrEnqueueTimeout + case <-r.shutdownCh: + return ErrRaftShutdown + case r.applyCh <- noop: + return noop.Error() + } +} + +// State is used to return the current raft state. +func (r *Raft) State() RaftState { + return r.getState() +} + +// LeaderCh is used to get a channel which delivers signals on +// acquiring or losing leadership. It sends true if we become +// the leader, and false if we lose it. The channel is not buffered, +// and does not block on writes. +func (r *Raft) LeaderCh() <-chan bool { + return r.leaderCh +} + +// String returns a string representation of this Raft node. +func (r *Raft) String() string { + return fmt.Sprintf("Node at %s [%v]", r.localAddr, r.getState()) +} + +// LastContact returns the time of last contact by a leader. +// This only makes sense if we are currently a follower. +func (r *Raft) LastContact() time.Time { + r.lastContactLock.RLock() + last := r.lastContact + r.lastContactLock.RUnlock() + return last +} + +// Stats is used to return a map of various internal stats. This +// should only be used for informative purposes or debugging. +// +// Keys are: "state", "term", "last_log_index", "last_log_term", +// "commit_index", "applied_index", "fsm_pending", +// "last_snapshot_index", "last_snapshot_term", +// "latest_configuration", "last_contact", and "num_peers". +// +// The value of "state" is a numerical value representing a +// RaftState const. +// +// The value of "latest_configuration" is a string which contains +// the id of each server, its suffrage status, and its address. +// +// The value of "last_contact" is either "never" if there +// has been no contact with a leader, "0" if the node is in the +// leader state, or the time since last contact with a leader +// formatted as a string. +// +// The value of "num_peers" is the number of other voting servers in the +// cluster, not including this node. If this node isn't part of the +// configuration then this will be "0". +// +// All other values are uint64s, formatted as strings. +func (r *Raft) Stats() map[string]string { + toString := func(v uint64) string { + return strconv.FormatUint(v, 10) + } + lastLogIndex, lastLogTerm := r.getLastLog() + lastSnapIndex, lastSnapTerm := r.getLastSnapshot() + s := map[string]string{ + "state": r.getState().String(), + "term": toString(r.getCurrentTerm()), + "last_log_index": toString(lastLogIndex), + "last_log_term": toString(lastLogTerm), + "commit_index": toString(r.getCommitIndex()), + "applied_index": toString(r.getLastApplied()), + "fsm_pending": toString(uint64(len(r.fsmMutateCh))), + "last_snapshot_index": toString(lastSnapIndex), + "last_snapshot_term": toString(lastSnapTerm), + "protocol_version": toString(uint64(r.protocolVersion)), + "protocol_version_min": toString(uint64(ProtocolVersionMin)), + "protocol_version_max": toString(uint64(ProtocolVersionMax)), + "snapshot_version_min": toString(uint64(SnapshotVersionMin)), + "snapshot_version_max": toString(uint64(SnapshotVersionMax)), + } + + future := r.GetConfiguration() + if err := future.Error(); err != nil { + r.logger.Printf("[WARN] raft: could not get configuration for Stats: %v", err) + } else { + configuration := future.Configuration() + s["latest_configuration_index"] = toString(future.Index()) + s["latest_configuration"] = fmt.Sprintf("%+v", configuration.Servers) + + // This is a legacy metric that we've seen people use in the wild. + hasUs := false + numPeers := 0 + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + if server.ID == r.localID { + hasUs = true + } else { + numPeers++ + } + } + } + if !hasUs { + numPeers = 0 + } + s["num_peers"] = toString(uint64(numPeers)) + } + + last := r.LastContact() + if last.IsZero() { + s["last_contact"] = "never" + } else if r.getState() == Leader { + s["last_contact"] = "0" + } else { + s["last_contact"] = fmt.Sprintf("%v", time.Now().Sub(last)) + } + return s +} + +// LastIndex returns the last index in stable storage, +// either from the last log or from the last snapshot. +func (r *Raft) LastIndex() uint64 { + return r.getLastIndex() +} + +// AppliedIndex returns the last index applied to the FSM. This is generally +// lagging behind the last index, especially for indexes that are persisted but +// have not yet been considered committed by the leader. NOTE - this reflects +// the last index that was sent to the application's FSM over the apply channel +// but DOES NOT mean that the application's FSM has yet consumed it and applied +// it to its internal state. Thus, the application's state may lag behind this +// index. +func (r *Raft) AppliedIndex() uint64 { + return r.getLastApplied() +} diff --git a/vendor/github.com/hashicorp/raft/commands.go b/vendor/github.com/hashicorp/raft/commands.go new file mode 100644 index 000000000..5d89e7bcd --- /dev/null +++ b/vendor/github.com/hashicorp/raft/commands.go @@ -0,0 +1,151 @@ +package raft + +// RPCHeader is a common sub-structure used to pass along protocol version and +// other information about the cluster. For older Raft implementations before +// versioning was added this will default to a zero-valued structure when read +// by newer Raft versions. +type RPCHeader struct { + // ProtocolVersion is the version of the protocol the sender is + // speaking. + ProtocolVersion ProtocolVersion +} + +// WithRPCHeader is an interface that exposes the RPC header. +type WithRPCHeader interface { + GetRPCHeader() RPCHeader +} + +// AppendEntriesRequest is the command used to append entries to the +// replicated log. +type AppendEntriesRequest struct { + RPCHeader + + // Provide the current term and leader + Term uint64 + Leader []byte + + // Provide the previous entries for integrity checking + PrevLogEntry uint64 + PrevLogTerm uint64 + + // New entries to commit + Entries []*Log + + // Commit index on the leader + LeaderCommitIndex uint64 +} + +// See WithRPCHeader. +func (r *AppendEntriesRequest) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// AppendEntriesResponse is the response returned from an +// AppendEntriesRequest. +type AppendEntriesResponse struct { + RPCHeader + + // Newer term if leader is out of date + Term uint64 + + // Last Log is a hint to help accelerate rebuilding slow nodes + LastLog uint64 + + // We may not succeed if we have a conflicting entry + Success bool + + // There are scenarios where this request didn't succeed + // but there's no need to wait/back-off the next attempt. + NoRetryBackoff bool +} + +// See WithRPCHeader. +func (r *AppendEntriesResponse) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// RequestVoteRequest is the command used by a candidate to ask a Raft peer +// for a vote in an election. +type RequestVoteRequest struct { + RPCHeader + + // Provide the term and our id + Term uint64 + Candidate []byte + + // Used to ensure safety + LastLogIndex uint64 + LastLogTerm uint64 +} + +// See WithRPCHeader. +func (r *RequestVoteRequest) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// RequestVoteResponse is the response returned from a RequestVoteRequest. +type RequestVoteResponse struct { + RPCHeader + + // Newer term if leader is out of date. + Term uint64 + + // Peers is deprecated, but required by servers that only understand + // protocol version 0. This is not populated in protocol version 2 + // and later. + Peers []byte + + // Is the vote granted. + Granted bool +} + +// See WithRPCHeader. +func (r *RequestVoteResponse) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its +// log (and state machine) from a snapshot on another peer. +type InstallSnapshotRequest struct { + RPCHeader + SnapshotVersion SnapshotVersion + + Term uint64 + Leader []byte + + // These are the last index/term included in the snapshot + LastLogIndex uint64 + LastLogTerm uint64 + + // Peer Set in the snapshot. This is deprecated in favor of Configuration + // but remains here in case we receive an InstallSnapshot from a leader + // that's running old code. + Peers []byte + + // Cluster membership. + Configuration []byte + // Log index where 'Configuration' entry was originally written. + ConfigurationIndex uint64 + + // Size of the snapshot + Size int64 +} + +// See WithRPCHeader. +func (r *InstallSnapshotRequest) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// InstallSnapshotResponse is the response returned from an +// InstallSnapshotRequest. +type InstallSnapshotResponse struct { + RPCHeader + + Term uint64 + Success bool +} + +// See WithRPCHeader. +func (r *InstallSnapshotResponse) GetRPCHeader() RPCHeader { + return r.RPCHeader +} diff --git a/vendor/github.com/hashicorp/raft/commitment.go b/vendor/github.com/hashicorp/raft/commitment.go new file mode 100644 index 000000000..b5ba2634e --- /dev/null +++ b/vendor/github.com/hashicorp/raft/commitment.go @@ -0,0 +1,101 @@ +package raft + +import ( + "sort" + "sync" +) + +// Commitment is used to advance the leader's commit index. The leader and +// replication goroutines report in newly written entries with Match(), and +// this notifies on commitCh when the commit index has advanced. +type commitment struct { + // protectes matchIndexes and commitIndex + sync.Mutex + // notified when commitIndex increases + commitCh chan struct{} + // voter ID to log index: the server stores up through this log entry + matchIndexes map[ServerID]uint64 + // a quorum stores up through this log entry. monotonically increases. + commitIndex uint64 + // the first index of this leader's term: this needs to be replicated to a + // majority of the cluster before this leader may mark anything committed + // (per Raft's commitment rule) + startIndex uint64 +} + +// newCommitment returns an commitment struct that notifies the provided +// channel when log entries have been committed. A new commitment struct is +// created each time this server becomes leader for a particular term. +// 'configuration' is the servers in the cluster. +// 'startIndex' is the first index created in this term (see +// its description above). +func newCommitment(commitCh chan struct{}, configuration Configuration, startIndex uint64) *commitment { + matchIndexes := make(map[ServerID]uint64) + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + matchIndexes[server.ID] = 0 + } + } + return &commitment{ + commitCh: commitCh, + matchIndexes: matchIndexes, + commitIndex: 0, + startIndex: startIndex, + } +} + +// Called when a new cluster membership configuration is created: it will be +// used to determine commitment from now on. 'configuration' is the servers in +// the cluster. +func (c *commitment) setConfiguration(configuration Configuration) { + c.Lock() + defer c.Unlock() + oldMatchIndexes := c.matchIndexes + c.matchIndexes = make(map[ServerID]uint64) + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + c.matchIndexes[server.ID] = oldMatchIndexes[server.ID] // defaults to 0 + } + } + c.recalculate() +} + +// Called by leader after commitCh is notified +func (c *commitment) getCommitIndex() uint64 { + c.Lock() + defer c.Unlock() + return c.commitIndex +} + +// Match is called once a server completes writing entries to disk: either the +// leader has written the new entry or a follower has replied to an +// AppendEntries RPC. The given server's disk agrees with this server's log up +// through the given index. +func (c *commitment) match(server ServerID, matchIndex uint64) { + c.Lock() + defer c.Unlock() + if prev, hasVote := c.matchIndexes[server]; hasVote && matchIndex > prev { + c.matchIndexes[server] = matchIndex + c.recalculate() + } +} + +// Internal helper to calculate new commitIndex from matchIndexes. +// Must be called with lock held. +func (c *commitment) recalculate() { + if len(c.matchIndexes) == 0 { + return + } + + matched := make([]uint64, 0, len(c.matchIndexes)) + for _, idx := range c.matchIndexes { + matched = append(matched, idx) + } + sort.Sort(uint64Slice(matched)) + quorumMatchIndex := matched[(len(matched)-1)/2] + + if quorumMatchIndex > c.commitIndex && quorumMatchIndex >= c.startIndex { + c.commitIndex = quorumMatchIndex + asyncNotifyCh(c.commitCh) + } +} diff --git a/vendor/github.com/hashicorp/raft/config.go b/vendor/github.com/hashicorp/raft/config.go new file mode 100644 index 000000000..c1ce03ac2 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/config.go @@ -0,0 +1,258 @@ +package raft + +import ( + "fmt" + "io" + "log" + "time" +) + +// These are the versions of the protocol (which includes RPC messages as +// well as Raft-specific log entries) that this server can _understand_. Use +// the ProtocolVersion member of the Config object to control the version of +// the protocol to use when _speaking_ to other servers. Note that depending on +// the protocol version being spoken, some otherwise understood RPC messages +// may be refused. See dispositionRPC for details of this logic. +// +// There are notes about the upgrade path in the description of the versions +// below. If you are starting a fresh cluster then there's no reason not to +// jump right to the latest protocol version. If you need to interoperate with +// older, version 0 Raft servers you'll need to drive the cluster through the +// different versions in order. +// +// The version details are complicated, but here's a summary of what's required +// to get from a version 0 cluster to version 3: +// +// 1. In version N of your app that starts using the new Raft library with +// versioning, set ProtocolVersion to 1. +// 2. Make version N+1 of your app require version N as a prerequisite (all +// servers must be upgraded). For version N+1 of your app set ProtocolVersion +// to 2. +// 3. Similarly, make version N+2 of your app require version N+1 as a +// prerequisite. For version N+2 of your app, set ProtocolVersion to 3. +// +// During this upgrade, older cluster members will still have Server IDs equal +// to their network addresses. To upgrade an older member and give it an ID, it +// needs to leave the cluster and re-enter: +// +// 1. Remove the server from the cluster with RemoveServer, using its network +// address as its ServerID. +// 2. Update the server's config to a better ID (restarting the server). +// 3. Add the server back to the cluster with AddVoter, using its new ID. +// +// You can do this during the rolling upgrade from N+1 to N+2 of your app, or +// as a rolling change at any time after the upgrade. +// +// Version History +// +// 0: Original Raft library before versioning was added. Servers running this +// version of the Raft library use AddPeerDeprecated/RemovePeerDeprecated +// for all configuration changes, and have no support for LogConfiguration. +// 1: First versioned protocol, used to interoperate with old servers, and begin +// the migration path to newer versions of the protocol. Under this version +// all configuration changes are propagated using the now-deprecated +// RemovePeerDeprecated Raft log entry. This means that server IDs are always +// set to be the same as the server addresses (since the old log entry type +// cannot transmit an ID), and only AddPeer/RemovePeer APIs are supported. +// Servers running this version of the protocol can understand the new +// LogConfiguration Raft log entry but will never generate one so they can +// remain compatible with version 0 Raft servers in the cluster. +// 2: Transitional protocol used when migrating an existing cluster to the new +// server ID system. Server IDs are still set to be the same as server +// addresses, but all configuration changes are propagated using the new +// LogConfiguration Raft log entry type, which can carry full ID information. +// This version supports the old AddPeer/RemovePeer APIs as well as the new +// ID-based AddVoter/RemoveServer APIs which should be used when adding +// version 3 servers to the cluster later. This version sheds all +// interoperability with version 0 servers, but can interoperate with newer +// Raft servers running with protocol version 1 since they can understand the +// new LogConfiguration Raft log entry, and this version can still understand +// their RemovePeerDeprecated Raft log entries. We need this protocol version +// as an intermediate step between 1 and 3 so that servers will propagate the +// ID information that will come from newly-added (or -rolled) servers using +// protocol version 3, but since they are still using their address-based IDs +// from the previous step they will still be able to track commitments and +// their own voting status properly. If we skipped this step, servers would +// be started with their new IDs, but they wouldn't see themselves in the old +// address-based configuration, so none of the servers would think they had a +// vote. +// 3: Protocol adding full support for server IDs and new ID-based server APIs +// (AddVoter, AddNonvoter, etc.), old AddPeer/RemovePeer APIs are no longer +// supported. Version 2 servers should be swapped out by removing them from +// the cluster one-by-one and re-adding them with updated configuration for +// this protocol version, along with their server ID. The remove/add cycle +// is required to populate their server ID. Note that removing must be done +// by ID, which will be the old server's address. +type ProtocolVersion int + +const ( + ProtocolVersionMin ProtocolVersion = 0 + ProtocolVersionMax = 3 +) + +// These are versions of snapshots that this server can _understand_. Currently, +// it is always assumed that this server generates the latest version, though +// this may be changed in the future to include a configurable version. +// +// Version History +// +// 0: Original Raft library before versioning was added. The peers portion of +// these snapshots is encoded in the legacy format which requires decodePeers +// to parse. This version of snapshots should only be produced by the +// unversioned Raft library. +// 1: New format which adds support for a full configuration structure and its +// associated log index, with support for server IDs and non-voting server +// modes. To ease upgrades, this also includes the legacy peers structure but +// that will never be used by servers that understand version 1 snapshots. +// Since the original Raft library didn't enforce any versioning, we must +// include the legacy peers structure for this version, but we can deprecate +// it in the next snapshot version. +type SnapshotVersion int + +const ( + SnapshotVersionMin SnapshotVersion = 0 + SnapshotVersionMax = 1 +) + +// Config provides any necessary configuration for the Raft server. +type Config struct { + // ProtocolVersion allows a Raft server to inter-operate with older + // Raft servers running an older version of the code. This is used to + // version the wire protocol as well as Raft-specific log entries that + // the server uses when _speaking_ to other servers. There is currently + // no auto-negotiation of versions so all servers must be manually + // configured with compatible versions. See ProtocolVersionMin and + // ProtocolVersionMax for the versions of the protocol that this server + // can _understand_. + ProtocolVersion ProtocolVersion + + // HeartbeatTimeout specifies the time in follower state without + // a leader before we attempt an election. + HeartbeatTimeout time.Duration + + // ElectionTimeout specifies the time in candidate state without + // a leader before we attempt an election. + ElectionTimeout time.Duration + + // CommitTimeout controls the time without an Apply() operation + // before we heartbeat to ensure a timely commit. Due to random + // staggering, may be delayed as much as 2x this value. + CommitTimeout time.Duration + + // MaxAppendEntries controls the maximum number of append entries + // to send at once. We want to strike a balance between efficiency + // and avoiding waste if the follower is going to reject because of + // an inconsistent log. + MaxAppendEntries int + + // If we are a member of a cluster, and RemovePeer is invoked for the + // local node, then we forget all peers and transition into the follower state. + // If ShutdownOnRemove is is set, we additional shutdown Raft. Otherwise, + // we can become a leader of a cluster containing only this node. + ShutdownOnRemove bool + + // TrailingLogs controls how many logs we leave after a snapshot. This is + // used so that we can quickly replay logs on a follower instead of being + // forced to send an entire snapshot. + TrailingLogs uint64 + + // SnapshotInterval controls how often we check if we should perform a snapshot. + // We randomly stagger between this value and 2x this value to avoid the entire + // cluster from performing a snapshot at once. + SnapshotInterval time.Duration + + // SnapshotThreshold controls how many outstanding logs there must be before + // we perform a snapshot. This is to prevent excessive snapshots when we can + // just replay a small set of logs. + SnapshotThreshold uint64 + + // LeaderLeaseTimeout is used to control how long the "lease" lasts + // for being the leader without being able to contact a quorum + // of nodes. If we reach this interval without contact, we will + // step down as leader. + LeaderLeaseTimeout time.Duration + + // StartAsLeader forces Raft to start in the leader state. This should + // never be used except for testing purposes, as it can cause a split-brain. + StartAsLeader bool + + // The unique ID for this server across all time. When running with + // ProtocolVersion < 3, you must set this to be the same as the network + // address of your transport. + LocalID ServerID + + // NotifyCh is used to provide a channel that will be notified of leadership + // changes. Raft will block writing to this channel, so it should either be + // buffered or aggressively consumed. + NotifyCh chan<- bool + + // LogOutput is used as a sink for logs, unless Logger is specified. + // Defaults to os.Stderr. + LogOutput io.Writer + + // Logger is a user-provided logger. If nil, a logger writing to LogOutput + // is used. + Logger *log.Logger +} + +// DefaultConfig returns a Config with usable defaults. +func DefaultConfig() *Config { + return &Config{ + ProtocolVersion: ProtocolVersionMax, + HeartbeatTimeout: 1000 * time.Millisecond, + ElectionTimeout: 1000 * time.Millisecond, + CommitTimeout: 50 * time.Millisecond, + MaxAppendEntries: 64, + ShutdownOnRemove: true, + TrailingLogs: 10240, + SnapshotInterval: 120 * time.Second, + SnapshotThreshold: 8192, + LeaderLeaseTimeout: 500 * time.Millisecond, + } +} + +// ValidateConfig is used to validate a sane configuration +func ValidateConfig(config *Config) error { + // We don't actually support running as 0 in the library any more, but + // we do understand it. + protocolMin := ProtocolVersionMin + if protocolMin == 0 { + protocolMin = 1 + } + if config.ProtocolVersion < protocolMin || + config.ProtocolVersion > ProtocolVersionMax { + return fmt.Errorf("Protocol version %d must be >= %d and <= %d", + config.ProtocolVersion, protocolMin, ProtocolVersionMax) + } + if len(config.LocalID) == 0 { + return fmt.Errorf("LocalID cannot be empty") + } + if config.HeartbeatTimeout < 5*time.Millisecond { + return fmt.Errorf("Heartbeat timeout is too low") + } + if config.ElectionTimeout < 5*time.Millisecond { + return fmt.Errorf("Election timeout is too low") + } + if config.CommitTimeout < time.Millisecond { + return fmt.Errorf("Commit timeout is too low") + } + if config.MaxAppendEntries <= 0 { + return fmt.Errorf("MaxAppendEntries must be positive") + } + if config.MaxAppendEntries > 1024 { + return fmt.Errorf("MaxAppendEntries is too large") + } + if config.SnapshotInterval < 5*time.Millisecond { + return fmt.Errorf("Snapshot interval is too low") + } + if config.LeaderLeaseTimeout < 5*time.Millisecond { + return fmt.Errorf("Leader lease timeout is too low") + } + if config.LeaderLeaseTimeout > config.HeartbeatTimeout { + return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout") + } + if config.ElectionTimeout < config.HeartbeatTimeout { + return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout") + } + return nil +} diff --git a/vendor/github.com/hashicorp/raft/configuration.go b/vendor/github.com/hashicorp/raft/configuration.go new file mode 100644 index 000000000..74508c5e5 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/configuration.go @@ -0,0 +1,343 @@ +package raft + +import "fmt" + +// ServerSuffrage determines whether a Server in a Configuration gets a vote. +type ServerSuffrage int + +// Note: Don't renumber these, since the numbers are written into the log. +const ( + // Voter is a server whose vote is counted in elections and whose match index + // is used in advancing the leader's commit index. + Voter ServerSuffrage = iota + // Nonvoter is a server that receives log entries but is not considered for + // elections or commitment purposes. + Nonvoter + // Staging is a server that acts like a nonvoter with one exception: once a + // staging server receives enough log entries to be sufficiently caught up to + // the leader's log, the leader will invoke a membership change to change + // the Staging server to a Voter. + Staging +) + +func (s ServerSuffrage) String() string { + switch s { + case Voter: + return "Voter" + case Nonvoter: + return "Nonvoter" + case Staging: + return "Staging" + } + return "ServerSuffrage" +} + +// ServerID is a unique string identifying a server for all time. +type ServerID string + +// ServerAddress is a network address for a server that a transport can contact. +type ServerAddress string + +// Server tracks the information about a single server in a configuration. +type Server struct { + // Suffrage determines whether the server gets a vote. + Suffrage ServerSuffrage + // ID is a unique string identifying this server for all time. + ID ServerID + // Address is its network address that a transport can contact. + Address ServerAddress +} + +// Configuration tracks which servers are in the cluster, and whether they have +// votes. This should include the local server, if it's a member of the cluster. +// The servers are listed no particular order, but each should only appear once. +// These entries are appended to the log during membership changes. +type Configuration struct { + Servers []Server +} + +// Clone makes a deep copy of a Configuration. +func (c *Configuration) Clone() (copy Configuration) { + copy.Servers = append(copy.Servers, c.Servers...) + return +} + +// ConfigurationChangeCommand is the different ways to change the cluster +// configuration. +type ConfigurationChangeCommand uint8 + +const ( + // AddStaging makes a server Staging unless its Voter. + AddStaging ConfigurationChangeCommand = iota + // AddNonvoter makes a server Nonvoter unless its Staging or Voter. + AddNonvoter + // DemoteVoter makes a server Nonvoter unless its absent. + DemoteVoter + // RemoveServer removes a server entirely from the cluster membership. + RemoveServer + // Promote is created automatically by a leader; it turns a Staging server + // into a Voter. + Promote +) + +func (c ConfigurationChangeCommand) String() string { + switch c { + case AddStaging: + return "AddStaging" + case AddNonvoter: + return "AddNonvoter" + case DemoteVoter: + return "DemoteVoter" + case RemoveServer: + return "RemoveServer" + case Promote: + return "Promote" + } + return "ConfigurationChangeCommand" +} + +// configurationChangeRequest describes a change that a leader would like to +// make to its current configuration. It's used only within a single server +// (never serialized into the log), as part of `configurationChangeFuture`. +type configurationChangeRequest struct { + command ConfigurationChangeCommand + serverID ServerID + serverAddress ServerAddress // only present for AddStaging, AddNonvoter + // prevIndex, if nonzero, is the index of the only configuration upon which + // this change may be applied; if another configuration entry has been + // added in the meantime, this request will fail. + prevIndex uint64 +} + +// configurations is state tracked on every server about its Configurations. +// Note that, per Diego's dissertation, there can be at most one uncommitted +// configuration at a time (the next configuration may not be created until the +// prior one has been committed). +// +// One downside to storing just two configurations is that if you try to take a +// snahpsot when your state machine hasn't yet applied the committedIndex, we +// have no record of the configuration that would logically fit into that +// snapshot. We disallow snapshots in that case now. An alternative approach, +// which LogCabin uses, is to track every configuration change in the +// log. +type configurations struct { + // committed is the latest configuration in the log/snapshot that has been + // committed (the one with the largest index). + committed Configuration + // committedIndex is the log index where 'committed' was written. + committedIndex uint64 + // latest is the latest configuration in the log/snapshot (may be committed + // or uncommitted) + latest Configuration + // latestIndex is the log index where 'latest' was written. + latestIndex uint64 +} + +// Clone makes a deep copy of a configurations object. +func (c *configurations) Clone() (copy configurations) { + copy.committed = c.committed.Clone() + copy.committedIndex = c.committedIndex + copy.latest = c.latest.Clone() + copy.latestIndex = c.latestIndex + return +} + +// hasVote returns true if the server identified by 'id' is a Voter in the +// provided Configuration. +func hasVote(configuration Configuration, id ServerID) bool { + for _, server := range configuration.Servers { + if server.ID == id { + return server.Suffrage == Voter + } + } + return false +} + +// checkConfiguration tests a cluster membership configuration for common +// errors. +func checkConfiguration(configuration Configuration) error { + idSet := make(map[ServerID]bool) + addressSet := make(map[ServerAddress]bool) + var voters int + for _, server := range configuration.Servers { + if server.ID == "" { + return fmt.Errorf("Empty ID in configuration: %v", configuration) + } + if server.Address == "" { + return fmt.Errorf("Empty address in configuration: %v", server) + } + if idSet[server.ID] { + return fmt.Errorf("Found duplicate ID in configuration: %v", server.ID) + } + idSet[server.ID] = true + if addressSet[server.Address] { + return fmt.Errorf("Found duplicate address in configuration: %v", server.Address) + } + addressSet[server.Address] = true + if server.Suffrage == Voter { + voters++ + } + } + if voters == 0 { + return fmt.Errorf("Need at least one voter in configuration: %v", configuration) + } + return nil +} + +// nextConfiguration generates a new Configuration from the current one and a +// configuration change request. It's split from appendConfigurationEntry so +// that it can be unit tested easily. +func nextConfiguration(current Configuration, currentIndex uint64, change configurationChangeRequest) (Configuration, error) { + if change.prevIndex > 0 && change.prevIndex != currentIndex { + return Configuration{}, fmt.Errorf("Configuration changed since %v (latest is %v)", change.prevIndex, currentIndex) + } + + configuration := current.Clone() + switch change.command { + case AddStaging: + // TODO: barf on new address? + newServer := Server{ + // TODO: This should add the server as Staging, to be automatically + // promoted to Voter later. However, the promoton to Voter is not yet + // implemented, and doing so is not trivial with the way the leader loop + // coordinates with the replication goroutines today. So, for now, the + // server will have a vote right away, and the Promote case below is + // unused. + Suffrage: Voter, + ID: change.serverID, + Address: change.serverAddress, + } + found := false + for i, server := range configuration.Servers { + if server.ID == change.serverID { + if server.Suffrage == Voter { + configuration.Servers[i].Address = change.serverAddress + } else { + configuration.Servers[i] = newServer + } + found = true + break + } + } + if !found { + configuration.Servers = append(configuration.Servers, newServer) + } + case AddNonvoter: + newServer := Server{ + Suffrage: Nonvoter, + ID: change.serverID, + Address: change.serverAddress, + } + found := false + for i, server := range configuration.Servers { + if server.ID == change.serverID { + if server.Suffrage != Nonvoter { + configuration.Servers[i].Address = change.serverAddress + } else { + configuration.Servers[i] = newServer + } + found = true + break + } + } + if !found { + configuration.Servers = append(configuration.Servers, newServer) + } + case DemoteVoter: + for i, server := range configuration.Servers { + if server.ID == change.serverID { + configuration.Servers[i].Suffrage = Nonvoter + break + } + } + case RemoveServer: + for i, server := range configuration.Servers { + if server.ID == change.serverID { + configuration.Servers = append(configuration.Servers[:i], configuration.Servers[i+1:]...) + break + } + } + case Promote: + for i, server := range configuration.Servers { + if server.ID == change.serverID && server.Suffrage == Staging { + configuration.Servers[i].Suffrage = Voter + break + } + } + } + + // Make sure we didn't do something bad like remove the last voter + if err := checkConfiguration(configuration); err != nil { + return Configuration{}, err + } + + return configuration, nil +} + +// encodePeers is used to serialize a Configuration into the old peers format. +// This is here for backwards compatibility when operating with a mix of old +// servers and should be removed once we deprecate support for protocol version 1. +func encodePeers(configuration Configuration, trans Transport) []byte { + // Gather up all the voters, other suffrage types are not supported by + // this data format. + var encPeers [][]byte + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + encPeers = append(encPeers, trans.EncodePeer(server.Address)) + } + } + + // Encode the entire array. + buf, err := encodeMsgPack(encPeers) + if err != nil { + panic(fmt.Errorf("failed to encode peers: %v", err)) + } + + return buf.Bytes() +} + +// decodePeers is used to deserialize an old list of peers into a Configuration. +// This is here for backwards compatibility with old log entries and snapshots; +// it should be removed eventually. +func decodePeers(buf []byte, trans Transport) Configuration { + // Decode the buffer first. + var encPeers [][]byte + if err := decodeMsgPack(buf, &encPeers); err != nil { + panic(fmt.Errorf("failed to decode peers: %v", err)) + } + + // Deserialize each peer. + var servers []Server + for _, enc := range encPeers { + p := trans.DecodePeer(enc) + servers = append(servers, Server{ + Suffrage: Voter, + ID: ServerID(p), + Address: ServerAddress(p), + }) + } + + return Configuration{ + Servers: servers, + } +} + +// encodeConfiguration serializes a Configuration using MsgPack, or panics on +// errors. +func encodeConfiguration(configuration Configuration) []byte { + buf, err := encodeMsgPack(configuration) + if err != nil { + panic(fmt.Errorf("failed to encode configuration: %v", err)) + } + return buf.Bytes() +} + +// decodeConfiguration deserializes a Configuration using MsgPack, or panics on +// errors. +func decodeConfiguration(buf []byte) Configuration { + var configuration Configuration + if err := decodeMsgPack(buf, &configuration); err != nil { + panic(fmt.Errorf("failed to decode configuration: %v", err)) + } + return configuration +} diff --git a/vendor/github.com/hashicorp/raft/discard_snapshot.go b/vendor/github.com/hashicorp/raft/discard_snapshot.go new file mode 100644 index 000000000..5e93a9fe0 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/discard_snapshot.go @@ -0,0 +1,49 @@ +package raft + +import ( + "fmt" + "io" +) + +// DiscardSnapshotStore is used to successfully snapshot while +// always discarding the snapshot. This is useful for when the +// log should be truncated but no snapshot should be retained. +// This should never be used for production use, and is only +// suitable for testing. +type DiscardSnapshotStore struct{} + +type DiscardSnapshotSink struct{} + +// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore. +func NewDiscardSnapshotStore() *DiscardSnapshotStore { + return &DiscardSnapshotStore{} +} + +func (d *DiscardSnapshotStore) Create(version SnapshotVersion, index, term uint64, + configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) { + return &DiscardSnapshotSink{}, nil +} + +func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) { + return nil, nil +} + +func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + return nil, nil, fmt.Errorf("open is not supported") +} + +func (d *DiscardSnapshotSink) Write(b []byte) (int, error) { + return len(b), nil +} + +func (d *DiscardSnapshotSink) Close() error { + return nil +} + +func (d *DiscardSnapshotSink) ID() string { + return "discard" +} + +func (d *DiscardSnapshotSink) Cancel() error { + return nil +} diff --git a/vendor/github.com/hashicorp/raft/file_snapshot.go b/vendor/github.com/hashicorp/raft/file_snapshot.go new file mode 100644 index 000000000..17d080134 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/file_snapshot.go @@ -0,0 +1,494 @@ +package raft + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash" + "hash/crc64" + "io" + "io/ioutil" + "log" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +const ( + testPath = "permTest" + snapPath = "snapshots" + metaFilePath = "meta.json" + stateFilePath = "state.bin" + tmpSuffix = ".tmp" +) + +// FileSnapshotStore implements the SnapshotStore interface and allows +// snapshots to be made on the local disk. +type FileSnapshotStore struct { + path string + retain int + logger *log.Logger +} + +type snapMetaSlice []*fileSnapshotMeta + +// FileSnapshotSink implements SnapshotSink with a file. +type FileSnapshotSink struct { + store *FileSnapshotStore + logger *log.Logger + dir string + meta fileSnapshotMeta + + stateFile *os.File + stateHash hash.Hash64 + buffered *bufio.Writer + + closed bool +} + +// fileSnapshotMeta is stored on disk. We also put a CRC +// on disk so that we can verify the snapshot. +type fileSnapshotMeta struct { + SnapshotMeta + CRC []byte +} + +// bufferedFile is returned when we open a snapshot. This way +// reads are buffered and the file still gets closed. +type bufferedFile struct { + bh *bufio.Reader + fh *os.File +} + +func (b *bufferedFile) Read(p []byte) (n int, err error) { + return b.bh.Read(p) +} + +func (b *bufferedFile) Close() error { + return b.fh.Close() +} + +// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) { + if retain < 1 { + return nil, fmt.Errorf("must retain at least one snapshot") + } + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + + // Ensure our path exists + path := filepath.Join(base, snapPath) + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("snapshot path not accessible: %v", err) + } + + // Setup the store + store := &FileSnapshotStore{ + path: path, + retain: retain, + logger: logger, + } + + // Do a permissions test + if err := store.testPermissions(); err != nil { + return nil, fmt.Errorf("permissions test failed: %v", err) + } + return store, nil +} + +// NewFileSnapshotStore creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) { + if logOutput == nil { + logOutput = os.Stderr + } + return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags)) +} + +// testPermissions tries to touch a file in our path to see if it works. +func (f *FileSnapshotStore) testPermissions() error { + path := filepath.Join(f.path, testPath) + fh, err := os.Create(path) + if err != nil { + return err + } + + if err = fh.Close(); err != nil { + return err + } + + if err = os.Remove(path); err != nil { + return err + } + return nil +} + +// snapshotName generates a name for the snapshot. +func snapshotName(term, index uint64) string { + now := time.Now() + msec := now.UnixNano() / int64(time.Millisecond) + return fmt.Sprintf("%d-%d-%d", term, index, msec) +} + +// Create is used to start a new snapshot +func (f *FileSnapshotStore) Create(version SnapshotVersion, index, term uint64, + configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) { + // We only support version 1 snapshots at this time. + if version != 1 { + return nil, fmt.Errorf("unsupported snapshot version %d", version) + } + + // Create a new path + name := snapshotName(term, index) + path := filepath.Join(f.path, name+tmpSuffix) + f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path) + + // Make the directory + if err := os.MkdirAll(path, 0755); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err) + return nil, err + } + + // Create the sink + sink := &FileSnapshotSink{ + store: f, + logger: f.logger, + dir: path, + meta: fileSnapshotMeta{ + SnapshotMeta: SnapshotMeta{ + Version: version, + ID: name, + Index: index, + Term: term, + Peers: encodePeers(configuration, trans), + Configuration: configuration, + ConfigurationIndex: configurationIndex, + }, + CRC: nil, + }, + } + + // Write out the meta data + if err := sink.writeMeta(); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return nil, err + } + + // Open the state file + statePath := filepath.Join(path, stateFilePath) + fh, err := os.Create(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err) + return nil, err + } + sink.stateFile = fh + + // Create a CRC64 hash + sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Wrap both the hash and file in a MultiWriter with buffering + multi := io.MultiWriter(sink.stateFile, sink.stateHash) + sink.buffered = bufio.NewWriter(multi) + + // Done + return sink, nil +} + +// List returns available snapshots in the store. +func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return nil, err + } + + var snapMeta []*SnapshotMeta + for _, meta := range snapshots { + snapMeta = append(snapMeta, &meta.SnapshotMeta) + if len(snapMeta) == f.retain { + break + } + } + return snapMeta, nil +} + +// getSnapshots returns all the known snapshots. +func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := ioutil.ReadDir(f.path) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err) + return nil, err + } + + // Populate the metadata + var snapMeta []*fileSnapshotMeta + for _, snap := range snapshots { + // Ignore any files + if !snap.IsDir() { + continue + } + + // Ignore any temporary snapshots + dirName := snap.Name() + if strings.HasSuffix(dirName, tmpSuffix) { + f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName) + continue + } + + // Try to read the meta data + meta, err := f.readMeta(dirName) + if err != nil { + f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err) + continue + } + + // Make sure we can understand this version. + if meta.Version < SnapshotVersionMin || meta.Version > SnapshotVersionMax { + f.logger.Printf("[WARN] snapshot: Snapshot version for %v not supported: %d", dirName, meta.Version) + continue + } + + // Append, but only return up to the retain count + snapMeta = append(snapMeta, meta) + } + + // Sort the snapshot, reverse so we get new -> old + sort.Sort(sort.Reverse(snapMetaSlice(snapMeta))) + + return snapMeta, nil +} + +// readMeta is used to read the meta data for a given named backup +func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) { + // Open the meta file + metaPath := filepath.Join(f.path, name, metaFilePath) + fh, err := os.Open(metaPath) + if err != nil { + return nil, err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewReader(fh) + + // Read in the JSON + meta := &fileSnapshotMeta{} + dec := json.NewDecoder(buffered) + if err := dec.Decode(meta); err != nil { + return nil, err + } + return meta, nil +} + +// Open takes a snapshot ID and returns a ReadCloser for that snapshot. +func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + // Get the metadata + meta, err := f.readMeta(id) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err) + return nil, nil, err + } + + // Open the state file + statePath := filepath.Join(f.path, id, stateFilePath) + fh, err := os.Open(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err) + return nil, nil, err + } + + // Create a CRC64 hash + stateHash := crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Compute the hash + _, err = io.Copy(stateHash, fh) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err) + fh.Close() + return nil, nil, err + } + + // Verify the hash + computed := stateHash.Sum(nil) + if bytes.Compare(meta.CRC, computed) != 0 { + f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)", + meta.CRC, computed) + fh.Close() + return nil, nil, fmt.Errorf("CRC mismatch") + } + + // Seek to the start + if _, err := fh.Seek(0, 0); err != nil { + f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err) + fh.Close() + return nil, nil, err + } + + // Return a buffered file + buffered := &bufferedFile{ + bh: bufio.NewReader(fh), + fh: fh, + } + + return &meta.SnapshotMeta, buffered, nil +} + +// ReapSnapshots reaps any snapshots beyond the retain count. +func (f *FileSnapshotStore) ReapSnapshots() error { + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return err + } + + for i := f.retain; i < len(snapshots); i++ { + path := filepath.Join(f.path, snapshots[i].ID) + f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path) + if err := os.RemoveAll(path); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err) + return err + } + } + return nil +} + +// ID returns the ID of the snapshot, can be used with Open() +// after the snapshot is finalized. +func (s *FileSnapshotSink) ID() string { + return s.meta.ID +} + +// Write is used to append to the state file. We write to the +// buffered IO object to reduce the amount of context switches. +func (s *FileSnapshotSink) Write(b []byte) (int, error) { + return s.buffered.Write(b) +} + +// Close is used to indicate a successful end. +func (s *FileSnapshotSink) Close() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Write out the meta data + if err := s.writeMeta(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return err + } + + // Move the directory into place + newPath := strings.TrimSuffix(s.dir, tmpSuffix) + if err := os.Rename(s.dir, newPath); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err) + return err + } + + // Reap any old snapshots + if err := s.store.ReapSnapshots(); err != nil { + return err + } + + return nil +} + +// Cancel is used to indicate an unsuccessful end. +func (s *FileSnapshotSink) Cancel() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Attempt to remove all artifacts + return os.RemoveAll(s.dir) +} + +// finalize is used to close all of our resources. +func (s *FileSnapshotSink) finalize() error { + // Flush any remaining data + if err := s.buffered.Flush(); err != nil { + return err + } + + // Get the file size + stat, statErr := s.stateFile.Stat() + + // Close the file + if err := s.stateFile.Close(); err != nil { + return err + } + + // Set the file size, check after we close + if statErr != nil { + return statErr + } + s.meta.Size = stat.Size() + + // Set the CRC + s.meta.CRC = s.stateHash.Sum(nil) + return nil +} + +// writeMeta is used to write out the metadata we have. +func (s *FileSnapshotSink) writeMeta() error { + // Open the meta file + metaPath := filepath.Join(s.dir, metaFilePath) + fh, err := os.Create(metaPath) + if err != nil { + return err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewWriter(fh) + defer buffered.Flush() + + // Write out as JSON + enc := json.NewEncoder(buffered) + if err := enc.Encode(&s.meta); err != nil { + return err + } + return nil +} + +// Implement the sort interface for []*fileSnapshotMeta. +func (s snapMetaSlice) Len() int { + return len(s) +} + +func (s snapMetaSlice) Less(i, j int) bool { + if s[i].Term != s[j].Term { + return s[i].Term < s[j].Term + } + if s[i].Index != s[j].Index { + return s[i].Index < s[j].Index + } + return s[i].ID < s[j].ID +} + +func (s snapMetaSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} diff --git a/vendor/github.com/hashicorp/raft/fsm.go b/vendor/github.com/hashicorp/raft/fsm.go new file mode 100644 index 000000000..c89986c0f --- /dev/null +++ b/vendor/github.com/hashicorp/raft/fsm.go @@ -0,0 +1,136 @@ +package raft + +import ( + "fmt" + "io" + "time" + + "github.com/armon/go-metrics" +) + +// FSM provides an interface that can be implemented by +// clients to make use of the replicated log. +type FSM interface { + // Apply log is invoked once a log entry is committed. + // It returns a value which will be made available in the + // ApplyFuture returned by Raft.Apply method if that + // method was called on the same Raft node as the FSM. + Apply(*Log) interface{} + + // Snapshot is used to support log compaction. This call should + // return an FSMSnapshot which can be used to save a point-in-time + // snapshot of the FSM. Apply and Snapshot are not called in multiple + // threads, but Apply will be called concurrently with Persist. This means + // the FSM should be implemented in a fashion that allows for concurrent + // updates while a snapshot is happening. + Snapshot() (FSMSnapshot, error) + + // Restore is used to restore an FSM from a snapshot. It is not called + // concurrently with any other command. The FSM must discard all previous + // state. + Restore(io.ReadCloser) error +} + +// FSMSnapshot is returned by an FSM in response to a Snapshot +// It must be safe to invoke FSMSnapshot methods with concurrent +// calls to Apply. +type FSMSnapshot interface { + // Persist should dump all necessary state to the WriteCloser 'sink', + // and call sink.Close() when finished or call sink.Cancel() on error. + Persist(sink SnapshotSink) error + + // Release is invoked when we are finished with the snapshot. + Release() +} + +// runFSM is a long running goroutine responsible for applying logs +// to the FSM. This is done async of other logs since we don't want +// the FSM to block our internal operations. +func (r *Raft) runFSM() { + var lastIndex, lastTerm uint64 + + commit := func(req *commitTuple) { + // Apply the log if a command + var resp interface{} + if req.log.Type == LogCommand { + start := time.Now() + resp = r.fsm.Apply(req.log) + metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start) + } + + // Update the indexes + lastIndex = req.log.Index + lastTerm = req.log.Term + + // Invoke the future if given + if req.future != nil { + req.future.response = resp + req.future.respond(nil) + } + } + + restore := func(req *restoreFuture) { + // Open the snapshot + meta, source, err := r.snapshots.Open(req.ID) + if err != nil { + req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err)) + return + } + + // Attempt to restore + start := time.Now() + if err := r.fsm.Restore(source); err != nil { + req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err)) + source.Close() + return + } + source.Close() + metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start) + + // Update the last index and term + lastIndex = meta.Index + lastTerm = meta.Term + req.respond(nil) + } + + snapshot := func(req *reqSnapshotFuture) { + // Is there something to snapshot? + if lastIndex == 0 { + req.respond(ErrNothingNewToSnapshot) + return + } + + // Start a snapshot + start := time.Now() + snap, err := r.fsm.Snapshot() + metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start) + + // Respond to the request + req.index = lastIndex + req.term = lastTerm + req.snapshot = snap + req.respond(err) + } + + for { + select { + case ptr := <-r.fsmMutateCh: + switch req := ptr.(type) { + case *commitTuple: + commit(req) + + case *restoreFuture: + restore(req) + + default: + panic(fmt.Errorf("bad type passed to fsmMutateCh: %#v", ptr)) + } + + case req := <-r.fsmSnapshotCh: + snapshot(req) + + case <-r.shutdownCh: + return + } + } +} diff --git a/vendor/github.com/hashicorp/raft/future.go b/vendor/github.com/hashicorp/raft/future.go new file mode 100644 index 000000000..fac59a5cc --- /dev/null +++ b/vendor/github.com/hashicorp/raft/future.go @@ -0,0 +1,289 @@ +package raft + +import ( + "fmt" + "io" + "sync" + "time" +) + +// Future is used to represent an action that may occur in the future. +type Future interface { + // Error blocks until the future arrives and then + // returns the error status of the future. + // This may be called any number of times - all + // calls will return the same value. + // Note that it is not OK to call this method + // twice concurrently on the same Future instance. + Error() error +} + +// IndexFuture is used for future actions that can result in a raft log entry +// being created. +type IndexFuture interface { + Future + + // Index holds the index of the newly applied log entry. + // This must not be called until after the Error method has returned. + Index() uint64 +} + +// ApplyFuture is used for Apply and can return the FSM response. +type ApplyFuture interface { + IndexFuture + + // Response returns the FSM response as returned + // by the FSM.Apply method. This must not be called + // until after the Error method has returned. + Response() interface{} +} + +// ConfigurationFuture is used for GetConfiguration and can return the +// latest configuration in use by Raft. +type ConfigurationFuture interface { + IndexFuture + + // Configuration contains the latest configuration. This must + // not be called until after the Error method has returned. + Configuration() Configuration +} + +// SnapshotFuture is used for waiting on a user-triggered snapshot to complete. +type SnapshotFuture interface { + Future + + // Open is a function you can call to access the underlying snapshot and + // its metadata. This must not be called until after the Error method + // has returned. + Open() (*SnapshotMeta, io.ReadCloser, error) +} + +// errorFuture is used to return a static error. +type errorFuture struct { + err error +} + +func (e errorFuture) Error() error { + return e.err +} + +func (e errorFuture) Response() interface{} { + return nil +} + +func (e errorFuture) Index() uint64 { + return 0 +} + +// deferError can be embedded to allow a future +// to provide an error in the future. +type deferError struct { + err error + errCh chan error + responded bool +} + +func (d *deferError) init() { + d.errCh = make(chan error, 1) +} + +func (d *deferError) Error() error { + if d.err != nil { + // Note that when we've received a nil error, this + // won't trigger, but the channel is closed after + // send so we'll still return nil below. + return d.err + } + if d.errCh == nil { + panic("waiting for response on nil channel") + } + d.err = <-d.errCh + return d.err +} + +func (d *deferError) respond(err error) { + if d.errCh == nil { + return + } + if d.responded { + return + } + d.errCh <- err + close(d.errCh) + d.responded = true +} + +// There are several types of requests that cause a configuration entry to +// be appended to the log. These are encoded here for leaderLoop() to process. +// This is internal to a single server. +type configurationChangeFuture struct { + logFuture + req configurationChangeRequest +} + +// bootstrapFuture is used to attempt a live bootstrap of the cluster. See the +// Raft object's BootstrapCluster member function for more details. +type bootstrapFuture struct { + deferError + + // configuration is the proposed bootstrap configuration to apply. + configuration Configuration +} + +// logFuture is used to apply a log entry and waits until +// the log is considered committed. +type logFuture struct { + deferError + log Log + response interface{} + dispatch time.Time +} + +func (l *logFuture) Response() interface{} { + return l.response +} + +func (l *logFuture) Index() uint64 { + return l.log.Index +} + +type shutdownFuture struct { + raft *Raft +} + +func (s *shutdownFuture) Error() error { + if s.raft == nil { + return nil + } + s.raft.waitShutdown() + if closeable, ok := s.raft.trans.(WithClose); ok { + closeable.Close() + } + return nil +} + +// userSnapshotFuture is used for waiting on a user-triggered snapshot to +// complete. +type userSnapshotFuture struct { + deferError + + // opener is a function used to open the snapshot. This is filled in + // once the future returns with no error. + opener func() (*SnapshotMeta, io.ReadCloser, error) +} + +// Open is a function you can call to access the underlying snapshot and its +// metadata. +func (u *userSnapshotFuture) Open() (*SnapshotMeta, io.ReadCloser, error) { + if u.opener == nil { + return nil, nil, fmt.Errorf("no snapshot available") + } else { + // Invalidate the opener so it can't get called multiple times, + // which isn't generally safe. + defer func() { + u.opener = nil + }() + return u.opener() + } +} + +// userRestoreFuture is used for waiting on a user-triggered restore of an +// external snapshot to complete. +type userRestoreFuture struct { + deferError + + // meta is the metadata that belongs with the snapshot. + meta *SnapshotMeta + + // reader is the interface to read the snapshot contents from. + reader io.Reader +} + +// reqSnapshotFuture is used for requesting a snapshot start. +// It is only used internally. +type reqSnapshotFuture struct { + deferError + + // snapshot details provided by the FSM runner before responding + index uint64 + term uint64 + snapshot FSMSnapshot +} + +// restoreFuture is used for requesting an FSM to perform a +// snapshot restore. Used internally only. +type restoreFuture struct { + deferError + ID string +} + +// verifyFuture is used to verify the current node is still +// the leader. This is to prevent a stale read. +type verifyFuture struct { + deferError + notifyCh chan *verifyFuture + quorumSize int + votes int + voteLock sync.Mutex +} + +// configurationsFuture is used to retrieve the current configurations. This is +// used to allow safe access to this information outside of the main thread. +type configurationsFuture struct { + deferError + configurations configurations +} + +// Configuration returns the latest configuration in use by Raft. +func (c *configurationsFuture) Configuration() Configuration { + return c.configurations.latest +} + +// Index returns the index of the latest configuration in use by Raft. +func (c *configurationsFuture) Index() uint64 { + return c.configurations.latestIndex +} + +// vote is used to respond to a verifyFuture. +// This may block when responding on the notifyCh. +func (v *verifyFuture) vote(leader bool) { + v.voteLock.Lock() + defer v.voteLock.Unlock() + + // Guard against having notified already + if v.notifyCh == nil { + return + } + + if leader { + v.votes++ + if v.votes >= v.quorumSize { + v.notifyCh <- v + v.notifyCh = nil + } + } else { + v.notifyCh <- v + v.notifyCh = nil + } +} + +// appendFuture is used for waiting on a pipelined append +// entries RPC. +type appendFuture struct { + deferError + start time.Time + args *AppendEntriesRequest + resp *AppendEntriesResponse +} + +func (a *appendFuture) Start() time.Time { + return a.start +} + +func (a *appendFuture) Request() *AppendEntriesRequest { + return a.args +} + +func (a *appendFuture) Response() *AppendEntriesResponse { + return a.resp +} diff --git a/vendor/github.com/hashicorp/raft/inmem_snapshot.go b/vendor/github.com/hashicorp/raft/inmem_snapshot.go new file mode 100644 index 000000000..3aa92b3e9 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inmem_snapshot.go @@ -0,0 +1,106 @@ +package raft + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "sync" +) + +// InmemSnapshotStore implements the SnapshotStore interface and +// retains only the most recent snapshot +type InmemSnapshotStore struct { + latest *InmemSnapshotSink + hasSnapshot bool + sync.RWMutex +} + +// InmemSnapshotSink implements SnapshotSink in memory +type InmemSnapshotSink struct { + meta SnapshotMeta + contents *bytes.Buffer +} + +// NewInmemSnapshotStore creates a blank new InmemSnapshotStore +func NewInmemSnapshotStore() *InmemSnapshotStore { + return &InmemSnapshotStore{ + latest: &InmemSnapshotSink{ + contents: &bytes.Buffer{}, + }, + } +} + +// Create replaces the stored snapshot with a new one using the given args +func (m *InmemSnapshotStore) Create(version SnapshotVersion, index, term uint64, + configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) { + // We only support version 1 snapshots at this time. + if version != 1 { + return nil, fmt.Errorf("unsupported snapshot version %d", version) + } + + name := snapshotName(term, index) + + m.Lock() + defer m.Unlock() + + sink := &InmemSnapshotSink{ + meta: SnapshotMeta{ + Version: version, + ID: name, + Index: index, + Term: term, + Peers: encodePeers(configuration, trans), + Configuration: configuration, + ConfigurationIndex: configurationIndex, + }, + contents: &bytes.Buffer{}, + } + m.hasSnapshot = true + m.latest = sink + + return sink, nil +} + +// List returns the latest snapshot taken +func (m *InmemSnapshotStore) List() ([]*SnapshotMeta, error) { + m.RLock() + defer m.RUnlock() + + if !m.hasSnapshot { + return []*SnapshotMeta{}, nil + } + return []*SnapshotMeta{&m.latest.meta}, nil +} + +// Open wraps an io.ReadCloser around the snapshot contents +func (m *InmemSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + m.RLock() + defer m.RUnlock() + + if m.latest.meta.ID != id { + return nil, nil, fmt.Errorf("[ERR] snapshot: failed to open snapshot id: %s", id) + } + + return &m.latest.meta, ioutil.NopCloser(m.latest.contents), nil +} + +// Write appends the given bytes to the snapshot contents +func (s *InmemSnapshotSink) Write(p []byte) (n int, err error) { + written, err := io.Copy(s.contents, bytes.NewReader(p)) + s.meta.Size += written + return int(written), err +} + +// Close updates the Size and is otherwise a no-op +func (s *InmemSnapshotSink) Close() error { + return nil +} + +func (s *InmemSnapshotSink) ID() string { + return s.meta.ID +} + +func (s *InmemSnapshotSink) Cancel() error { + return nil +} diff --git a/vendor/github.com/hashicorp/raft/inmem_store.go b/vendor/github.com/hashicorp/raft/inmem_store.go new file mode 100644 index 000000000..e5d579e1b --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inmem_store.go @@ -0,0 +1,125 @@ +package raft + +import ( + "sync" +) + +// InmemStore implements the LogStore and StableStore interface. +// It should NOT EVER be used for production. It is used only for +// unit tests. Use the MDBStore implementation instead. +type InmemStore struct { + l sync.RWMutex + lowIndex uint64 + highIndex uint64 + logs map[uint64]*Log + kv map[string][]byte + kvInt map[string]uint64 +} + +// NewInmemStore returns a new in-memory backend. Do not ever +// use for production. Only for testing. +func NewInmemStore() *InmemStore { + i := &InmemStore{ + logs: make(map[uint64]*Log), + kv: make(map[string][]byte), + kvInt: make(map[string]uint64), + } + return i +} + +// FirstIndex implements the LogStore interface. +func (i *InmemStore) FirstIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.lowIndex, nil +} + +// LastIndex implements the LogStore interface. +func (i *InmemStore) LastIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.highIndex, nil +} + +// GetLog implements the LogStore interface. +func (i *InmemStore) GetLog(index uint64, log *Log) error { + i.l.RLock() + defer i.l.RUnlock() + l, ok := i.logs[index] + if !ok { + return ErrLogNotFound + } + *log = *l + return nil +} + +// StoreLog implements the LogStore interface. +func (i *InmemStore) StoreLog(log *Log) error { + return i.StoreLogs([]*Log{log}) +} + +// StoreLogs implements the LogStore interface. +func (i *InmemStore) StoreLogs(logs []*Log) error { + i.l.Lock() + defer i.l.Unlock() + for _, l := range logs { + i.logs[l.Index] = l + if i.lowIndex == 0 { + i.lowIndex = l.Index + } + if l.Index > i.highIndex { + i.highIndex = l.Index + } + } + return nil +} + +// DeleteRange implements the LogStore interface. +func (i *InmemStore) DeleteRange(min, max uint64) error { + i.l.Lock() + defer i.l.Unlock() + for j := min; j <= max; j++ { + delete(i.logs, j) + } + if min <= i.lowIndex { + i.lowIndex = max + 1 + } + if max >= i.highIndex { + i.highIndex = min - 1 + } + if i.lowIndex > i.highIndex { + i.lowIndex = 0 + i.highIndex = 0 + } + return nil +} + +// Set implements the StableStore interface. +func (i *InmemStore) Set(key []byte, val []byte) error { + i.l.Lock() + defer i.l.Unlock() + i.kv[string(key)] = val + return nil +} + +// Get implements the StableStore interface. +func (i *InmemStore) Get(key []byte) ([]byte, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kv[string(key)], nil +} + +// SetUint64 implements the StableStore interface. +func (i *InmemStore) SetUint64(key []byte, val uint64) error { + i.l.Lock() + defer i.l.Unlock() + i.kvInt[string(key)] = val + return nil +} + +// GetUint64 implements the StableStore interface. +func (i *InmemStore) GetUint64(key []byte) (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kvInt[string(key)], nil +} diff --git a/vendor/github.com/hashicorp/raft/inmem_transport.go b/vendor/github.com/hashicorp/raft/inmem_transport.go new file mode 100644 index 000000000..3693cd5ad --- /dev/null +++ b/vendor/github.com/hashicorp/raft/inmem_transport.go @@ -0,0 +1,322 @@ +package raft + +import ( + "fmt" + "io" + "sync" + "time" +) + +// NewInmemAddr returns a new in-memory addr with +// a randomly generate UUID as the ID. +func NewInmemAddr() ServerAddress { + return ServerAddress(generateUUID()) +} + +// inmemPipeline is used to pipeline requests for the in-mem transport. +type inmemPipeline struct { + trans *InmemTransport + peer *InmemTransport + peerAddr ServerAddress + + doneCh chan AppendFuture + inprogressCh chan *inmemPipelineInflight + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +type inmemPipelineInflight struct { + future *appendFuture + respCh <-chan RPCResponse +} + +// InmemTransport Implements the Transport interface, to allow Raft to be +// tested in-memory without going over a network. +type InmemTransport struct { + sync.RWMutex + consumerCh chan RPC + localAddr ServerAddress + peers map[ServerAddress]*InmemTransport + pipelines []*inmemPipeline + timeout time.Duration +} + +// NewInmemTransport is used to initialize a new transport +// and generates a random local address if none is specified +func NewInmemTransport(addr ServerAddress) (ServerAddress, *InmemTransport) { + if string(addr) == "" { + addr = NewInmemAddr() + } + trans := &InmemTransport{ + consumerCh: make(chan RPC, 16), + localAddr: addr, + peers: make(map[ServerAddress]*InmemTransport), + timeout: 50 * time.Millisecond, + } + return addr, trans +} + +// SetHeartbeatHandler is used to set optional fast-path for +// heartbeats, not supported for this transport. +func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) { +} + +// Consumer implements the Transport interface. +func (i *InmemTransport) Consumer() <-chan RPC { + return i.consumerCh +} + +// LocalAddr implements the Transport interface. +func (i *InmemTransport) LocalAddr() ServerAddress { + return i.localAddr +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (i *InmemTransport) AppendEntriesPipeline(target ServerAddress) (AppendPipeline, error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + if !ok { + return nil, fmt.Errorf("failed to connect to peer: %v", target) + } + pipeline := newInmemPipeline(i, peer, target) + i.Lock() + i.pipelines = append(i.pipelines, pipeline) + i.Unlock() + return pipeline, nil +} + +// AppendEntries implements the Transport interface. +func (i *InmemTransport) AppendEntries(target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*AppendEntriesResponse) + *resp = *out + return nil +} + +// RequestVote implements the Transport interface. +func (i *InmemTransport) RequestVote(target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*RequestVoteResponse) + *resp = *out + return nil +} + +// InstallSnapshot implements the Transport interface. +func (i *InmemTransport) InstallSnapshot(target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*InstallSnapshotResponse) + *resp = *out + return nil +} + +func (i *InmemTransport) makeRPC(target ServerAddress, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + + if !ok { + err = fmt.Errorf("failed to connect to peer: %v", target) + return + } + + // Send the RPC over + respCh := make(chan RPCResponse) + peer.consumerCh <- RPC{ + Command: args, + Reader: r, + RespChan: respCh, + } + + // Wait for a response + select { + case rpcResp = <-respCh: + if rpcResp.Error != nil { + err = rpcResp.Error + } + case <-time.After(timeout): + err = fmt.Errorf("command timed out") + } + return +} + +// EncodePeer implements the Transport interface. +func (i *InmemTransport) EncodePeer(p ServerAddress) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (i *InmemTransport) DecodePeer(buf []byte) ServerAddress { + return ServerAddress(buf) +} + +// Connect is used to connect this transport to another transport for +// a given peer name. This allows for local routing. +func (i *InmemTransport) Connect(peer ServerAddress, t Transport) { + trans := t.(*InmemTransport) + i.Lock() + defer i.Unlock() + i.peers[peer] = trans +} + +// Disconnect is used to remove the ability to route to a given peer. +func (i *InmemTransport) Disconnect(peer ServerAddress) { + i.Lock() + defer i.Unlock() + delete(i.peers, peer) + + // Disconnect any pipelines + n := len(i.pipelines) + for idx := 0; idx < n; idx++ { + if i.pipelines[idx].peerAddr == peer { + i.pipelines[idx].Close() + i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil + idx-- + n-- + } + } + i.pipelines = i.pipelines[:n] +} + +// DisconnectAll is used to remove all routes to peers. +func (i *InmemTransport) DisconnectAll() { + i.Lock() + defer i.Unlock() + i.peers = make(map[ServerAddress]*InmemTransport) + + // Handle pipelines + for _, pipeline := range i.pipelines { + pipeline.Close() + } + i.pipelines = nil +} + +// Close is used to permanently disable the transport +func (i *InmemTransport) Close() error { + i.DisconnectAll() + return nil +} + +func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr ServerAddress) *inmemPipeline { + i := &inmemPipeline{ + trans: trans, + peer: peer, + peerAddr: addr, + doneCh: make(chan AppendFuture, 16), + inprogressCh: make(chan *inmemPipelineInflight, 16), + shutdownCh: make(chan struct{}), + } + go i.decodeResponses() + return i +} + +func (i *inmemPipeline) decodeResponses() { + timeout := i.trans.timeout + for { + select { + case inp := <-i.inprogressCh: + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + + select { + case rpcResp := <-inp.respCh: + // Copy the result back + *inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse) + inp.future.respond(rpcResp.Error) + + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-timeoutCh: + inp.future.respond(fmt.Errorf("command timed out")) + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-i.shutdownCh: + return + } + case <-i.shutdownCh: + return + } + } +} + +func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Handle a timeout + var timeout <-chan time.Time + if i.trans.timeout > 0 { + timeout = time.After(i.trans.timeout) + } + + // Send the RPC over + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + Command: args, + RespChan: respCh, + } + select { + case i.peer.consumerCh <- rpc: + case <-timeout: + return nil, fmt.Errorf("command enqueue timeout") + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } + + // Send to be decoded + select { + case i.inprogressCh <- &inmemPipelineInflight{future, respCh}: + return future, nil + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +func (i *inmemPipeline) Consumer() <-chan AppendFuture { + return i.doneCh +} + +func (i *inmemPipeline) Close() error { + i.shutdownLock.Lock() + defer i.shutdownLock.Unlock() + if i.shutdown { + return nil + } + + i.shutdown = true + close(i.shutdownCh) + return nil +} diff --git a/vendor/github.com/hashicorp/raft/log.go b/vendor/github.com/hashicorp/raft/log.go new file mode 100644 index 000000000..4ade38ecc --- /dev/null +++ b/vendor/github.com/hashicorp/raft/log.go @@ -0,0 +1,72 @@ +package raft + +// LogType describes various types of log entries. +type LogType uint8 + +const ( + // LogCommand is applied to a user FSM. + LogCommand LogType = iota + + // LogNoop is used to assert leadership. + LogNoop + + // LogAddPeer is used to add a new peer. This should only be used with + // older protocol versions designed to be compatible with unversioned + // Raft servers. See comments in config.go for details. + LogAddPeerDeprecated + + // LogRemovePeer is used to remove an existing peer. This should only be + // used with older protocol versions designed to be compatible with + // unversioned Raft servers. See comments in config.go for details. + LogRemovePeerDeprecated + + // LogBarrier is used to ensure all preceding operations have been + // applied to the FSM. It is similar to LogNoop, but instead of returning + // once committed, it only returns once the FSM manager acks it. Otherwise + // it is possible there are operations committed but not yet applied to + // the FSM. + LogBarrier + + // LogConfiguration establishes a membership change configuration. It is + // created when a server is added, removed, promoted, etc. Only used + // when protocol version 1 or greater is in use. + LogConfiguration +) + +// Log entries are replicated to all members of the Raft cluster +// and form the heart of the replicated state machine. +type Log struct { + // Index holds the index of the log entry. + Index uint64 + + // Term holds the election term of the log entry. + Term uint64 + + // Type holds the type of the log entry. + Type LogType + + // Data holds the log entry's type-specific data. + Data []byte +} + +// LogStore is used to provide an interface for storing +// and retrieving logs in a durable fashion. +type LogStore interface { + // FirstIndex returns the first index written. 0 for no entries. + FirstIndex() (uint64, error) + + // LastIndex returns the last index written. 0 for no entries. + LastIndex() (uint64, error) + + // GetLog gets a log entry at a given index. + GetLog(index uint64, log *Log) error + + // StoreLog stores a log entry. + StoreLog(log *Log) error + + // StoreLogs stores multiple log entries. + StoreLogs(logs []*Log) error + + // DeleteRange deletes a range of log entries. The range is inclusive. + DeleteRange(min, max uint64) error +} diff --git a/vendor/github.com/hashicorp/raft/log_cache.go b/vendor/github.com/hashicorp/raft/log_cache.go new file mode 100644 index 000000000..952e98c22 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/log_cache.go @@ -0,0 +1,79 @@ +package raft + +import ( + "fmt" + "sync" +) + +// LogCache wraps any LogStore implementation to provide an +// in-memory ring buffer. This is used to cache access to +// the recently written entries. For implementations that do not +// cache themselves, this can provide a substantial boost by +// avoiding disk I/O on recent entries. +type LogCache struct { + store LogStore + + cache []*Log + l sync.RWMutex +} + +// NewLogCache is used to create a new LogCache with the +// given capacity and backend store. +func NewLogCache(capacity int, store LogStore) (*LogCache, error) { + if capacity <= 0 { + return nil, fmt.Errorf("capacity must be positive") + } + c := &LogCache{ + store: store, + cache: make([]*Log, capacity), + } + return c, nil +} + +func (c *LogCache) GetLog(idx uint64, log *Log) error { + // Check the buffer for an entry + c.l.RLock() + cached := c.cache[idx%uint64(len(c.cache))] + c.l.RUnlock() + + // Check if entry is valid + if cached != nil && cached.Index == idx { + *log = *cached + return nil + } + + // Forward request on cache miss + return c.store.GetLog(idx, log) +} + +func (c *LogCache) StoreLog(log *Log) error { + return c.StoreLogs([]*Log{log}) +} + +func (c *LogCache) StoreLogs(logs []*Log) error { + // Insert the logs into the ring buffer + c.l.Lock() + for _, l := range logs { + c.cache[l.Index%uint64(len(c.cache))] = l + } + c.l.Unlock() + + return c.store.StoreLogs(logs) +} + +func (c *LogCache) FirstIndex() (uint64, error) { + return c.store.FirstIndex() +} + +func (c *LogCache) LastIndex() (uint64, error) { + return c.store.LastIndex() +} + +func (c *LogCache) DeleteRange(min, max uint64) error { + // Invalidate the cache on deletes + c.l.Lock() + c.cache = make([]*Log, len(c.cache)) + c.l.Unlock() + + return c.store.DeleteRange(min, max) +} diff --git a/vendor/github.com/hashicorp/raft/membership.md b/vendor/github.com/hashicorp/raft/membership.md new file mode 100644 index 000000000..df1f83e27 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/membership.md @@ -0,0 +1,83 @@ +Simon (@superfell) and I (@ongardie) talked through reworking this library's cluster membership changes last Friday. We don't see a way to split this into independent patches, so we're taking the next best approach: submitting the plan here for review, then working on an enormous PR. Your feedback would be appreciated. (@superfell is out this week, however, so don't expect him to respond quickly.) + +These are the main goals: + - Bringing things in line with the description in my PhD dissertation; + - Catching up new servers prior to granting them a vote, as well as allowing permanent non-voting members; and + - Eliminating the `peers.json` file, to avoid issues of consistency between that and the log/snapshot. + +## Data-centric view + +We propose to re-define a *configuration* as a set of servers, where each server includes an address (as it does today) and a mode that is either: + - *Voter*: a server whose vote is counted in elections and whose match index is used in advancing the leader's commit index. + - *Nonvoter*: a server that receives log entries but is not considered for elections or commitment purposes. + - *Staging*: a server that acts like a nonvoter with one exception: once a staging server receives enough log entries to catch up sufficiently to the leader's log, the leader will invoke a membership change to change the staging server to a voter. + +All changes to the configuration will be done by writing a new configuration to the log. The new configuration will be in affect as soon as it is appended to the log (not when it is committed like a normal state machine command). Note that, per my dissertation, there can be at most one uncommitted configuration at a time (the next configuration may not be created until the prior one has been committed). It's not strictly necessary to follow these same rules for the nonvoter/staging servers, but we think its best to treat all changes uniformly. + +Each server will track two configurations: + 1. its *committed configuration*: the latest configuration in the log/snapshot that has been committed, along with its index. + 2. its *latest configuration*: the latest configuration in the log/snapshot (may be committed or uncommitted), along with its index. + +When there's no membership change happening, these two will be the same. The latest configuration is almost always the one used, except: + - When followers truncate the suffix of their logs, they may need to fall back to the committed configuration. + - When snapshotting, the committed configuration is written, to correspond with the committed log prefix that is being snapshotted. + + +## Application API + +We propose the following operations for clients to manipulate the cluster configuration: + - AddVoter: server becomes staging unless voter, + - AddNonvoter: server becomes nonvoter unless staging or voter, + - DemoteVoter: server becomes nonvoter unless absent, + - RemovePeer: server removed from configuration, + - GetConfiguration: waits for latest config to commit, returns committed config. + +This diagram, of which I'm quite proud, shows the possible transitions: +``` ++-----------------------------------------------------------------------------+ +| | +| Start -> +--------+ | +| ,------<------------| | | +| / | absent | | +| / RemovePeer--> | | <---RemovePeer | +| / | +--------+ \ | +| / | | \ | +| AddNonvoter | AddVoter \ | +| | ,->---' `--<-. | \ | +| v / \ v \ | +| +----------+ +----------+ +----------+ | +| | | ---AddVoter--> | | -log caught up --> | | | +| | nonvoter | | staging | | voter | | +| | | <-DemoteVoter- | | ,- | | | +| +----------+ \ +----------+ / +----------+ | +| \ / | +| `--------------<---------------' | +| | ++-----------------------------------------------------------------------------+ +``` + +While these operations aren't quite symmetric, we think they're a good set to capture +the possible intent of the user. For example, if I want to make sure a server doesn't have a vote, but the server isn't part of the configuration at all, it probably shouldn't be added as a nonvoting server. + +Each of these application-level operations will be interpreted by the leader and, if it has an effect, will cause the leader to write a new configuration entry to its log. Which particular application-level operation caused the log entry to be written need not be part of the log entry. + +## Code implications + +This is a non-exhaustive list, but we came up with a few things: +- Remove the PeerStore: the `peers.json` file introduces the possibility of getting out of sync with the log and snapshot, and it's hard to maintain this atomically as the log changes. It's not clear whether it's meant to track the committed or latest configuration, either. +- Servers will have to search their snapshot and log to find the committed configuration and the latest configuration on startup. +- Bootstrap will no longer use `peers.json` but should initialize the log or snapshot with an application-provided configuration entry. +- Snapshots should store the index of their configuration along with the configuration itself. In my experience with LogCabin, the original log index of the configuration is very useful to include in debug log messages. +- As noted in hashicorp/raft#84, configuration change requests should come in via a separate channel, and one may not proceed until the last has been committed. +- As to deciding when a log is sufficiently caught up, implementing a sophisticated algorithm *is* something that can be done in a separate PR. An easy and decent placeholder is: once the staging server has reached 95% of the leader's commit index, promote it. + +## Feedback + +Again, we're looking for feedback here before we start working on this. Here are some questions to think about: + - Does this seem like where we want things to go? + - Is there anything here that should be left out? + - Is there anything else we're forgetting about? + - Is there a good way to break this up? + - What do we need to worry about in terms of backwards compatibility? + - What implication will this have on current tests? + - What's the best way to test this code, in particular the small changes that will be sprinkled all over the library? diff --git a/vendor/github.com/hashicorp/raft/net_transport.go b/vendor/github.com/hashicorp/raft/net_transport.go new file mode 100644 index 000000000..7c55ac537 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/net_transport.go @@ -0,0 +1,622 @@ +package raft + +import ( + "bufio" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "sync" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +const ( + rpcAppendEntries uint8 = iota + rpcRequestVote + rpcInstallSnapshot + + // DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport. + DefaultTimeoutScale = 256 * 1024 // 256KB + + // rpcMaxPipeline controls the maximum number of outstanding + // AppendEntries RPC calls. + rpcMaxPipeline = 128 +) + +var ( + // ErrTransportShutdown is returned when operations on a transport are + // invoked after it's been terminated. + ErrTransportShutdown = errors.New("transport shutdown") + + // ErrPipelineShutdown is returned when the pipeline is closed. + ErrPipelineShutdown = errors.New("append pipeline closed") +) + +/* + +NetworkTransport provides a network based transport that can be +used to communicate with Raft on remote machines. It requires +an underlying stream layer to provide a stream abstraction, which can +be simple TCP, TLS, etc. + +This transport is very simple and lightweight. Each RPC request is +framed by sending a byte that indicates the message type, followed +by the MsgPack encoded request. + +The response is an error string followed by the response object, +both are encoded using MsgPack. + +InstallSnapshot is special, in that after the RPC request we stream +the entire state. That socket is not re-used as the connection state +is not known if there is an error. + +*/ +type NetworkTransport struct { + connPool map[ServerAddress][]*netConn + connPoolLock sync.Mutex + + consumeCh chan RPC + + heartbeatFn func(RPC) + heartbeatFnLock sync.Mutex + + logger *log.Logger + + maxPool int + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + stream StreamLayer + + timeout time.Duration + TimeoutScale int +} + +// StreamLayer is used with the NetworkTransport to provide +// the low level stream abstraction. +type StreamLayer interface { + net.Listener + + // Dial is used to create a new outgoing connection + Dial(address ServerAddress, timeout time.Duration) (net.Conn, error) +} + +type netConn struct { + target ServerAddress + conn net.Conn + r *bufio.Reader + w *bufio.Writer + dec *codec.Decoder + enc *codec.Encoder +} + +func (n *netConn) Release() error { + return n.conn.Close() +} + +type netPipeline struct { + conn *netConn + trans *NetworkTransport + + doneCh chan AppendFuture + inprogressCh chan *appendFuture + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +// NewNetworkTransport creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransport( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) *NetworkTransport { + if logOutput == nil { + logOutput = os.Stderr + } + return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags)) +} + +// NewNetworkTransportWithLogger creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransportWithLogger( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) *NetworkTransport { + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + trans := &NetworkTransport{ + connPool: make(map[ServerAddress][]*netConn), + consumeCh: make(chan RPC), + logger: logger, + maxPool: maxPool, + shutdownCh: make(chan struct{}), + stream: stream, + timeout: timeout, + TimeoutScale: DefaultTimeoutScale, + } + go trans.listen() + return trans +} + +// SetHeartbeatHandler is used to setup a heartbeat handler +// as a fast-pass. This is to avoid head-of-line blocking from +// disk IO. +func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) { + n.heartbeatFnLock.Lock() + defer n.heartbeatFnLock.Unlock() + n.heartbeatFn = cb +} + +// Close is used to stop the network transport. +func (n *NetworkTransport) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + + if !n.shutdown { + close(n.shutdownCh) + n.stream.Close() + n.shutdown = true + } + return nil +} + +// Consumer implements the Transport interface. +func (n *NetworkTransport) Consumer() <-chan RPC { + return n.consumeCh +} + +// LocalAddr implements the Transport interface. +func (n *NetworkTransport) LocalAddr() ServerAddress { + return ServerAddress(n.stream.Addr().String()) +} + +// IsShutdown is used to check if the transport is shutdown. +func (n *NetworkTransport) IsShutdown() bool { + select { + case <-n.shutdownCh: + return true + default: + return false + } +} + +// getExistingConn is used to grab a pooled connection. +func (n *NetworkTransport) getPooledConn(target ServerAddress) *netConn { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + conns, ok := n.connPool[target] + if !ok || len(conns) == 0 { + return nil + } + + var conn *netConn + num := len(conns) + conn, conns[num-1] = conns[num-1], nil + n.connPool[target] = conns[:num-1] + return conn +} + +// getConn is used to get a connection from the pool. +func (n *NetworkTransport) getConn(target ServerAddress) (*netConn, error) { + // Check for a pooled conn + if conn := n.getPooledConn(target); conn != nil { + return conn, nil + } + + // Dial a new connection + conn, err := n.stream.Dial(target, n.timeout) + if err != nil { + return nil, err + } + + // Wrap the conn + netConn := &netConn{ + target: target, + conn: conn, + r: bufio.NewReader(conn), + w: bufio.NewWriter(conn), + } + + // Setup encoder/decoders + netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{}) + netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{}) + + // Done + return netConn, nil +} + +// returnConn returns a connection back to the pool. +func (n *NetworkTransport) returnConn(conn *netConn) { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + key := conn.target + conns, _ := n.connPool[key] + + if !n.IsShutdown() && len(conns) < n.maxPool { + n.connPool[key] = append(conns, conn) + } else { + conn.Release() + } +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (n *NetworkTransport) AppendEntriesPipeline(target ServerAddress) (AppendPipeline, error) { + // Get a connection + conn, err := n.getConn(target) + if err != nil { + return nil, err + } + + // Create the pipeline + return newNetPipeline(n, conn), nil +} + +// AppendEntries implements the Transport interface. +func (n *NetworkTransport) AppendEntries(target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + return n.genericRPC(target, rpcAppendEntries, args, resp) +} + +// RequestVote implements the Transport interface. +func (n *NetworkTransport) RequestVote(target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error { + return n.genericRPC(target, rpcRequestVote, args, resp) +} + +// genericRPC handles a simple request/response RPC. +func (n *NetworkTransport) genericRPC(target ServerAddress, rpcType uint8, args interface{}, resp interface{}) error { + // Get a conn + conn, err := n.getConn(target) + if err != nil { + return err + } + + // Set a deadline + if n.timeout > 0 { + conn.conn.SetDeadline(time.Now().Add(n.timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcType, args); err != nil { + return err + } + + // Decode the response + canReturn, err := decodeResponse(conn, resp) + if canReturn { + n.returnConn(conn) + } + return err +} + +// InstallSnapshot implements the Transport interface. +func (n *NetworkTransport) InstallSnapshot(target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + // Get a conn, always close for InstallSnapshot + conn, err := n.getConn(target) + if err != nil { + return err + } + defer conn.Release() + + // Set a deadline, scaled by request size + if n.timeout > 0 { + timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale)) + if timeout < n.timeout { + timeout = n.timeout + } + conn.conn.SetDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcInstallSnapshot, args); err != nil { + return err + } + + // Stream the state + if _, err = io.Copy(conn.w, data); err != nil { + return err + } + + // Flush + if err = conn.w.Flush(); err != nil { + return err + } + + // Decode the response, do not return conn + _, err = decodeResponse(conn, resp) + return err +} + +// EncodePeer implements the Transport interface. +func (n *NetworkTransport) EncodePeer(p ServerAddress) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (n *NetworkTransport) DecodePeer(buf []byte) ServerAddress { + return ServerAddress(buf) +} + +// listen is used to handling incoming connections. +func (n *NetworkTransport) listen() { + for { + // Accept incoming connections + conn, err := n.stream.Accept() + if err != nil { + if n.IsShutdown() { + return + } + n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err) + continue + } + n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr()) + + // Handle the connection in dedicated routine + go n.handleConn(conn) + } +} + +// handleConn is used to handle an inbound connection for its lifespan. +func (n *NetworkTransport) handleConn(conn net.Conn) { + defer conn.Close() + r := bufio.NewReader(conn) + w := bufio.NewWriter(conn) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + enc := codec.NewEncoder(w, &codec.MsgpackHandle{}) + + for { + if err := n.handleCommand(r, dec, enc); err != nil { + if err != io.EOF { + n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err) + } + return + } + if err := w.Flush(); err != nil { + n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err) + return + } + } +} + +// handleCommand is used to decode and dispatch a single command. +func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error { + // Get the rpc type + rpcType, err := r.ReadByte() + if err != nil { + return err + } + + // Create the RPC object + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + RespChan: respCh, + } + + // Decode the command + isHeartbeat := false + switch rpcType { + case rpcAppendEntries: + var req AppendEntriesRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + // Check if this is a heartbeat + if req.Term != 0 && req.Leader != nil && + req.PrevLogEntry == 0 && req.PrevLogTerm == 0 && + len(req.Entries) == 0 && req.LeaderCommitIndex == 0 { + isHeartbeat = true + } + + case rpcRequestVote: + var req RequestVoteRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + case rpcInstallSnapshot: + var req InstallSnapshotRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + rpc.Reader = io.LimitReader(r, req.Size) + + default: + return fmt.Errorf("unknown rpc type %d", rpcType) + } + + // Check for heartbeat fast-path + if isHeartbeat { + n.heartbeatFnLock.Lock() + fn := n.heartbeatFn + n.heartbeatFnLock.Unlock() + if fn != nil { + fn(rpc) + goto RESP + } + } + + // Dispatch the RPC + select { + case n.consumeCh <- rpc: + case <-n.shutdownCh: + return ErrTransportShutdown + } + + // Wait for response +RESP: + select { + case resp := <-respCh: + // Send the error first + respErr := "" + if resp.Error != nil { + respErr = resp.Error.Error() + } + if err := enc.Encode(respErr); err != nil { + return err + } + + // Send the response + if err := enc.Encode(resp.Response); err != nil { + return err + } + case <-n.shutdownCh: + return ErrTransportShutdown + } + return nil +} + +// decodeResponse is used to decode an RPC response and reports whether +// the connection can be reused. +func decodeResponse(conn *netConn, resp interface{}) (bool, error) { + // Decode the error if any + var rpcError string + if err := conn.dec.Decode(&rpcError); err != nil { + conn.Release() + return false, err + } + + // Decode the response + if err := conn.dec.Decode(resp); err != nil { + conn.Release() + return false, err + } + + // Format an error if any + if rpcError != "" { + return true, fmt.Errorf(rpcError) + } + return true, nil +} + +// sendRPC is used to encode and send the RPC. +func sendRPC(conn *netConn, rpcType uint8, args interface{}) error { + // Write the request type + if err := conn.w.WriteByte(rpcType); err != nil { + conn.Release() + return err + } + + // Send the request + if err := conn.enc.Encode(args); err != nil { + conn.Release() + return err + } + + // Flush + if err := conn.w.Flush(); err != nil { + conn.Release() + return err + } + return nil +} + +// newNetPipeline is used to construct a netPipeline from a given +// transport and connection. +func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline { + n := &netPipeline{ + conn: conn, + trans: trans, + doneCh: make(chan AppendFuture, rpcMaxPipeline), + inprogressCh: make(chan *appendFuture, rpcMaxPipeline), + shutdownCh: make(chan struct{}), + } + go n.decodeResponses() + return n +} + +// decodeResponses is a long running routine that decodes the responses +// sent on the connection. +func (n *netPipeline) decodeResponses() { + timeout := n.trans.timeout + for { + select { + case future := <-n.inprogressCh: + if timeout > 0 { + n.conn.conn.SetReadDeadline(time.Now().Add(timeout)) + } + + _, err := decodeResponse(n.conn, future.resp) + future.respond(err) + select { + case n.doneCh <- future: + case <-n.shutdownCh: + return + } + case <-n.shutdownCh: + return + } + } +} + +// AppendEntries is used to pipeline a new append entries request. +func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Add a send timeout + if timeout := n.trans.timeout; timeout > 0 { + n.conn.conn.SetWriteDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil { + return nil, err + } + + // Hand-off for decoding, this can also cause back-pressure + // to prevent too many inflight requests + select { + case n.inprogressCh <- future: + return future, nil + case <-n.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +// Consumer returns a channel that can be used to consume complete futures. +func (n *netPipeline) Consumer() <-chan AppendFuture { + return n.doneCh +} + +// Closed is used to shutdown the pipeline connection. +func (n *netPipeline) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + if n.shutdown { + return nil + } + + // Release the connection + n.conn.Release() + + n.shutdown = true + close(n.shutdownCh) + return nil +} diff --git a/vendor/github.com/hashicorp/raft/observer.go b/vendor/github.com/hashicorp/raft/observer.go new file mode 100644 index 000000000..22500fa87 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/observer.go @@ -0,0 +1,115 @@ +package raft + +import ( + "sync/atomic" +) + +// Observation is sent along the given channel to observers when an event occurs. +type Observation struct { + // Raft holds the Raft instance generating the observation. + Raft *Raft + // Data holds observation-specific data. Possible types are + // *RequestVoteRequest and RaftState. + Data interface{} +} + +// nextObserverId is used to provide a unique ID for each observer to aid in +// deregistration. +var nextObserverID uint64 + +// FilterFn is a function that can be registered in order to filter observations. +// The function reports whether the observation should be included - if +// it returns false, the observation will be filtered out. +type FilterFn func(o *Observation) bool + +// Observer describes what to do with a given observation. +type Observer struct { + // channel receives observations. + channel chan Observation + + // blocking, if true, will cause Raft to block when sending an observation + // to this observer. This should generally be set to false. + blocking bool + + // filter will be called to determine if an observation should be sent to + // the channel. + filter FilterFn + + // id is the ID of this observer in the Raft map. + id uint64 + + // numObserved and numDropped are performance counters for this observer. + numObserved uint64 + numDropped uint64 +} + +// NewObserver creates a new observer that can be registered +// to make observations on a Raft instance. Observations +// will be sent on the given channel if they satisfy the +// given filter. +// +// If blocking is true, the observer will block when it can't +// send on the channel, otherwise it may discard events. +func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer { + return &Observer{ + channel: channel, + blocking: blocking, + filter: filter, + id: atomic.AddUint64(&nextObserverID, 1), + } +} + +// GetNumObserved returns the number of observations. +func (or *Observer) GetNumObserved() uint64 { + return atomic.LoadUint64(&or.numObserved) +} + +// GetNumDropped returns the number of dropped observations due to blocking. +func (or *Observer) GetNumDropped() uint64 { + return atomic.LoadUint64(&or.numDropped) +} + +// RegisterObserver registers a new observer. +func (r *Raft) RegisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + r.observers[or.id] = or +} + +// DeregisterObserver deregisters an observer. +func (r *Raft) DeregisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + delete(r.observers, or.id) +} + +// observe sends an observation to every observer. +func (r *Raft) observe(o interface{}) { + // In general observers should not block. But in any case this isn't + // disastrous as we only hold a read lock, which merely prevents + // registration / deregistration of observers. + r.observersLock.RLock() + defer r.observersLock.RUnlock() + for _, or := range r.observers { + // It's wasteful to do this in the loop, but for the common case + // where there are no observers we won't create any objects. + ob := Observation{Raft: r, Data: o} + if or.filter != nil && !or.filter(&ob) { + continue + } + if or.channel == nil { + continue + } + if or.blocking { + or.channel <- ob + atomic.AddUint64(&or.numObserved, 1) + } else { + select { + case or.channel <- ob: + atomic.AddUint64(&or.numObserved, 1) + default: + atomic.AddUint64(&or.numDropped, 1) + } + } + } +} diff --git a/vendor/github.com/hashicorp/raft/peersjson.go b/vendor/github.com/hashicorp/raft/peersjson.go new file mode 100644 index 000000000..c55fdbb43 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/peersjson.go @@ -0,0 +1,46 @@ +package raft + +import ( + "bytes" + "encoding/json" + "io/ioutil" +) + +// ReadPeersJSON consumes a legacy peers.json file in the format of the old JSON +// peer store and creates a new-style configuration structure. This can be used +// to migrate this data or perform manual recovery when running protocol versions +// that can interoperate with older, unversioned Raft servers. This should not be +// used once server IDs are in use, because the old peers.json file didn't have +// support for these, nor non-voter suffrage types. +func ReadPeersJSON(path string) (Configuration, error) { + // Read in the file. + buf, err := ioutil.ReadFile(path) + if err != nil { + return Configuration{}, err + } + + // Parse it as JSON. + var peers []string + dec := json.NewDecoder(bytes.NewReader(buf)) + if err := dec.Decode(&peers); err != nil { + return Configuration{}, err + } + + // Map it into the new-style configuration structure. We can only specify + // voter roles here, and the ID has to be the same as the address. + var configuration Configuration + for _, peer := range peers { + server := Server{ + Suffrage: Voter, + ID: ServerID(peer), + Address: ServerAddress(peer), + } + configuration.Servers = append(configuration.Servers, server) + } + + // We should only ingest valid configurations. + if err := checkConfiguration(configuration); err != nil { + return Configuration{}, err + } + return configuration, nil +} diff --git a/vendor/github.com/hashicorp/raft/raft.go b/vendor/github.com/hashicorp/raft/raft.go new file mode 100644 index 000000000..aa8fe8208 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/raft.go @@ -0,0 +1,1456 @@ +package raft + +import ( + "bytes" + "container/list" + "fmt" + "io" + "time" + + "github.com/armon/go-metrics" +) + +const ( + minCheckInterval = 10 * time.Millisecond +) + +var ( + keyCurrentTerm = []byte("CurrentTerm") + keyLastVoteTerm = []byte("LastVoteTerm") + keyLastVoteCand = []byte("LastVoteCand") +) + +// getRPCHeader returns an initialized RPCHeader struct for the given +// Raft instance. This structure is sent along with RPC requests and +// responses. +func (r *Raft) getRPCHeader() RPCHeader { + return RPCHeader{ + ProtocolVersion: r.conf.ProtocolVersion, + } +} + +// checkRPCHeader houses logic about whether this instance of Raft can process +// the given RPC message. +func (r *Raft) checkRPCHeader(rpc RPC) error { + // Get the header off the RPC message. + wh, ok := rpc.Command.(WithRPCHeader) + if !ok { + return fmt.Errorf("RPC does not have a header") + } + header := wh.GetRPCHeader() + + // First check is to just make sure the code can understand the + // protocol at all. + if header.ProtocolVersion < ProtocolVersionMin || + header.ProtocolVersion > ProtocolVersionMax { + return ErrUnsupportedProtocol + } + + // Second check is whether we should support this message, given the + // current protocol we are configured to run. This will drop support + // for protocol version 0 starting at protocol version 2, which is + // currently what we want, and in general support one version back. We + // may need to revisit this policy depending on how future protocol + // changes evolve. + if header.ProtocolVersion < r.conf.ProtocolVersion-1 { + return ErrUnsupportedProtocol + } + + return nil +} + +// getSnapshotVersion returns the snapshot version that should be used when +// creating snapshots, given the protocol version in use. +func getSnapshotVersion(protocolVersion ProtocolVersion) SnapshotVersion { + // Right now we only have two versions and they are backwards compatible + // so we don't need to look at the protocol version. + return 1 +} + +// commitTuple is used to send an index that was committed, +// with an optional associated future that should be invoked. +type commitTuple struct { + log *Log + future *logFuture +} + +// leaderState is state that is used while we are a leader. +type leaderState struct { + commitCh chan struct{} + commitment *commitment + inflight *list.List // list of logFuture in log index order + replState map[ServerID]*followerReplication + notify map[*verifyFuture]struct{} + stepDown chan struct{} +} + +// setLeader is used to modify the current leader of the cluster +func (r *Raft) setLeader(leader ServerAddress) { + r.leaderLock.Lock() + r.leader = leader + r.leaderLock.Unlock() +} + +// requestConfigChange is a helper for the above functions that make +// configuration change requests. 'req' describes the change. For timeout, +// see AddVoter. +func (r *Raft) requestConfigChange(req configurationChangeRequest, timeout time.Duration) IndexFuture { + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + future := &configurationChangeFuture{ + req: req, + } + future.init() + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case r.configurationChangeCh <- future: + return future + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// run is a long running goroutine that runs the Raft FSM. +func (r *Raft) run() { + for { + // Check if we are doing a shutdown + select { + case <-r.shutdownCh: + // Clear the leader to prevent forwarding + r.setLeader("") + return + default: + } + + // Enter into a sub-FSM + switch r.getState() { + case Follower: + r.runFollower() + case Candidate: + r.runCandidate() + case Leader: + r.runLeader() + } + } +} + +// runFollower runs the FSM for a follower. +func (r *Raft) runFollower() { + didWarn := false + r.logger.Printf("[INFO] raft: %v entering Follower state (Leader: %q)", r, r.Leader()) + metrics.IncrCounter([]string{"raft", "state", "follower"}, 1) + heartbeatTimer := randomTimeout(r.conf.HeartbeatTimeout) + for { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case c := <-r.configurationChangeCh: + // Reject any operations since we are not the leader + c.respond(ErrNotLeader) + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case r := <-r.userRestoreCh: + // Reject any restores since we are not the leader + r.respond(ErrNotLeader) + + case c := <-r.configurationsCh: + c.configurations = r.configurations.Clone() + c.respond(nil) + + case b := <-r.bootstrapCh: + b.respond(r.liveBootstrap(b.configuration)) + + case <-heartbeatTimer: + // Restart the heartbeat timer + heartbeatTimer = randomTimeout(r.conf.HeartbeatTimeout) + + // Check if we have had a successful contact + lastContact := r.LastContact() + if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout { + continue + } + + // Heartbeat failed! Transition to the candidate state + lastLeader := r.Leader() + r.setLeader("") + + if r.configurations.latestIndex == 0 { + if !didWarn { + r.logger.Printf("[WARN] raft: no known peers, aborting election") + didWarn = true + } + } else if r.configurations.latestIndex == r.configurations.committedIndex && + !hasVote(r.configurations.latest, r.localID) { + if !didWarn { + r.logger.Printf("[WARN] raft: not part of stable configuration, aborting election") + didWarn = true + } + } else { + r.logger.Printf(`[WARN] raft: Heartbeat timeout from %q reached, starting election`, lastLeader) + metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1) + r.setState(Candidate) + return + } + + case <-r.shutdownCh: + return + } + } +} + +// liveBootstrap attempts to seed an initial configuration for the cluster. See +// the Raft object's member BootstrapCluster for more details. This must only be +// called on the main thread, and only makes sense in the follower state. +func (r *Raft) liveBootstrap(configuration Configuration) error { + // Use the pre-init API to make the static updates. + err := BootstrapCluster(&r.conf, r.logs, r.stable, r.snapshots, + r.trans, configuration) + if err != nil { + return err + } + + // Make the configuration live. + var entry Log + if err := r.logs.GetLog(1, &entry); err != nil { + panic(err) + } + r.setCurrentTerm(1) + r.setLastLog(entry.Index, entry.Term) + r.processConfigurationLogEntry(&entry) + return nil +} + +// runCandidate runs the FSM for a candidate. +func (r *Raft) runCandidate() { + r.logger.Printf("[INFO] raft: %v entering Candidate state in term %v", + r, r.getCurrentTerm()+1) + metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1) + + // Start vote for us, and set a timeout + voteCh := r.electSelf() + electionTimer := randomTimeout(r.conf.ElectionTimeout) + + // Tally the votes, need a simple majority + grantedVotes := 0 + votesNeeded := r.quorumSize() + r.logger.Printf("[DEBUG] raft: Votes needed: %d", votesNeeded) + + for r.getState() == Candidate { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case vote := <-voteCh: + // Check if the term is greater than ours, bail + if vote.Term > r.getCurrentTerm() { + r.logger.Printf("[DEBUG] raft: Newer term discovered, fallback to follower") + r.setState(Follower) + r.setCurrentTerm(vote.Term) + return + } + + // Check if the vote is granted + if vote.Granted { + grantedVotes++ + r.logger.Printf("[DEBUG] raft: Vote granted from %s in term %v. Tally: %d", + vote.voterID, vote.Term, grantedVotes) + } + + // Check if we've become the leader + if grantedVotes >= votesNeeded { + r.logger.Printf("[INFO] raft: Election won. Tally: %d", grantedVotes) + r.setState(Leader) + r.setLeader(r.localAddr) + return + } + + case c := <-r.configurationChangeCh: + // Reject any operations since we are not the leader + c.respond(ErrNotLeader) + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case r := <-r.userRestoreCh: + // Reject any restores since we are not the leader + r.respond(ErrNotLeader) + + case c := <-r.configurationsCh: + c.configurations = r.configurations.Clone() + c.respond(nil) + + case b := <-r.bootstrapCh: + b.respond(ErrCantBootstrap) + + case <-electionTimer: + // Election failed! Restart the election. We simply return, + // which will kick us back into runCandidate + r.logger.Printf("[WARN] raft: Election timeout reached, restarting election") + return + + case <-r.shutdownCh: + return + } + } +} + +// runLeader runs the FSM for a leader. Do the setup here and drop into +// the leaderLoop for the hot loop. +func (r *Raft) runLeader() { + r.logger.Printf("[INFO] raft: %v entering Leader state", r) + metrics.IncrCounter([]string{"raft", "state", "leader"}, 1) + + // Notify that we are the leader + asyncNotifyBool(r.leaderCh, true) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- true: + case <-r.shutdownCh: + } + } + + // Setup leader state + r.leaderState.commitCh = make(chan struct{}, 1) + r.leaderState.commitment = newCommitment(r.leaderState.commitCh, + r.configurations.latest, + r.getLastIndex()+1 /* first index that may be committed in this term */) + r.leaderState.inflight = list.New() + r.leaderState.replState = make(map[ServerID]*followerReplication) + r.leaderState.notify = make(map[*verifyFuture]struct{}) + r.leaderState.stepDown = make(chan struct{}, 1) + + // Cleanup state on step down + defer func() { + // Since we were the leader previously, we update our + // last contact time when we step down, so that we are not + // reporting a last contact time from before we were the + // leader. Otherwise, to a client it would seem our data + // is extremely stale. + r.setLastContact() + + // Stop replication + for _, p := range r.leaderState.replState { + close(p.stopCh) + } + + // Respond to all inflight operations + for e := r.leaderState.inflight.Front(); e != nil; e = e.Next() { + e.Value.(*logFuture).respond(ErrLeadershipLost) + } + + // Respond to any pending verify requests + for future := range r.leaderState.notify { + future.respond(ErrLeadershipLost) + } + + // Clear all the state + r.leaderState.commitCh = nil + r.leaderState.commitment = nil + r.leaderState.inflight = nil + r.leaderState.replState = nil + r.leaderState.notify = nil + r.leaderState.stepDown = nil + + // If we are stepping down for some reason, no known leader. + // We may have stepped down due to an RPC call, which would + // provide the leader, so we cannot always blank this out. + r.leaderLock.Lock() + if r.leader == r.localAddr { + r.leader = "" + } + r.leaderLock.Unlock() + + // Notify that we are not the leader + asyncNotifyBool(r.leaderCh, false) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- false: + case <-r.shutdownCh: + // On shutdown, make a best effort but do not block + select { + case notify <- false: + default: + } + } + } + }() + + // Start a replication routine for each peer + r.startStopReplication() + + // Dispatch a no-op log entry first. This gets this leader up to the latest + // possible commit index, even in the absence of client commands. This used + // to append a configuration entry instead of a noop. However, that permits + // an unbounded number of uncommitted configurations in the log. We now + // maintain that there exists at most one uncommitted configuration entry in + // any log, so we have to do proper no-ops here. + noop := &logFuture{ + log: Log{ + Type: LogNoop, + }, + } + r.dispatchLogs([]*logFuture{noop}) + + // Sit in the leader loop until we step down + r.leaderLoop() +} + +// startStopReplication will set up state and start asynchronous replication to +// new peers, and stop replication to removed peers. Before removing a peer, +// it'll instruct the replication routines to try to replicate to the current +// index. This must only be called from the main thread. +func (r *Raft) startStopReplication() { + inConfig := make(map[ServerID]bool, len(r.configurations.latest.Servers)) + lastIdx := r.getLastIndex() + + // Start replication goroutines that need starting + for _, server := range r.configurations.latest.Servers { + if server.ID == r.localID { + continue + } + inConfig[server.ID] = true + if _, ok := r.leaderState.replState[server.ID]; !ok { + r.logger.Printf("[INFO] raft: Added peer %v, starting replication", server.ID) + s := &followerReplication{ + peer: server, + commitment: r.leaderState.commitment, + stopCh: make(chan uint64, 1), + triggerCh: make(chan struct{}, 1), + currentTerm: r.getCurrentTerm(), + nextIndex: lastIdx + 1, + lastContact: time.Now(), + notifyCh: make(chan struct{}, 1), + stepDown: r.leaderState.stepDown, + } + r.leaderState.replState[server.ID] = s + r.goFunc(func() { r.replicate(s) }) + asyncNotifyCh(s.triggerCh) + } + } + + // Stop replication goroutines that need stopping + for serverID, repl := range r.leaderState.replState { + if inConfig[serverID] { + continue + } + // Replicate up to lastIdx and stop + r.logger.Printf("[INFO] raft: Removed peer %v, stopping replication after %v", serverID, lastIdx) + repl.stopCh <- lastIdx + close(repl.stopCh) + delete(r.leaderState.replState, serverID) + } +} + +// configurationChangeChIfStable returns r.configurationChangeCh if it's safe +// to process requests from it, or nil otherwise. This must only be called +// from the main thread. +// +// Note that if the conditions here were to change outside of leaderLoop to take +// this from nil to non-nil, we would need leaderLoop to be kicked. +func (r *Raft) configurationChangeChIfStable() chan *configurationChangeFuture { + // Have to wait until: + // 1. The latest configuration is committed, and + // 2. This leader has committed some entry (the noop) in this term + // https://groups.google.com/forum/#!msg/raft-dev/t4xj6dJTP6E/d2D9LrWRza8J + if r.configurations.latestIndex == r.configurations.committedIndex && + r.getCommitIndex() >= r.leaderState.commitment.startIndex { + return r.configurationChangeCh + } + return nil +} + +// leaderLoop is the hot loop for a leader. It is invoked +// after all the various leader setup is done. +func (r *Raft) leaderLoop() { + // stepDown is used to track if there is an inflight log that + // would cause us to lose leadership (specifically a RemovePeer of + // ourselves). If this is the case, we must not allow any logs to + // be processed in parallel, otherwise we are basing commit on + // only a single peer (ourself) and replicating to an undefined set + // of peers. + stepDown := false + + lease := time.After(r.conf.LeaderLeaseTimeout) + for r.getState() == Leader { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case <-r.leaderState.stepDown: + r.setState(Follower) + + case <-r.leaderState.commitCh: + // Process the newly committed entries + oldCommitIndex := r.getCommitIndex() + commitIndex := r.leaderState.commitment.getCommitIndex() + r.setCommitIndex(commitIndex) + + if r.configurations.latestIndex > oldCommitIndex && + r.configurations.latestIndex <= commitIndex { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + if !hasVote(r.configurations.committed, r.localID) { + stepDown = true + } + } + + for { + e := r.leaderState.inflight.Front() + if e == nil { + break + } + commitLog := e.Value.(*logFuture) + idx := commitLog.log.Index + if idx > commitIndex { + break + } + // Measure the commit time + metrics.MeasureSince([]string{"raft", "commitTime"}, commitLog.dispatch) + r.processLogs(idx, commitLog) + r.leaderState.inflight.Remove(e) + } + + if stepDown { + if r.conf.ShutdownOnRemove { + r.logger.Printf("[INFO] raft: Removed ourself, shutting down") + r.Shutdown() + } else { + r.logger.Printf("[INFO] raft: Removed ourself, transitioning to follower") + r.setState(Follower) + } + } + + case v := <-r.verifyCh: + if v.quorumSize == 0 { + // Just dispatched, start the verification + r.verifyLeader(v) + + } else if v.votes < v.quorumSize { + // Early return, means there must be a new leader + r.logger.Printf("[WARN] raft: New leader elected, stepping down") + r.setState(Follower) + delete(r.leaderState.notify, v) + v.respond(ErrNotLeader) + + } else { + // Quorum of members agree, we are still leader + delete(r.leaderState.notify, v) + v.respond(nil) + } + + case future := <-r.userRestoreCh: + err := r.restoreUserSnapshot(future.meta, future.reader) + future.respond(err) + + case c := <-r.configurationsCh: + c.configurations = r.configurations.Clone() + c.respond(nil) + + case future := <-r.configurationChangeChIfStable(): + r.appendConfigurationEntry(future) + + case b := <-r.bootstrapCh: + b.respond(ErrCantBootstrap) + + case newLog := <-r.applyCh: + // Group commit, gather all the ready commits + ready := []*logFuture{newLog} + for i := 0; i < r.conf.MaxAppendEntries; i++ { + select { + case newLog := <-r.applyCh: + ready = append(ready, newLog) + default: + break + } + } + + // Dispatch the logs + if stepDown { + // we're in the process of stepping down as leader, don't process anything new + for i := range ready { + ready[i].respond(ErrNotLeader) + } + } else { + r.dispatchLogs(ready) + } + + case <-lease: + // Check if we've exceeded the lease, potentially stepping down + maxDiff := r.checkLeaderLease() + + // Next check interval should adjust for the last node we've + // contacted, without going negative + checkInterval := r.conf.LeaderLeaseTimeout - maxDiff + if checkInterval < minCheckInterval { + checkInterval = minCheckInterval + } + + // Renew the lease timer + lease = time.After(checkInterval) + + case <-r.shutdownCh: + return + } + } +} + +// verifyLeader must be called from the main thread for safety. +// Causes the followers to attempt an immediate heartbeat. +func (r *Raft) verifyLeader(v *verifyFuture) { + // Current leader always votes for self + v.votes = 1 + + // Set the quorum size, hot-path for single node + v.quorumSize = r.quorumSize() + if v.quorumSize == 1 { + v.respond(nil) + return + } + + // Track this request + v.notifyCh = r.verifyCh + r.leaderState.notify[v] = struct{}{} + + // Trigger immediate heartbeats + for _, repl := range r.leaderState.replState { + repl.notifyLock.Lock() + repl.notify = append(repl.notify, v) + repl.notifyLock.Unlock() + asyncNotifyCh(repl.notifyCh) + } +} + +// checkLeaderLease is used to check if we can contact a quorum of nodes +// within the last leader lease interval. If not, we need to step down, +// as we may have lost connectivity. Returns the maximum duration without +// contact. This must only be called from the main thread. +func (r *Raft) checkLeaderLease() time.Duration { + // Track contacted nodes, we can always contact ourself + contacted := 1 + + // Check each follower + var maxDiff time.Duration + now := time.Now() + for peer, f := range r.leaderState.replState { + diff := now.Sub(f.LastContact()) + if diff <= r.conf.LeaderLeaseTimeout { + contacted++ + if diff > maxDiff { + maxDiff = diff + } + } else { + // Log at least once at high value, then debug. Otherwise it gets very verbose. + if diff <= 3*r.conf.LeaderLeaseTimeout { + r.logger.Printf("[WARN] raft: Failed to contact %v in %v", peer, diff) + } else { + r.logger.Printf("[DEBUG] raft: Failed to contact %v in %v", peer, diff) + } + } + metrics.AddSample([]string{"raft", "leader", "lastContact"}, float32(diff/time.Millisecond)) + } + + // Verify we can contact a quorum + quorum := r.quorumSize() + if contacted < quorum { + r.logger.Printf("[WARN] raft: Failed to contact quorum of nodes, stepping down") + r.setState(Follower) + metrics.IncrCounter([]string{"raft", "transition", "leader_lease_timeout"}, 1) + } + return maxDiff +} + +// quorumSize is used to return the quorum size. This must only be called on +// the main thread. +// TODO: revisit usage +func (r *Raft) quorumSize() int { + voters := 0 + for _, server := range r.configurations.latest.Servers { + if server.Suffrage == Voter { + voters++ + } + } + return voters/2 + 1 +} + +// restoreUserSnapshot is used to manually consume an external snapshot, such +// as if restoring from a backup. We will use the current Raft configuration, +// not the one from the snapshot, so that we can restore into a new cluster. We +// will also use the higher of the index of the snapshot, or the current index, +// and then add 1 to that, so we force a new state with a hole in the Raft log, +// so that the snapshot will be sent to followers and used for any new joiners. +// This can only be run on the leader, and returns a future that can be used to +// block until complete. +func (r *Raft) restoreUserSnapshot(meta *SnapshotMeta, reader io.Reader) error { + defer metrics.MeasureSince([]string{"raft", "restoreUserSnapshot"}, time.Now()) + + // Sanity check the version. + version := meta.Version + if version < SnapshotVersionMin || version > SnapshotVersionMax { + return fmt.Errorf("unsupported snapshot version %d", version) + } + + // We don't support snapshots while there's a config change + // outstanding since the snapshot doesn't have a means to + // represent this state. + committedIndex := r.configurations.committedIndex + latestIndex := r.configurations.latestIndex + if committedIndex != latestIndex { + return fmt.Errorf("cannot restore snapshot now, wait until the configuration entry at %v has been applied (have applied %v)", + latestIndex, committedIndex) + } + + // Cancel any inflight requests. + for { + e := r.leaderState.inflight.Front() + if e == nil { + break + } + e.Value.(*logFuture).respond(ErrAbortedByRestore) + r.leaderState.inflight.Remove(e) + } + + // We will overwrite the snapshot metadata with the current term, + // an index that's greater than the current index, or the last + // index in the snapshot. It's important that we leave a hole in + // the index so we know there's nothing in the Raft log there and + // replication will fault and send the snapshot. + term := r.getCurrentTerm() + lastIndex := r.getLastIndex() + if meta.Index > lastIndex { + lastIndex = meta.Index + } + lastIndex++ + + // Dump the snapshot. Note that we use the latest configuration, + // not the one that came with the snapshot. + sink, err := r.snapshots.Create(version, lastIndex, term, + r.configurations.latest, r.configurations.latestIndex, r.trans) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + n, err := io.Copy(sink, reader) + if err != nil { + sink.Cancel() + return fmt.Errorf("failed to write snapshot: %v", err) + } + if n != meta.Size { + sink.Cancel() + return fmt.Errorf("failed to write snapshot, size didn't match (%d != %d)", n, meta.Size) + } + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to close snapshot: %v", err) + } + r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n) + + // Restore the snapshot into the FSM. If this fails we are in a + // bad state so we panic to take ourselves out. + fsm := &restoreFuture{ID: sink.ID()} + fsm.init() + select { + case r.fsmMutateCh <- fsm: + case <-r.shutdownCh: + return ErrRaftShutdown + } + if err := fsm.Error(); err != nil { + panic(fmt.Errorf("failed to restore snapshot: %v", err)) + } + + // We set the last log so it looks like we've stored the empty + // index we burned. The last applied is set because we made the + // FSM take the snapshot state, and we store the last snapshot + // in the stable store since we created a snapshot as part of + // this process. + r.setLastLog(lastIndex, term) + r.setLastApplied(lastIndex) + r.setLastSnapshot(lastIndex, term) + + r.logger.Printf("[INFO] raft: Restored user snapshot (index %d)", lastIndex) + return nil +} + +// appendConfigurationEntry changes the configuration and adds a new +// configuration entry to the log. This must only be called from the +// main thread. +func (r *Raft) appendConfigurationEntry(future *configurationChangeFuture) { + configuration, err := nextConfiguration(r.configurations.latest, r.configurations.latestIndex, future.req) + if err != nil { + future.respond(err) + return + } + + r.logger.Printf("[INFO] raft: Updating configuration with %s (%v, %v) to %+v", + future.req.command, future.req.serverID, future.req.serverAddress, configuration.Servers) + + // In pre-ID compatibility mode we translate all configuration changes + // in to an old remove peer message, which can handle all supported + // cases for peer changes in the pre-ID world (adding and removing + // voters). Both add peer and remove peer log entries are handled + // similarly on old Raft servers, but remove peer does extra checks to + // see if a leader needs to step down. Since they both assert the full + // configuration, then we can safely call remove peer for everything. + if r.protocolVersion < 2 { + future.log = Log{ + Type: LogRemovePeerDeprecated, + Data: encodePeers(configuration, r.trans), + } + } else { + future.log = Log{ + Type: LogConfiguration, + Data: encodeConfiguration(configuration), + } + } + + r.dispatchLogs([]*logFuture{&future.logFuture}) + index := future.Index() + r.configurations.latest = configuration + r.configurations.latestIndex = index + r.leaderState.commitment.setConfiguration(configuration) + r.startStopReplication() +} + +// dispatchLog is called on the leader to push a log to disk, mark it +// as inflight and begin replication of it. +func (r *Raft) dispatchLogs(applyLogs []*logFuture) { + now := time.Now() + defer metrics.MeasureSince([]string{"raft", "leader", "dispatchLog"}, now) + + term := r.getCurrentTerm() + lastIndex := r.getLastIndex() + logs := make([]*Log, len(applyLogs)) + + for idx, applyLog := range applyLogs { + applyLog.dispatch = now + lastIndex++ + applyLog.log.Index = lastIndex + applyLog.log.Term = term + logs[idx] = &applyLog.log + r.leaderState.inflight.PushBack(applyLog) + } + + // Write the log entry locally + if err := r.logs.StoreLogs(logs); err != nil { + r.logger.Printf("[ERR] raft: Failed to commit logs: %v", err) + for _, applyLog := range applyLogs { + applyLog.respond(err) + } + r.setState(Follower) + return + } + r.leaderState.commitment.match(r.localID, lastIndex) + + // Update the last log since it's on disk now + r.setLastLog(lastIndex, term) + + // Notify the replicators of the new log + for _, f := range r.leaderState.replState { + asyncNotifyCh(f.triggerCh) + } +} + +// processLogs is used to apply all the committed entires that haven't been +// applied up to the given index limit. +// This can be called from both leaders and followers. +// Followers call this from AppendEntires, for n entires at a time, and always +// pass future=nil. +// Leaders call this once per inflight when entries are committed. They pass +// the future from inflights. +func (r *Raft) processLogs(index uint64, future *logFuture) { + // Reject logs we've applied already + lastApplied := r.getLastApplied() + if index <= lastApplied { + r.logger.Printf("[WARN] raft: Skipping application of old log: %d", index) + return + } + + // Apply all the preceding logs + for idx := r.getLastApplied() + 1; idx <= index; idx++ { + // Get the log, either from the future or from our log store + if future != nil && future.log.Index == idx { + r.processLog(&future.log, future) + + } else { + l := new(Log) + if err := r.logs.GetLog(idx, l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", idx, err) + panic(err) + } + r.processLog(l, nil) + } + + // Update the lastApplied index and term + r.setLastApplied(idx) + } +} + +// processLog is invoked to process the application of a single committed log entry. +func (r *Raft) processLog(l *Log, future *logFuture) { + switch l.Type { + case LogBarrier: + // Barrier is handled by the FSM + fallthrough + + case LogCommand: + // Forward to the fsm handler + select { + case r.fsmMutateCh <- &commitTuple{l, future}: + case <-r.shutdownCh: + if future != nil { + future.respond(ErrRaftShutdown) + } + } + + // Return so that the future is only responded to + // by the FSM handler when the application is done + return + + case LogConfiguration: + case LogAddPeerDeprecated: + case LogRemovePeerDeprecated: + case LogNoop: + // Ignore the no-op + + default: + panic(fmt.Errorf("unrecognized log type: %#v", l)) + } + + // Invoke the future if given + if future != nil { + future.respond(nil) + } +} + +// processRPC is called to handle an incoming RPC request. This must only be +// called from the main thread. +func (r *Raft) processRPC(rpc RPC) { + if err := r.checkRPCHeader(rpc); err != nil { + rpc.Respond(nil, err) + return + } + + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + case *RequestVoteRequest: + r.requestVote(rpc, cmd) + case *InstallSnapshotRequest: + r.installSnapshot(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Got unexpected command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// processHeartbeat is a special handler used just for heartbeat requests +// so that they can be fast-pathed if a transport supports it. This must only +// be called from the main thread. +func (r *Raft) processHeartbeat(rpc RPC) { + defer metrics.MeasureSince([]string{"raft", "rpc", "processHeartbeat"}, time.Now()) + + // Check if we are shutdown, just ignore the RPC + select { + case <-r.shutdownCh: + return + default: + } + + // Ensure we are only handling a heartbeat + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Expected heartbeat, got command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// appendEntries is invoked when we get an append entries RPC call. This must +// only be called from the main thread. +func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now()) + // Setup a response + resp := &AppendEntriesResponse{ + RPCHeader: r.getRPCHeader(), + Term: r.getCurrentTerm(), + LastLog: r.getLastIndex(), + Success: false, + NoRetryBackoff: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if a.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one, also transition to follower + // if we ever get an appendEntries call + if a.Term > r.getCurrentTerm() || r.getState() != Follower { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(a.Term) + resp.Term = a.Term + } + + // Save the current leader + r.setLeader(ServerAddress(r.trans.DecodePeer(a.Leader))) + + // Verify the last log entry + if a.PrevLogEntry > 0 { + lastIdx, lastTerm := r.getLastEntry() + + var prevLogTerm uint64 + if a.PrevLogEntry == lastIdx { + prevLogTerm = lastTerm + + } else { + var prevLog Log + if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil { + r.logger.Printf("[WARN] raft: Failed to get previous log: %d %v (last: %d)", + a.PrevLogEntry, err, lastIdx) + resp.NoRetryBackoff = true + return + } + prevLogTerm = prevLog.Term + } + + if a.PrevLogTerm != prevLogTerm { + r.logger.Printf("[WARN] raft: Previous log term mis-match: ours: %d remote: %d", + prevLogTerm, a.PrevLogTerm) + resp.NoRetryBackoff = true + return + } + } + + // Process any new entries + if len(a.Entries) > 0 { + start := time.Now() + + // Delete any conflicting entries, skip any duplicates + lastLogIdx, _ := r.getLastLog() + var newEntries []*Log + for i, entry := range a.Entries { + if entry.Index > lastLogIdx { + newEntries = a.Entries[i:] + break + } + var storeEntry Log + if err := r.logs.GetLog(entry.Index, &storeEntry); err != nil { + r.logger.Printf("[WARN] raft: Failed to get log entry %d: %v", + entry.Index, err) + return + } + if entry.Term != storeEntry.Term { + r.logger.Printf("[WARN] raft: Clearing log suffix from %d to %d", entry.Index, lastLogIdx) + if err := r.logs.DeleteRange(entry.Index, lastLogIdx); err != nil { + r.logger.Printf("[ERR] raft: Failed to clear log suffix: %v", err) + return + } + if entry.Index <= r.configurations.latestIndex { + r.configurations.latest = r.configurations.committed + r.configurations.latestIndex = r.configurations.committedIndex + } + newEntries = a.Entries[i:] + break + } + } + + if n := len(newEntries); n > 0 { + // Append the new entries + if err := r.logs.StoreLogs(newEntries); err != nil { + r.logger.Printf("[ERR] raft: Failed to append to logs: %v", err) + // TODO: leaving r.getLastLog() in the wrong + // state if there was a truncation above + return + } + + // Handle any new configuration changes + for _, newEntry := range newEntries { + r.processConfigurationLogEntry(newEntry) + } + + // Update the lastLog + last := newEntries[n-1] + r.setLastLog(last.Index, last.Term) + } + + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start) + } + + // Update the commit index + if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() { + start := time.Now() + idx := min(a.LeaderCommitIndex, r.getLastIndex()) + r.setCommitIndex(idx) + if r.configurations.latestIndex <= idx { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + } + r.processLogs(idx, nil) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start) + } + + // Everything went well, set success + resp.Success = true + r.setLastContact() + return +} + +// processConfigurationLogEntry takes a log entry and updates the latest +// configuration if the entry results in a new configuration. This must only be +// called from the main thread, or from NewRaft() before any threads have begun. +func (r *Raft) processConfigurationLogEntry(entry *Log) { + if entry.Type == LogConfiguration { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + r.configurations.latest = decodeConfiguration(entry.Data) + r.configurations.latestIndex = entry.Index + } else if entry.Type == LogAddPeerDeprecated || entry.Type == LogRemovePeerDeprecated { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + r.configurations.latest = decodePeers(entry.Data, r.trans) + r.configurations.latestIndex = entry.Index + } +} + +// requestVote is invoked when we get an request vote RPC call. +func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now()) + r.observe(*req) + + // Setup a response + resp := &RequestVoteResponse{ + RPCHeader: r.getRPCHeader(), + Term: r.getCurrentTerm(), + Granted: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Version 0 servers will panic unless the peers is present. It's only + // used on them to produce a warning message. + if r.protocolVersion < 2 { + resp.Peers = encodePeers(r.configurations.latest, r.trans) + } + + // Check if we have an existing leader [who's not the candidate] + candidate := r.trans.DecodePeer(req.Candidate) + if leader := r.Leader(); leader != "" && leader != candidate { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since we have a leader: %v", + candidate, leader) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Check if we have voted yet + lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote term: %v", err) + return + } + lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote candidate: %v", err) + return + } + + // Check if we've voted in this election before + if lastVoteTerm == req.Term && lastVoteCandBytes != nil { + r.logger.Printf("[INFO] raft: Duplicate RequestVote for same term: %d", req.Term) + if bytes.Compare(lastVoteCandBytes, req.Candidate) == 0 { + r.logger.Printf("[WARN] raft: Duplicate RequestVote from candidate: %s", req.Candidate) + resp.Granted = true + } + return + } + + // Reject if their term is older + lastIdx, lastTerm := r.getLastEntry() + if lastTerm > req.LastLogTerm { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last term is greater (%d, %d)", + candidate, lastTerm, req.LastLogTerm) + return + } + + if lastTerm == req.LastLogTerm && lastIdx > req.LastLogIndex { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last index is greater (%d, %d)", + candidate, lastIdx, req.LastLogIndex) + return + } + + // Persist a vote for safety + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote: %v", err) + return + } + + resp.Granted = true + r.setLastContact() + return +} + +// installSnapshot is invoked when we get a InstallSnapshot RPC call. +// We must be in the follower state for this, since it means we are +// too far behind a leader for log replay. This must only be called +// from the main thread. +func (r *Raft) installSnapshot(rpc RPC, req *InstallSnapshotRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "installSnapshot"}, time.Now()) + // Setup a response + resp := &InstallSnapshotResponse{ + Term: r.getCurrentTerm(), + Success: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Sanity check the version + if req.SnapshotVersion < SnapshotVersionMin || + req.SnapshotVersion > SnapshotVersionMax { + rpcErr = fmt.Errorf("unsupported snapshot version %d", req.SnapshotVersion) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Save the current leader + r.setLeader(ServerAddress(r.trans.DecodePeer(req.Leader))) + + // Create a new snapshot + var reqConfiguration Configuration + var reqConfigurationIndex uint64 + if req.SnapshotVersion > 0 { + reqConfiguration = decodeConfiguration(req.Configuration) + reqConfigurationIndex = req.ConfigurationIndex + } else { + reqConfiguration = decodePeers(req.Peers, r.trans) + reqConfigurationIndex = req.LastLogIndex + } + version := getSnapshotVersion(r.protocolVersion) + sink, err := r.snapshots.Create(version, req.LastLogIndex, req.LastLogTerm, + reqConfiguration, reqConfigurationIndex, r.trans) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to create snapshot to install: %v", err) + rpcErr = fmt.Errorf("failed to create snapshot: %v", err) + return + } + + // Spill the remote snapshot to disk + n, err := io.Copy(sink, rpc.Reader) + if err != nil { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to copy snapshot: %v", err) + rpcErr = err + return + } + + // Check that we received it all + if n != req.Size { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to receive whole snapshot: %d / %d", n, req.Size) + rpcErr = fmt.Errorf("short read") + return + } + + // Finalize the snapshot + if err := sink.Close(); err != nil { + r.logger.Printf("[ERR] raft: Failed to finalize snapshot: %v", err) + rpcErr = err + return + } + r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n) + + // Restore snapshot + future := &restoreFuture{ID: sink.ID()} + future.init() + select { + case r.fsmMutateCh <- future: + case <-r.shutdownCh: + future.respond(ErrRaftShutdown) + return + } + + // Wait for the restore to happen + if err := future.Error(); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot: %v", err) + rpcErr = err + return + } + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(req.LastLogIndex) + + // Update the last stable snapshot info + r.setLastSnapshot(req.LastLogIndex, req.LastLogTerm) + + // Restore the peer set + r.configurations.latest = reqConfiguration + r.configurations.latestIndex = reqConfigurationIndex + r.configurations.committed = reqConfiguration + r.configurations.committedIndex = reqConfigurationIndex + + // Compact logs, continue even if this fails + if err := r.compactLogs(req.LastLogIndex); err != nil { + r.logger.Printf("[ERR] raft: Failed to compact logs: %v", err) + } + + r.logger.Printf("[INFO] raft: Installed remote snapshot") + resp.Success = true + r.setLastContact() + return +} + +// setLastContact is used to set the last contact time to now +func (r *Raft) setLastContact() { + r.lastContactLock.Lock() + r.lastContact = time.Now() + r.lastContactLock.Unlock() +} + +type voteResult struct { + RequestVoteResponse + voterID ServerID +} + +// electSelf is used to send a RequestVote RPC to all peers, and vote for +// ourself. This has the side affecting of incrementing the current term. The +// response channel returned is used to wait for all the responses (including a +// vote for ourself). This must only be called from the main thread. +func (r *Raft) electSelf() <-chan *voteResult { + // Create a response channel + respCh := make(chan *voteResult, len(r.configurations.latest.Servers)) + + // Increment the term + r.setCurrentTerm(r.getCurrentTerm() + 1) + + // Construct the request + lastIdx, lastTerm := r.getLastEntry() + req := &RequestVoteRequest{ + RPCHeader: r.getRPCHeader(), + Term: r.getCurrentTerm(), + Candidate: r.trans.EncodePeer(r.localAddr), + LastLogIndex: lastIdx, + LastLogTerm: lastTerm, + } + + // Construct a function to ask for a vote + askPeer := func(peer Server) { + r.goFunc(func() { + defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now()) + resp := &voteResult{voterID: peer.ID} + err := r.trans.RequestVote(peer.Address, req, &resp.RequestVoteResponse) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err) + resp.Term = req.Term + resp.Granted = false + } + respCh <- resp + }) + } + + // For each peer, request a vote + for _, server := range r.configurations.latest.Servers { + if server.Suffrage == Voter { + if server.ID == r.localID { + // Persist a vote for ourselves + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote : %v", err) + return nil + } + // Include our own vote + respCh <- &voteResult{ + RequestVoteResponse: RequestVoteResponse{ + RPCHeader: r.getRPCHeader(), + Term: req.Term, + Granted: true, + }, + voterID: r.localID, + } + } else { + askPeer(server) + } + } + } + + return respCh +} + +// persistVote is used to persist our vote for safety. +func (r *Raft) persistVote(term uint64, candidate []byte) error { + if err := r.stable.SetUint64(keyLastVoteTerm, term); err != nil { + return err + } + if err := r.stable.Set(keyLastVoteCand, candidate); err != nil { + return err + } + return nil +} + +// setCurrentTerm is used to set the current term in a durable manner. +func (r *Raft) setCurrentTerm(t uint64) { + // Persist to disk first + if err := r.stable.SetUint64(keyCurrentTerm, t); err != nil { + panic(fmt.Errorf("failed to save current term: %v", err)) + } + r.raftState.setCurrentTerm(t) +} + +// setState is used to update the current state. Any state +// transition causes the known leader to be cleared. This means +// that leader should be set only after updating the state. +func (r *Raft) setState(state RaftState) { + r.setLeader("") + oldState := r.raftState.getState() + r.raftState.setState(state) + if oldState != state { + r.observe(state) + } +} diff --git a/vendor/github.com/hashicorp/raft/replication.go b/vendor/github.com/hashicorp/raft/replication.go new file mode 100644 index 000000000..683927343 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/replication.go @@ -0,0 +1,561 @@ +package raft + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + maxFailureScale = 12 + failureWait = 10 * time.Millisecond +) + +var ( + // ErrLogNotFound indicates a given log entry is not available. + ErrLogNotFound = errors.New("log not found") + + // ErrPipelineReplicationNotSupported can be returned by the transport to + // signal that pipeline replication is not supported in general, and that + // no error message should be produced. + ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported") +) + +// followerReplication is in charge of sending snapshots and log entries from +// this leader during this particular term to a remote follower. +type followerReplication struct { + // peer contains the network address and ID of the remote follower. + peer Server + + // commitment tracks the entries acknowledged by followers so that the + // leader's commit index can advance. It is updated on successsful + // AppendEntries responses. + commitment *commitment + + // stopCh is notified/closed when this leader steps down or the follower is + // removed from the cluster. In the follower removed case, it carries a log + // index; replication should be attempted with a best effort up through that + // index, before exiting. + stopCh chan uint64 + // triggerCh is notified every time new entries are appended to the log. + triggerCh chan struct{} + + // currentTerm is the term of this leader, to be included in AppendEntries + // requests. + currentTerm uint64 + // nextIndex is the index of the next log entry to send to the follower, + // which may fall past the end of the log. + nextIndex uint64 + + // lastContact is updated to the current time whenever any response is + // received from the follower (successful or not). This is used to check + // whether the leader should step down (Raft.checkLeaderLease()). + lastContact time.Time + // lastContactLock protects 'lastContact'. + lastContactLock sync.RWMutex + + // failures counts the number of failed RPCs since the last success, which is + // used to apply backoff. + failures uint64 + + // notifyCh is notified to send out a heartbeat, which is used to check that + // this server is still leader. + notifyCh chan struct{} + // notify is a list of futures to be resolved upon receipt of an + // acknowledgement, then cleared from this list. + notify []*verifyFuture + // notifyLock protects 'notify'. + notifyLock sync.Mutex + + // stepDown is used to indicate to the leader that we + // should step down based on information from a follower. + stepDown chan struct{} + + // allowPipeline is used to determine when to pipeline the AppendEntries RPCs. + // It is private to this replication goroutine. + allowPipeline bool +} + +// notifyAll is used to notify all the waiting verify futures +// if the follower believes we are still the leader. +func (s *followerReplication) notifyAll(leader bool) { + // Clear the waiting notifies minimizing lock time + s.notifyLock.Lock() + n := s.notify + s.notify = nil + s.notifyLock.Unlock() + + // Submit our votes + for _, v := range n { + v.vote(leader) + } +} + +// LastContact returns the time of last contact. +func (s *followerReplication) LastContact() time.Time { + s.lastContactLock.RLock() + last := s.lastContact + s.lastContactLock.RUnlock() + return last +} + +// setLastContact sets the last contact to the current time. +func (s *followerReplication) setLastContact() { + s.lastContactLock.Lock() + s.lastContact = time.Now() + s.lastContactLock.Unlock() +} + +// replicate is a long running routine that replicates log entries to a single +// follower. +func (r *Raft) replicate(s *followerReplication) { + // Start an async heartbeating routing + stopHeartbeat := make(chan struct{}) + defer close(stopHeartbeat) + r.goFunc(func() { r.heartbeat(s, stopHeartbeat) }) + +RPC: + shouldStop := false + for !shouldStop { + select { + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.replicateTo(s, maxIndex) + } + return + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): // TODO: what is this? + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + } + + // If things looks healthy, switch to pipeline mode + if !shouldStop && s.allowPipeline { + goto PIPELINE + } + } + return + +PIPELINE: + // Disable until re-enabled + s.allowPipeline = false + + // Replicates using a pipeline for high performance. This method + // is not able to gracefully recover from errors, and so we fall back + // to standard mode on failure. + if err := r.pipelineReplicate(s); err != nil { + if err != ErrPipelineReplicationNotSupported { + r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err) + } + } + goto RPC +} + +// replicateTo is a hepler to replicate(), used to replicate the logs up to a +// given last index. +// If the follower log is behind, we take care to bring them up to date. +func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) { + // Create the base request + var req AppendEntriesRequest + var resp AppendEntriesResponse + var start time.Time +START: + // Prevent an excessive retry rate on errors + if s.failures > 0 { + select { + case <-time.After(backoff(failureWait, s.failures, maxFailureScale)): + case <-r.shutdownCh: + } + } + + // Setup the request + if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound { + goto SEND_SNAP + } else if err != nil { + return + } + + // Make the RPC call + start = time.Now() + if err := r.trans.AppendEntries(s.peer.Address, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err) + s.failures++ + return + } + appendStats(string(s.peer.ID), start, float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true + } + + // Update the last contact + s.setLastContact() + + // Update s based on success + if resp.Success { + // Update our replication state + updateLastAppended(s, &req) + + // Clear any failures, allow pipelining + s.failures = 0 + s.allowPipeline = true + } else { + s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1) + if resp.NoRetryBackoff { + s.failures = 0 + } else { + s.failures++ + } + r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex) + } + +CHECK_MORE: + // Poll the stop channel here in case we are looping and have been asked + // to stop, or have stepped down as leader. Even for the best effort case + // where we are asked to replicate to a given index and then shutdown, + // it's better to not loop in here to send lots of entries to a straggler + // that's leaving the cluster anyways. + select { + case <-s.stopCh: + return true + default: + } + + // Check if there are more logs to replicate + if s.nextIndex <= lastIndex { + goto START + } + return + + // SEND_SNAP is used when we fail to get a log, usually because the follower + // is too far behind, and we must ship a snapshot down instead +SEND_SNAP: + if stop, err := r.sendLatestSnapshot(s); stop { + return true + } else if err != nil { + r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err) + return + } + + // Check if there is more to replicate + goto CHECK_MORE +} + +// sendLatestSnapshot is used to send the latest snapshot we have +// down to our follower. +func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) { + // Get the snapshots + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return false, err + } + + // Check we have at least a single snapshot + if len(snapshots) == 0 { + return false, fmt.Errorf("no snapshots found") + } + + // Open the most recent snapshot + snapID := snapshots[0].ID + meta, snapshot, err := r.snapshots.Open(snapID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err) + return false, err + } + defer snapshot.Close() + + // Setup the request + req := InstallSnapshotRequest{ + RPCHeader: r.getRPCHeader(), + SnapshotVersion: meta.Version, + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + LastLogIndex: meta.Index, + LastLogTerm: meta.Term, + Peers: meta.Peers, + Size: meta.Size, + Configuration: encodeConfiguration(meta.Configuration), + ConfigurationIndex: meta.ConfigurationIndex, + } + + // Make the call + start := time.Now() + var resp InstallSnapshotResponse + if err := r.trans.InstallSnapshot(s.peer.Address, &req, &resp, snapshot); err != nil { + r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err) + s.failures++ + return false, err + } + metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", string(s.peer.ID)}, start) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true, nil + } + + // Update the last contact + s.setLastContact() + + // Check for success + if resp.Success { + // Update the indexes + s.nextIndex = meta.Index + 1 + s.commitment.match(s.peer.ID, meta.Index) + + // Clear any failures + s.failures = 0 + + // Notify we are still leader + s.notifyAll(true) + } else { + s.failures++ + r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer) + } + return false, nil +} + +// heartbeat is used to periodically invoke AppendEntries on a peer +// to ensure they don't time out. This is done async of replicate(), +// since that routine could potentially be blocked on disk IO. +func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) { + var failures uint64 + req := AppendEntriesRequest{ + RPCHeader: r.getRPCHeader(), + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + } + var resp AppendEntriesResponse + for { + // Wait for the next heartbeat interval or forced notify + select { + case <-s.notifyCh: + case <-randomTimeout(r.conf.HeartbeatTimeout / 10): + case <-stopCh: + return + } + + start := time.Now() + if err := r.trans.AppendEntries(s.peer.Address, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer.Address, err) + failures++ + select { + case <-time.After(backoff(failureWait, failures, maxFailureScale)): + case <-stopCh: + } + } else { + s.setLastContact() + failures = 0 + metrics.MeasureSince([]string{"raft", "replication", "heartbeat", string(s.peer.ID)}, start) + s.notifyAll(resp.Success) + } + } +} + +// pipelineReplicate is used when we have synchronized our state with the follower, +// and want to switch to a higher performance pipeline mode of replication. +// We only pipeline AppendEntries commands, and if we ever hit an error, we fall +// back to the standard replication which can handle more complex situations. +func (r *Raft) pipelineReplicate(s *followerReplication) error { + // Create a new pipeline + pipeline, err := r.trans.AppendEntriesPipeline(s.peer.Address) + if err != nil { + return err + } + defer pipeline.Close() + + // Log start and stop of pipeline + r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer) + defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer) + + // Create a shutdown and finish channel + stopCh := make(chan struct{}) + finishCh := make(chan struct{}) + + // Start a dedicated decoder + r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) }) + + // Start pipeline sends at the last good nextIndex + nextIndex := s.nextIndex + + shouldStop := false +SEND: + for !shouldStop { + select { + case <-finishCh: + break SEND + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.pipelineSend(s, pipeline, &nextIndex, maxIndex) + } + break SEND + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + } + } + + // Stop our decoder, and wait for it to finish + close(stopCh) + select { + case <-finishCh: + case <-r.shutdownCh: + } + return nil +} + +// pipelineSend is used to send data over a pipeline. It is a helper to +// pipelineReplicate. +func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) { + // Create a new append request + req := new(AppendEntriesRequest) + if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil { + return true + } + + // Pipeline the append entries + if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil { + r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err) + return true + } + + // Increase the next send log to avoid re-sending old logs + if n := len(req.Entries); n > 0 { + last := req.Entries[n-1] + *nextIdx = last.Index + 1 + } + return false +} + +// pipelineDecode is used to decode the responses of pipelined requests. +func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) { + defer close(finishCh) + respCh := p.Consumer() + for { + select { + case ready := <-respCh: + req, resp := ready.Request(), ready.Response() + appendStats(string(s.peer.ID), ready.Start(), float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return + } + + // Update the last contact + s.setLastContact() + + // Abort pipeline if not successful + if !resp.Success { + return + } + + // Update our replication state + updateLastAppended(s, req) + case <-stopCh: + return + } + } +} + +// setupAppendEntries is used to setup an append entries request. +func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + req.RPCHeader = r.getRPCHeader() + req.Term = s.currentTerm + req.Leader = r.trans.EncodePeer(r.localAddr) + req.LeaderCommitIndex = r.getCommitIndex() + if err := r.setPreviousLog(req, nextIndex); err != nil { + return err + } + if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil { + return err + } + return nil +} + +// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an +// AppendEntriesRequest given the next index to replicate. +func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error { + // Guard for the first index, since there is no 0 log entry + // Guard against the previous index being a snapshot as well + lastSnapIdx, lastSnapTerm := r.getLastSnapshot() + if nextIndex == 1 { + req.PrevLogEntry = 0 + req.PrevLogTerm = 0 + + } else if (nextIndex - 1) == lastSnapIdx { + req.PrevLogEntry = lastSnapIdx + req.PrevLogTerm = lastSnapTerm + + } else { + var l Log + if err := r.logs.GetLog(nextIndex-1, &l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", + nextIndex-1, err) + return err + } + + // Set the previous index and term (0 if nextIndex is 1) + req.PrevLogEntry = l.Index + req.PrevLogTerm = l.Term + } + return nil +} + +// setNewLogs is used to setup the logs which should be appended for a request. +func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + // Append up to MaxAppendEntries or up to the lastIndex + req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries) + maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex) + for i := nextIndex; i <= maxIndex; i++ { + oldLog := new(Log) + if err := r.logs.GetLog(i, oldLog); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err) + return err + } + req.Entries = append(req.Entries, oldLog) + } + return nil +} + +// appendStats is used to emit stats about an AppendEntries invocation. +func appendStats(peer string, start time.Time, logs float32) { + metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start) + metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs) +} + +// handleStaleTerm is used when a follower indicates that we have a stale term. +func (r *Raft) handleStaleTerm(s *followerReplication) { + r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer) + s.notifyAll(false) // No longer leader + asyncNotifyCh(s.stepDown) +} + +// updateLastAppended is used to update follower replication state after a +// successful AppendEntries RPC. +// TODO: This isn't used during InstallSnapshot, but the code there is similar. +func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) { + // Mark any inflight logs as committed + if logs := req.Entries; len(logs) > 0 { + last := logs[len(logs)-1] + s.nextIndex = last.Index + 1 + s.commitment.match(s.peer.ID, last.Index) + } + + // Notify still leader + s.notifyAll(true) +} diff --git a/vendor/github.com/hashicorp/raft/snapshot.go b/vendor/github.com/hashicorp/raft/snapshot.go new file mode 100644 index 000000000..5287ebc41 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/snapshot.go @@ -0,0 +1,239 @@ +package raft + +import ( + "fmt" + "io" + "time" + + "github.com/armon/go-metrics" +) + +// SnapshotMeta is for metadata of a snapshot. +type SnapshotMeta struct { + // Version is the version number of the snapshot metadata. This does not cover + // the application's data in the snapshot, that should be versioned + // separately. + Version SnapshotVersion + + // ID is opaque to the store, and is used for opening. + ID string + + // Index and Term store when the snapshot was taken. + Index uint64 + Term uint64 + + // Peers is deprecated and used to support version 0 snapshots, but will + // be populated in version 1 snapshots as well to help with upgrades. + Peers []byte + + // Configuration and ConfigurationIndex are present in version 1 + // snapshots and later. + Configuration Configuration + ConfigurationIndex uint64 + + // Size is the size of the snapshot in bytes. + Size int64 +} + +// SnapshotStore interface is used to allow for flexible implementations +// of snapshot storage and retrieval. For example, a client could implement +// a shared state store such as S3, allowing new nodes to restore snapshots +// without streaming from the leader. +type SnapshotStore interface { + // Create is used to begin a snapshot at a given index and term, and with + // the given committed configuration. The version parameter controls + // which snapshot version to create. + Create(version SnapshotVersion, index, term uint64, configuration Configuration, + configurationIndex uint64, trans Transport) (SnapshotSink, error) + + // List is used to list the available snapshots in the store. + // It should return then in descending order, with the highest index first. + List() ([]*SnapshotMeta, error) + + // Open takes a snapshot ID and provides a ReadCloser. Once close is + // called it is assumed the snapshot is no longer needed. + Open(id string) (*SnapshotMeta, io.ReadCloser, error) +} + +// SnapshotSink is returned by StartSnapshot. The FSM will Write state +// to the sink and call Close on completion. On error, Cancel will be invoked. +type SnapshotSink interface { + io.WriteCloser + ID() string + Cancel() error +} + +// runSnapshots is a long running goroutine used to manage taking +// new snapshots of the FSM. It runs in parallel to the FSM and +// main goroutines, so that snapshots do not block normal operation. +func (r *Raft) runSnapshots() { + for { + select { + case <-randomTimeout(r.conf.SnapshotInterval): + // Check if we should snapshot + if !r.shouldSnapshot() { + continue + } + + // Trigger a snapshot + if _, err := r.takeSnapshot(); err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + + case future := <-r.userSnapshotCh: + // User-triggered, run immediately + id, err := r.takeSnapshot() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } else { + future.opener = func() (*SnapshotMeta, io.ReadCloser, error) { + return r.snapshots.Open(id) + } + } + future.respond(err) + + case <-r.shutdownCh: + return + } + } +} + +// shouldSnapshot checks if we meet the conditions to take +// a new snapshot. +func (r *Raft) shouldSnapshot() bool { + // Check the last snapshot index + lastSnap, _ := r.getLastSnapshot() + + // Check the last log index + lastIdx, err := r.logs.LastIndex() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to get last log index: %v", err) + return false + } + + // Compare the delta to the threshold + delta := lastIdx - lastSnap + return delta >= r.conf.SnapshotThreshold +} + +// takeSnapshot is used to take a new snapshot. This must only be called from +// the snapshot thread, never the main thread. This returns the ID of the new +// snapshot, along with an error. +func (r *Raft) takeSnapshot() (string, error) { + defer metrics.MeasureSince([]string{"raft", "snapshot", "takeSnapshot"}, time.Now()) + + // Create a request for the FSM to perform a snapshot. + snapReq := &reqSnapshotFuture{} + snapReq.init() + + // Wait for dispatch or shutdown. + select { + case r.fsmSnapshotCh <- snapReq: + case <-r.shutdownCh: + return "", ErrRaftShutdown + } + + // Wait until we get a response + if err := snapReq.Error(); err != nil { + if err != ErrNothingNewToSnapshot { + err = fmt.Errorf("failed to start snapshot: %v", err) + } + return "", err + } + defer snapReq.snapshot.Release() + + // Make a request for the configurations and extract the committed info. + // We have to use the future here to safely get this information since + // it is owned by the main thread. + configReq := &configurationsFuture{} + configReq.init() + select { + case r.configurationsCh <- configReq: + case <-r.shutdownCh: + return "", ErrRaftShutdown + } + if err := configReq.Error(); err != nil { + return "", err + } + committed := configReq.configurations.committed + committedIndex := configReq.configurations.committedIndex + + // We don't support snapshots while there's a config change outstanding + // since the snapshot doesn't have a means to represent this state. This + // is a little weird because we need the FSM to apply an index that's + // past the configuration change, even though the FSM itself doesn't see + // the configuration changes. It should be ok in practice with normal + // application traffic flowing through the FSM. If there's none of that + // then it's not crucial that we snapshot, since there's not much going + // on Raft-wise. + if snapReq.index < committedIndex { + return "", fmt.Errorf("cannot take snapshot now, wait until the configuration entry at %v has been applied (have applied %v)", + committedIndex, snapReq.index) + } + + // Create a new snapshot. + r.logger.Printf("[INFO] raft: Starting snapshot up to %d", snapReq.index) + start := time.Now() + version := getSnapshotVersion(r.protocolVersion) + sink, err := r.snapshots.Create(version, snapReq.index, snapReq.term, committed, committedIndex, r.trans) + if err != nil { + return "", fmt.Errorf("failed to create snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "create"}, start) + + // Try to persist the snapshot. + start = time.Now() + if err := snapReq.snapshot.Persist(sink); err != nil { + sink.Cancel() + return "", fmt.Errorf("failed to persist snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "persist"}, start) + + // Close and check for error. + if err := sink.Close(); err != nil { + return "", fmt.Errorf("failed to close snapshot: %v", err) + } + + // Update the last stable snapshot info. + r.setLastSnapshot(snapReq.index, snapReq.term) + + // Compact the logs. + if err := r.compactLogs(snapReq.index); err != nil { + return "", err + } + + r.logger.Printf("[INFO] raft: Snapshot to %d complete", snapReq.index) + return sink.ID(), nil +} + +// compactLogs takes the last inclusive index of a snapshot +// and trims the logs that are no longer needed. +func (r *Raft) compactLogs(snapIdx uint64) error { + defer metrics.MeasureSince([]string{"raft", "compactLogs"}, time.Now()) + // Determine log ranges to compact + minLog, err := r.logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + + // Check if we have enough logs to truncate + lastLogIdx, _ := r.getLastLog() + if lastLogIdx <= r.conf.TrailingLogs { + return nil + } + + // Truncate up to the end of the snapshot, or `TrailingLogs` + // back from the head, which ever is further back. This ensures + // at least `TrailingLogs` entries, but does not allow logs + // after the snapshot to be removed. + maxLog := min(snapIdx, lastLogIdx-r.conf.TrailingLogs) + + // Log this + r.logger.Printf("[INFO] raft: Compacting logs from %d to %d", minLog, maxLog) + + // Compact the logs + if err := r.logs.DeleteRange(minLog, maxLog); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + return nil +} diff --git a/vendor/github.com/hashicorp/raft/stable.go b/vendor/github.com/hashicorp/raft/stable.go new file mode 100644 index 000000000..ff59a8c57 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/stable.go @@ -0,0 +1,15 @@ +package raft + +// StableStore is used to provide stable storage +// of key configurations to ensure safety. +type StableStore interface { + Set(key []byte, val []byte) error + + // Get returns the value for key, or an empty byte slice if key was not found. + Get(key []byte) ([]byte, error) + + SetUint64(key []byte, val uint64) error + + // GetUint64 returns the uint64 value for key, or 0 if key was not found. + GetUint64(key []byte) (uint64, error) +} diff --git a/vendor/github.com/hashicorp/raft/state.go b/vendor/github.com/hashicorp/raft/state.go new file mode 100644 index 000000000..f6d658b8b --- /dev/null +++ b/vendor/github.com/hashicorp/raft/state.go @@ -0,0 +1,167 @@ +package raft + +import ( + "sync" + "sync/atomic" +) + +// RaftState captures the state of a Raft node: Follower, Candidate, Leader, +// or Shutdown. +type RaftState uint32 + +const ( + // Follower is the initial state of a Raft node. + Follower RaftState = iota + + // Candidate is one of the valid states of a Raft node. + Candidate + + // Leader is one of the valid states of a Raft node. + Leader + + // Shutdown is the terminal state of a Raft node. + Shutdown +) + +func (s RaftState) String() string { + switch s { + case Follower: + return "Follower" + case Candidate: + return "Candidate" + case Leader: + return "Leader" + case Shutdown: + return "Shutdown" + default: + return "Unknown" + } +} + +// raftState is used to maintain various state variables +// and provides an interface to set/get the variables in a +// thread safe manner. +type raftState struct { + // The current term, cache of StableStore + currentTerm uint64 + + // Highest committed log entry + commitIndex uint64 + + // Last applied log to the FSM + lastApplied uint64 + + // protects 4 next fields + lastLock sync.Mutex + + // Cache the latest snapshot index/term + lastSnapshotIndex uint64 + lastSnapshotTerm uint64 + + // Cache the latest log from LogStore + lastLogIndex uint64 + lastLogTerm uint64 + + // Tracks running goroutines + routinesGroup sync.WaitGroup + + // The current state + state RaftState +} + +func (r *raftState) getState() RaftState { + stateAddr := (*uint32)(&r.state) + return RaftState(atomic.LoadUint32(stateAddr)) +} + +func (r *raftState) setState(s RaftState) { + stateAddr := (*uint32)(&r.state) + atomic.StoreUint32(stateAddr, uint32(s)) +} + +func (r *raftState) getCurrentTerm() uint64 { + return atomic.LoadUint64(&r.currentTerm) +} + +func (r *raftState) setCurrentTerm(term uint64) { + atomic.StoreUint64(&r.currentTerm, term) +} + +func (r *raftState) getLastLog() (index, term uint64) { + r.lastLock.Lock() + index = r.lastLogIndex + term = r.lastLogTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastLog(index, term uint64) { + r.lastLock.Lock() + r.lastLogIndex = index + r.lastLogTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getLastSnapshot() (index, term uint64) { + r.lastLock.Lock() + index = r.lastSnapshotIndex + term = r.lastSnapshotTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastSnapshot(index, term uint64) { + r.lastLock.Lock() + r.lastSnapshotIndex = index + r.lastSnapshotTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getCommitIndex() uint64 { + return atomic.LoadUint64(&r.commitIndex) +} + +func (r *raftState) setCommitIndex(index uint64) { + atomic.StoreUint64(&r.commitIndex, index) +} + +func (r *raftState) getLastApplied() uint64 { + return atomic.LoadUint64(&r.lastApplied) +} + +func (r *raftState) setLastApplied(index uint64) { + atomic.StoreUint64(&r.lastApplied, index) +} + +// Start a goroutine and properly handle the race between a routine +// starting and incrementing, and exiting and decrementing. +func (r *raftState) goFunc(f func()) { + r.routinesGroup.Add(1) + go func() { + defer r.routinesGroup.Done() + f() + }() +} + +func (r *raftState) waitShutdown() { + r.routinesGroup.Wait() +} + +// getLastIndex returns the last index in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastIndex() uint64 { + r.lastLock.Lock() + defer r.lastLock.Unlock() + return max(r.lastLogIndex, r.lastSnapshotIndex) +} + +// getLastEntry returns the last index and term in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastEntry() (uint64, uint64) { + r.lastLock.Lock() + defer r.lastLock.Unlock() + if r.lastLogIndex >= r.lastSnapshotIndex { + return r.lastLogIndex, r.lastLogTerm + } + return r.lastSnapshotIndex, r.lastSnapshotTerm +} diff --git a/vendor/github.com/hashicorp/raft/tcp_transport.go b/vendor/github.com/hashicorp/raft/tcp_transport.go new file mode 100644 index 000000000..9281508a0 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/tcp_transport.go @@ -0,0 +1,105 @@ +package raft + +import ( + "errors" + "io" + "log" + "net" + "time" +) + +var ( + errNotAdvertisable = errors.New("local bind address is not advertisable") + errNotTCP = errors.New("local address is not a TCP address") +) + +// TCPStreamLayer implements StreamLayer interface for plain TCP. +type TCPStreamLayer struct { + advertise net.Addr + listener *net.TCPListener +} + +// NewTCPTransport returns a NetworkTransport that is built on top of +// a TCP streaming transport layer. +func NewTCPTransport( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransport(stream, maxPool, timeout, logOutput) + }) +} + +// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of +// a TCP streaming transport layer, with log output going to the supplied Logger +func NewTCPTransportWithLogger( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger) + }) +} + +func newTCPTransport(bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) { + // Try to bind + list, err := net.Listen("tcp", bindAddr) + if err != nil { + return nil, err + } + + // Create stream + stream := &TCPStreamLayer{ + advertise: advertise, + listener: list.(*net.TCPListener), + } + + // Verify that we have a usable advertise address + addr, ok := stream.Addr().(*net.TCPAddr) + if !ok { + list.Close() + return nil, errNotTCP + } + if addr.IP.IsUnspecified() { + list.Close() + return nil, errNotAdvertisable + } + + // Create the network transport + trans := transportCreator(stream) + return trans, nil +} + +// Dial implements the StreamLayer interface. +func (t *TCPStreamLayer) Dial(address ServerAddress, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("tcp", string(address), timeout) +} + +// Accept implements the net.Listener interface. +func (t *TCPStreamLayer) Accept() (c net.Conn, err error) { + return t.listener.Accept() +} + +// Close implements the net.Listener interface. +func (t *TCPStreamLayer) Close() (err error) { + return t.listener.Close() +} + +// Addr implements the net.Listener interface. +func (t *TCPStreamLayer) Addr() net.Addr { + // Use an advertise addr if provided + if t.advertise != nil { + return t.advertise + } + return t.listener.Addr() +} diff --git a/vendor/github.com/hashicorp/raft/transport.go b/vendor/github.com/hashicorp/raft/transport.go new file mode 100644 index 000000000..633f97a8c --- /dev/null +++ b/vendor/github.com/hashicorp/raft/transport.go @@ -0,0 +1,124 @@ +package raft + +import ( + "io" + "time" +) + +// RPCResponse captures both a response and a potential error. +type RPCResponse struct { + Response interface{} + Error error +} + +// RPC has a command, and provides a response mechanism. +type RPC struct { + Command interface{} + Reader io.Reader // Set only for InstallSnapshot + RespChan chan<- RPCResponse +} + +// Respond is used to respond with a response, error or both +func (r *RPC) Respond(resp interface{}, err error) { + r.RespChan <- RPCResponse{resp, err} +} + +// Transport provides an interface for network transports +// to allow Raft to communicate with other nodes. +type Transport interface { + // Consumer returns a channel that can be used to + // consume and respond to RPC requests. + Consumer() <-chan RPC + + // LocalAddr is used to return our local address to distinguish from our peers. + LocalAddr() ServerAddress + + // AppendEntriesPipeline returns an interface that can be used to pipeline + // AppendEntries requests. + AppendEntriesPipeline(target ServerAddress) (AppendPipeline, error) + + // AppendEntries sends the appropriate RPC to the target node. + AppendEntries(target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error + + // RequestVote sends the appropriate RPC to the target node. + RequestVote(target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error + + // InstallSnapshot is used to push a snapshot down to a follower. The data is read from + // the ReadCloser and streamed to the client. + InstallSnapshot(target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error + + // EncodePeer is used to serialize a peer's address. + EncodePeer(ServerAddress) []byte + + // DecodePeer is used to deserialize a peer's address. + DecodePeer([]byte) ServerAddress + + // SetHeartbeatHandler is used to setup a heartbeat handler + // as a fast-pass. This is to avoid head-of-line blocking from + // disk IO. If a Transport does not support this, it can simply + // ignore the call, and push the heartbeat onto the Consumer channel. + SetHeartbeatHandler(cb func(rpc RPC)) +} + +// WithClose is an interface that a transport may provide which +// allows a transport to be shut down cleanly when a Raft instance +// shuts down. +// +// It is defined separately from Transport as unfortunately it wasn't in the +// original interface specification. +type WithClose interface { + // Close permanently closes a transport, stopping + // any associated goroutines and freeing other resources. + Close() error +} + +// LoopbackTransport is an interface that provides a loopback transport suitable for testing +// e.g. InmemTransport. It's there so we don't have to rewrite tests. +type LoopbackTransport interface { + Transport // Embedded transport reference + WithPeers // Embedded peer management + WithClose // with a close routine +} + +// WithPeers is an interface that a transport may provide which allows for connection and +// disconnection. Unless the transport is a loopback transport, the transport specified to +// "Connect" is likely to be nil. +type WithPeers interface { + Connect(peer ServerAddress, t Transport) // Connect a peer + Disconnect(peer ServerAddress) // Disconnect a given peer + DisconnectAll() // Disconnect all peers, possibly to reconnect them later +} + +// AppendPipeline is used for pipelining AppendEntries requests. It is used +// to increase the replication throughput by masking latency and better +// utilizing bandwidth. +type AppendPipeline interface { + // AppendEntries is used to add another request to the pipeline. + // The send may block which is an effective form of back-pressure. + AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) + + // Consumer returns a channel that can be used to consume + // response futures when they are ready. + Consumer() <-chan AppendFuture + + // Close closes the pipeline and cancels all inflight RPCs + Close() error +} + +// AppendFuture is used to return information about a pipelined AppendEntries request. +type AppendFuture interface { + Future + + // Start returns the time that the append request was started. + // It is always OK to call this method. + Start() time.Time + + // Request holds the parameters of the AppendEntries call. + // It is always OK to call this method. + Request() *AppendEntriesRequest + + // Response holds the results of the AppendEntries call. + // This method must only be called after the Error + // method returns, and will only be valid on success. + Response() *AppendEntriesResponse +} diff --git a/vendor/github.com/hashicorp/raft/util.go b/vendor/github.com/hashicorp/raft/util.go new file mode 100644 index 000000000..90428d743 --- /dev/null +++ b/vendor/github.com/hashicorp/raft/util.go @@ -0,0 +1,133 @@ +package raft + +import ( + "bytes" + crand "crypto/rand" + "fmt" + "math" + "math/big" + "math/rand" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +func init() { + // Ensure we use a high-entropy seed for the psuedo-random generator + rand.Seed(newSeed()) +} + +// returns an int64 from a crypto random source +// can be used to seed a source for a math/rand. +func newSeed() int64 { + r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + return r.Int64() +} + +// randomTimeout returns a value that is between the minVal and 2x minVal. +func randomTimeout(minVal time.Duration) <-chan time.Time { + if minVal == 0 { + return nil + } + extra := (time.Duration(rand.Int63()) % minVal) + return time.After(minVal + extra) +} + +// min returns the minimum. +func min(a, b uint64) uint64 { + if a <= b { + return a + } + return b +} + +// max returns the maximum. +func max(a, b uint64) uint64 { + if a >= b { + return a + } + return b +} + +// generateUUID is used to generate a random UUID. +func generateUUID() string { + buf := make([]byte, 16) + if _, err := crand.Read(buf); err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + + return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", + buf[0:4], + buf[4:6], + buf[6:8], + buf[8:10], + buf[10:16]) +} + +// asyncNotifyCh is used to do an async channel send +// to a single channel without blocking. +func asyncNotifyCh(ch chan struct{}) { + select { + case ch <- struct{}{}: + default: + } +} + +// drainNotifyCh empties out a single-item notification channel without +// blocking, and returns whether it received anything. +func drainNotifyCh(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +// asyncNotifyBool is used to do an async notification +// on a bool channel. +func asyncNotifyBool(ch chan bool, v bool) { + select { + case ch <- v: + default: + } +} + +// Decode reverses the encode operation on a byte slice input. +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer. +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// backoff is used to compute an exponential backoff +// duration. Base time is scaled by the current round, +// up to some maximum scale factor. +func backoff(base time.Duration, round, limit uint64) time.Duration { + power := min(round, limit) + for power > 2 { + base *= 2 + power-- + } + return base +} + +// Needed for sorting []uint64, used to determine commitment +type uint64Slice []uint64 + +func (p uint64Slice) Len() int { return len(p) } +func (p uint64Slice) Less(i, j int) bool { return p[i] < p[j] } +func (p uint64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } diff --git a/vendor/github.com/hashicorp/serf/serf/broadcast.go b/vendor/github.com/hashicorp/serf/serf/broadcast.go new file mode 100644 index 000000000..d20728f3f --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/broadcast.go @@ -0,0 +1,27 @@ +package serf + +import ( + "github.com/hashicorp/memberlist" +) + +// broadcast is an implementation of memberlist.Broadcast and is used +// to manage broadcasts across the memberlist channel that are related +// only to Serf. +type broadcast struct { + msg []byte + notify chan<- struct{} +} + +func (b *broadcast) Invalidates(other memberlist.Broadcast) bool { + return false +} + +func (b *broadcast) Message() []byte { + return b.msg +} + +func (b *broadcast) Finished() { + if b.notify != nil { + close(b.notify) + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/coalesce.go b/vendor/github.com/hashicorp/serf/serf/coalesce.go new file mode 100644 index 000000000..567943be1 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/coalesce.go @@ -0,0 +1,80 @@ +package serf + +import ( + "time" +) + +// coalescer is a simple interface that must be implemented to be +// used inside of a coalesceLoop +type coalescer interface { + // Can the coalescer handle this event, if not it is + // directly passed through to the destination channel + Handle(Event) bool + + // Invoked to coalesce the given event + Coalesce(Event) + + // Invoked to flush the coalesced events + Flush(outChan chan<- Event) +} + +// coalescedEventCh returns an event channel where the events are coalesced +// using the given coalescer. +func coalescedEventCh(outCh chan<- Event, shutdownCh <-chan struct{}, + cPeriod time.Duration, qPeriod time.Duration, c coalescer) chan<- Event { + inCh := make(chan Event, 1024) + go coalesceLoop(inCh, outCh, shutdownCh, cPeriod, qPeriod, c) + return inCh +} + +// coalesceLoop is a simple long-running routine that manages the high-level +// flow of coalescing based on quiescence and a maximum quantum period. +func coalesceLoop(inCh <-chan Event, outCh chan<- Event, shutdownCh <-chan struct{}, + coalescePeriod time.Duration, quiescentPeriod time.Duration, c coalescer) { + var quiescent <-chan time.Time + var quantum <-chan time.Time + shutdown := false + +INGEST: + // Reset the timers + quantum = nil + quiescent = nil + + for { + select { + case e := <-inCh: + // Ignore any non handled events + if !c.Handle(e) { + outCh <- e + continue + } + + // Start a new quantum if we need to + // and restart the quiescent timer + if quantum == nil { + quantum = time.After(coalescePeriod) + } + quiescent = time.After(quiescentPeriod) + + // Coalesce the event + c.Coalesce(e) + + case <-quantum: + goto FLUSH + case <-quiescent: + goto FLUSH + case <-shutdownCh: + shutdown = true + goto FLUSH + } + } + +FLUSH: + // Flush the coalesced events + c.Flush(outCh) + + // Restart ingestion if we are not done + if !shutdown { + goto INGEST + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/coalesce_member.go b/vendor/github.com/hashicorp/serf/serf/coalesce_member.go new file mode 100644 index 000000000..82fdb8dac --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/coalesce_member.go @@ -0,0 +1,68 @@ +package serf + +type coalesceEvent struct { + Type EventType + Member *Member +} + +type memberEventCoalescer struct { + lastEvents map[string]EventType + latestEvents map[string]coalesceEvent +} + +func (c *memberEventCoalescer) Handle(e Event) bool { + switch e.EventType() { + case EventMemberJoin: + return true + case EventMemberLeave: + return true + case EventMemberFailed: + return true + case EventMemberUpdate: + return true + case EventMemberReap: + return true + default: + return false + } +} + +func (c *memberEventCoalescer) Coalesce(raw Event) { + e := raw.(MemberEvent) + for _, m := range e.Members { + c.latestEvents[m.Name] = coalesceEvent{ + Type: e.Type, + Member: &m, + } + } +} + +func (c *memberEventCoalescer) Flush(outCh chan<- Event) { + // Coalesce the various events we got into a single set of events. + events := make(map[EventType]*MemberEvent) + for name, cevent := range c.latestEvents { + previous, ok := c.lastEvents[name] + + // If we sent the same event before, then ignore + // unless it is a MemberUpdate + if ok && previous == cevent.Type && cevent.Type != EventMemberUpdate { + continue + } + + // Update our last event + c.lastEvents[name] = cevent.Type + + // Add it to our event + newEvent, ok := events[cevent.Type] + if !ok { + newEvent = &MemberEvent{Type: cevent.Type} + events[cevent.Type] = newEvent + } + newEvent.Members = append(newEvent.Members, *cevent.Member) + } + + // Send out those events + for _, event := range events { + outCh <- *event + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/coalesce_user.go b/vendor/github.com/hashicorp/serf/serf/coalesce_user.go new file mode 100644 index 000000000..1551b6c52 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/coalesce_user.go @@ -0,0 +1,52 @@ +package serf + +type latestUserEvents struct { + LTime LamportTime + Events []Event +} + +type userEventCoalescer struct { + // Maps an event name into the latest versions + events map[string]*latestUserEvents +} + +func (c *userEventCoalescer) Handle(e Event) bool { + // Only handle EventUser messages + if e.EventType() != EventUser { + return false + } + + // Check if coalescing is enabled + user := e.(UserEvent) + return user.Coalesce +} + +func (c *userEventCoalescer) Coalesce(e Event) { + user := e.(UserEvent) + latest, ok := c.events[user.Name] + + // Create a new entry if there are none, or + // if this message has the newest LTime + if !ok || latest.LTime < user.LTime { + latest = &latestUserEvents{ + LTime: user.LTime, + Events: []Event{e}, + } + c.events[user.Name] = latest + return + } + + // If the the same age, save it + if latest.LTime == user.LTime { + latest.Events = append(latest.Events, e) + } +} + +func (c *userEventCoalescer) Flush(outChan chan<- Event) { + for _, latest := range c.events { + for _, e := range latest.Events { + outChan <- e + } + } + c.events = make(map[string]*latestUserEvents) +} diff --git a/vendor/github.com/hashicorp/serf/serf/config.go b/vendor/github.com/hashicorp/serf/serf/config.go new file mode 100644 index 000000000..74f21ffbd --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/config.go @@ -0,0 +1,267 @@ +package serf + +import ( + "io" + "log" + "os" + "time" + + "github.com/hashicorp/memberlist" +) + +// ProtocolVersionMap is the mapping of Serf delegate protocol versions +// to memberlist protocol versions. We mask the memberlist protocols using +// our own protocol version. +var ProtocolVersionMap map[uint8]uint8 + +func init() { + ProtocolVersionMap = map[uint8]uint8{ + 5: 2, + 4: 2, + 3: 2, + 2: 2, + } +} + +// Config is the configuration for creating a Serf instance. +type Config struct { + // The name of this node. This must be unique in the cluster. If this + // is not set, Serf will set it to the hostname of the running machine. + NodeName string + + // The tags for this role, if any. This is used to provide arbitrary + // key/value metadata per-node. For example, a "role" tag may be used to + // differentiate "load-balancer" from a "web" role as parts of the same cluster. + // Tags are deprecating 'Role', and instead it acts as a special key in this + // map. + Tags map[string]string + + // EventCh is a channel that receives all the Serf events. The events + // are sent on this channel in proper ordering. Care must be taken that + // this channel doesn't block, either by processing the events quick + // enough or buffering the channel, otherwise it can block state updates + // within Serf itself. If no EventCh is specified, no events will be fired, + // but point-in-time snapshots of members can still be retrieved by + // calling Members on Serf. + EventCh chan<- Event + + // ProtocolVersion is the protocol version to speak. This must be between + // ProtocolVersionMin and ProtocolVersionMax. + ProtocolVersion uint8 + + // BroadcastTimeout is the amount of time to wait for a broadcast + // message to be sent to the cluster. Broadcast messages are used for + // things like leave messages and force remove messages. If this is not + // set, a timeout of 5 seconds will be set. + BroadcastTimeout time.Duration + + // The settings below relate to Serf's event coalescence feature. Serf + // is able to coalesce multiple events into single events in order to + // reduce the amount of noise that is sent along the EventCh. For example + // if five nodes quickly join, the EventCh will be sent one EventMemberJoin + // containing the five nodes rather than five individual EventMemberJoin + // events. Coalescence can mitigate potential flapping behavior. + // + // Coalescence is disabled by default and can be enabled by setting + // CoalescePeriod. + // + // CoalescePeriod specifies the time duration to coalesce events. + // For example, if this is set to 5 seconds, then all events received + // within 5 seconds that can be coalesced will be. + // + // QuiescentPeriod specifies the duration of time where if no events + // are received, coalescence immediately happens. For example, if + // CoalscePeriod is set to 10 seconds but QuiscentPeriod is set to 2 + // seconds, then the events will be coalesced and dispatched if no + // new events are received within 2 seconds of the last event. Otherwise, + // every event will always be delayed by at least 10 seconds. + CoalescePeriod time.Duration + QuiescentPeriod time.Duration + + // The settings below relate to Serf's user event coalescing feature. + // The settings operate like above but only affect user messages and + // not the Member* messages that Serf generates. + UserCoalescePeriod time.Duration + UserQuiescentPeriod time.Duration + + // The settings below relate to Serf keeping track of recently + // failed/left nodes and attempting reconnects. + // + // ReapInterval is the interval when the reaper runs. If this is not + // set (it is zero), it will be set to a reasonable default. + // + // ReconnectInterval is the interval when we attempt to reconnect + // to failed nodes. If this is not set (it is zero), it will be set + // to a reasonable default. + // + // ReconnectTimeout is the amount of time to attempt to reconnect to + // a failed node before giving up and considering it completely gone. + // + // TombstoneTimeout is the amount of time to keep around nodes + // that gracefully left as tombstones for syncing state with other + // Serf nodes. + ReapInterval time.Duration + ReconnectInterval time.Duration + ReconnectTimeout time.Duration + TombstoneTimeout time.Duration + + // FlapTimeout is the amount of time less than which we consider a node + // being failed and rejoining looks like a flap for telemetry purposes. + // This should be set less than a typical reboot time, but large enough + // to see actual events, given our expected detection times for a failed + // node. + FlapTimeout time.Duration + + // QueueDepthWarning is used to generate warning message if the + // number of queued messages to broadcast exceeds this number. This + // is to provide the user feedback if events are being triggered + // faster than they can be disseminated + QueueDepthWarning int + + // MaxQueueDepth is used to start dropping messages if the number + // of queued messages to broadcast exceeds this number. This is to + // prevent an unbounded growth of memory utilization + MaxQueueDepth int + + // RecentIntentTimeout is used to determine how long we store recent + // join and leave intents. This is used to guard against the case where + // Serf broadcasts an intent that arrives before the Memberlist event. + // It is important that this not be too short to avoid continuous + // rebroadcasting of dead events. + RecentIntentTimeout time.Duration + + // EventBuffer is used to control how many events are buffered. + // This is used to prevent re-delivery of events to a client. The buffer + // must be large enough to handle all "recent" events, since Serf will + // not deliver messages that are older than the oldest entry in the buffer. + // Thus if a client is generating too many events, it's possible that the + // buffer gets overrun and messages are not delivered. + EventBuffer int + + // QueryBuffer is used to control how many queries are buffered. + // This is used to prevent re-delivery of queries to a client. The buffer + // must be large enough to handle all "recent" events, since Serf will not + // deliver queries older than the oldest entry in the buffer. + // Thus if a client is generating too many queries, it's possible that the + // buffer gets overrun and messages are not delivered. + QueryBuffer int + + // QueryTimeoutMult configures the default timeout multipler for a query to run if no + // specific value is provided. Queries are real-time by nature, where the + // reply is time sensitive. As a result, results are collected in an async + // fashion, however the query must have a bounded duration. We want the timeout + // to be long enough that all nodes have time to receive the message, run a handler, + // and generate a reply. Once the timeout is exceeded, any further replies are ignored. + // The default value is + // + // Timeout = GossipInterval * QueryTimeoutMult * log(N+1) + // + QueryTimeoutMult int + + // QueryResponseSizeLimit and QuerySizeLimit limit the inbound and + // outbound payload sizes for queries, respectively. These must fit + // in a UDP packet with some additional overhead, so tuning these + // past the default values of 1024 will depend on your network + // configuration. + QueryResponseSizeLimit int + QuerySizeLimit int + + // MemberlistConfig is the memberlist configuration that Serf will + // use to do the underlying membership management and gossip. Some + // fields in the MemberlistConfig will be overwritten by Serf no + // matter what: + // + // * Name - This will always be set to the same as the NodeName + // in this configuration. + // + // * Events - Serf uses a custom event delegate. + // + // * Delegate - Serf uses a custom delegate. + // + MemberlistConfig *memberlist.Config + + // LogOutput is the location to write logs to. If this is not set, + // logs will go to stderr. + LogOutput io.Writer + + // Logger is a custom logger which you provide. If Logger is set, it will use + // this for the internal logger. If Logger is not set, it will fall back to the + // behavior for using LogOutput. You cannot specify both LogOutput and Logger + // at the same time. + Logger *log.Logger + + // SnapshotPath if provided is used to snapshot live nodes as well + // as lamport clock values. When Serf is started with a snapshot, + // it will attempt to join all the previously known nodes until one + // succeeds and will also avoid replaying old user events. + SnapshotPath string + + // RejoinAfterLeave controls our interaction with the snapshot file. + // When set to false (default), a leave causes a Serf to not rejoin + // the cluster until an explicit join is received. If this is set to + // true, we ignore the leave, and rejoin the cluster on start. + RejoinAfterLeave bool + + // EnableNameConflictResolution controls if Serf will actively attempt + // to resolve a name conflict. Since each Serf member must have a unique + // name, a cluster can run into issues if multiple nodes claim the same + // name. Without automatic resolution, Serf merely logs some warnings, but + // otherwise does not take any action. Automatic resolution detects the + // conflict and issues a special query which asks the cluster for the + // Name -> IP:Port mapping. If there is a simple majority of votes, that + // node stays while the other node will leave the cluster and exit. + EnableNameConflictResolution bool + + // DisableCoordinates controls if Serf will maintain an estimate of this + // node's network coordinate internally. A network coordinate is useful + // for estimating the network distance (i.e. round trip time) between + // two nodes. Enabling this option adds some overhead to ping messages. + DisableCoordinates bool + + // KeyringFile provides the location of a writable file where Serf can + // persist changes to the encryption keyring. + KeyringFile string + + // Merge can be optionally provided to intercept a cluster merge + // and conditionally abort the merge. + Merge MergeDelegate +} + +// Init allocates the subdata structures +func (c *Config) Init() { + if c.Tags == nil { + c.Tags = make(map[string]string) + } +} + +// DefaultConfig returns a Config struct that contains reasonable defaults +// for most of the configurations. +func DefaultConfig() *Config { + hostname, err := os.Hostname() + if err != nil { + panic(err) + } + + return &Config{ + NodeName: hostname, + BroadcastTimeout: 5 * time.Second, + EventBuffer: 512, + QueryBuffer: 512, + LogOutput: os.Stderr, + ProtocolVersion: 4, + ReapInterval: 15 * time.Second, + RecentIntentTimeout: 5 * time.Minute, + ReconnectInterval: 30 * time.Second, + ReconnectTimeout: 24 * time.Hour, + QueueDepthWarning: 128, + MaxQueueDepth: 4096, + TombstoneTimeout: 24 * time.Hour, + FlapTimeout: 60 * time.Second, + MemberlistConfig: memberlist.DefaultLANConfig(), + QueryTimeoutMult: 16, + QueryResponseSizeLimit: 1024, + QuerySizeLimit: 1024, + EnableNameConflictResolution: true, + DisableCoordinates: false, + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/conflict_delegate.go b/vendor/github.com/hashicorp/serf/serf/conflict_delegate.go new file mode 100644 index 000000000..65a50156c --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/conflict_delegate.go @@ -0,0 +1,13 @@ +package serf + +import ( + "github.com/hashicorp/memberlist" +) + +type conflictDelegate struct { + serf *Serf +} + +func (c *conflictDelegate) NotifyConflict(existing, other *memberlist.Node) { + c.serf.handleNodeConflict(existing, other) +} diff --git a/vendor/github.com/hashicorp/serf/serf/delegate.go b/vendor/github.com/hashicorp/serf/serf/delegate.go new file mode 100644 index 000000000..8f51cb7d0 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/delegate.go @@ -0,0 +1,275 @@ +package serf + +import ( + "bytes" + "fmt" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-msgpack/codec" +) + +// delegate is the memberlist.Delegate implementation that Serf uses. +type delegate struct { + serf *Serf +} + +func (d *delegate) NodeMeta(limit int) []byte { + roleBytes := d.serf.encodeTags(d.serf.config.Tags) + if len(roleBytes) > limit { + panic(fmt.Errorf("Node tags '%v' exceeds length limit of %d bytes", d.serf.config.Tags, limit)) + } + + return roleBytes +} + +func (d *delegate) NotifyMsg(buf []byte) { + // If we didn't actually receive any data, then ignore it. + if len(buf) == 0 { + return + } + metrics.AddSample([]string{"serf", "msgs", "received"}, float32(len(buf))) + + rebroadcast := false + rebroadcastQueue := d.serf.broadcasts + t := messageType(buf[0]) + switch t { + case messageLeaveType: + var leave messageLeave + if err := decodeMessage(buf[1:], &leave); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding leave message: %s", err) + break + } + + d.serf.logger.Printf("[DEBUG] serf: messageLeaveType: %s", leave.Node) + rebroadcast = d.serf.handleNodeLeaveIntent(&leave) + + case messageJoinType: + var join messageJoin + if err := decodeMessage(buf[1:], &join); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding join message: %s", err) + break + } + + d.serf.logger.Printf("[DEBUG] serf: messageJoinType: %s", join.Node) + rebroadcast = d.serf.handleNodeJoinIntent(&join) + + case messageUserEventType: + var event messageUserEvent + if err := decodeMessage(buf[1:], &event); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding user event message: %s", err) + break + } + + d.serf.logger.Printf("[DEBUG] serf: messageUserEventType: %s", event.Name) + rebroadcast = d.serf.handleUserEvent(&event) + rebroadcastQueue = d.serf.eventBroadcasts + + case messageQueryType: + var query messageQuery + if err := decodeMessage(buf[1:], &query); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding query message: %s", err) + break + } + + d.serf.logger.Printf("[DEBUG] serf: messageQueryType: %s", query.Name) + rebroadcast = d.serf.handleQuery(&query) + rebroadcastQueue = d.serf.queryBroadcasts + + case messageQueryResponseType: + var resp messageQueryResponse + if err := decodeMessage(buf[1:], &resp); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding query response message: %s", err) + break + } + + d.serf.logger.Printf("[DEBUG] serf: messageQueryResponseType: %v", resp.From) + d.serf.handleQueryResponse(&resp) + + case messageRelayType: + var header relayHeader + var handle codec.MsgpackHandle + reader := bytes.NewReader(buf[1:]) + decoder := codec.NewDecoder(reader, &handle) + if err := decoder.Decode(&header); err != nil { + d.serf.logger.Printf("[ERR] serf: Error decoding relay header: %s", err) + break + } + + // The remaining contents are the message itself, so forward that + raw := make([]byte, reader.Len()) + reader.Read(raw) + d.serf.logger.Printf("[DEBUG] serf: Relaying response to addr: %s", header.DestAddr.String()) + if err := d.serf.memberlist.SendTo(&header.DestAddr, raw); err != nil { + d.serf.logger.Printf("[ERR] serf: Error forwarding message to %s: %s", header.DestAddr.String(), err) + break + } + + default: + d.serf.logger.Printf("[WARN] serf: Received message of unknown type: %d", t) + } + + if rebroadcast { + // Copy the buffer since it we cannot rely on the slice not changing + newBuf := make([]byte, len(buf)) + copy(newBuf, buf) + + rebroadcastQueue.QueueBroadcast(&broadcast{ + msg: newBuf, + notify: nil, + }) + } +} + +func (d *delegate) GetBroadcasts(overhead, limit int) [][]byte { + msgs := d.serf.broadcasts.GetBroadcasts(overhead, limit) + + // Determine the bytes used already + bytesUsed := 0 + for _, msg := range msgs { + lm := len(msg) + bytesUsed += lm + overhead + metrics.AddSample([]string{"serf", "msgs", "sent"}, float32(lm)) + } + + // Get any additional query broadcasts + queryMsgs := d.serf.queryBroadcasts.GetBroadcasts(overhead, limit-bytesUsed) + if queryMsgs != nil { + for _, m := range queryMsgs { + lm := len(m) + bytesUsed += lm + overhead + metrics.AddSample([]string{"serf", "msgs", "sent"}, float32(lm)) + } + msgs = append(msgs, queryMsgs...) + } + + // Get any additional event broadcasts + eventMsgs := d.serf.eventBroadcasts.GetBroadcasts(overhead, limit-bytesUsed) + if eventMsgs != nil { + for _, m := range eventMsgs { + lm := len(m) + bytesUsed += lm + overhead + metrics.AddSample([]string{"serf", "msgs", "sent"}, float32(lm)) + } + msgs = append(msgs, eventMsgs...) + } + + return msgs +} + +func (d *delegate) LocalState(join bool) []byte { + d.serf.memberLock.RLock() + defer d.serf.memberLock.RUnlock() + d.serf.eventLock.RLock() + defer d.serf.eventLock.RUnlock() + + // Create the message to send + pp := messagePushPull{ + LTime: d.serf.clock.Time(), + StatusLTimes: make(map[string]LamportTime, len(d.serf.members)), + LeftMembers: make([]string, 0, len(d.serf.leftMembers)), + EventLTime: d.serf.eventClock.Time(), + Events: d.serf.eventBuffer, + QueryLTime: d.serf.queryClock.Time(), + } + + // Add all the join LTimes + for name, member := range d.serf.members { + pp.StatusLTimes[name] = member.statusLTime + } + + // Add all the left nodes + for _, member := range d.serf.leftMembers { + pp.LeftMembers = append(pp.LeftMembers, member.Name) + } + + // Encode the push pull state + buf, err := encodeMessage(messagePushPullType, &pp) + if err != nil { + d.serf.logger.Printf("[ERR] serf: Failed to encode local state: %v", err) + return nil + } + return buf +} + +func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) { + // Ensure we have a message + if len(buf) == 0 { + d.serf.logger.Printf("[ERR] serf: Remote state is zero bytes") + return + } + + // Check the message type + if messageType(buf[0]) != messagePushPullType { + d.serf.logger.Printf("[ERR] serf: Remote state has bad type prefix: %v", buf[0]) + return + } + + // Attempt a decode + pp := messagePushPull{} + if err := decodeMessage(buf[1:], &pp); err != nil { + d.serf.logger.Printf("[ERR] serf: Failed to decode remote state: %v", err) + return + } + + // Witness the Lamport clocks first. + // We subtract 1 since no message with that clock has been sent yet + if pp.LTime > 0 { + d.serf.clock.Witness(pp.LTime - 1) + } + if pp.EventLTime > 0 { + d.serf.eventClock.Witness(pp.EventLTime - 1) + } + if pp.QueryLTime > 0 { + d.serf.queryClock.Witness(pp.QueryLTime - 1) + } + + // Process the left nodes first to avoid the LTimes from being increment + // in the wrong order + leftMap := make(map[string]struct{}, len(pp.LeftMembers)) + leave := messageLeave{} + for _, name := range pp.LeftMembers { + leftMap[name] = struct{}{} + leave.LTime = pp.StatusLTimes[name] + leave.Node = name + d.serf.handleNodeLeaveIntent(&leave) + } + + // Update any other LTimes + join := messageJoin{} + for name, statusLTime := range pp.StatusLTimes { + // Skip the left nodes + if _, ok := leftMap[name]; ok { + continue + } + + // Create an artificial join message + join.LTime = statusLTime + join.Node = name + d.serf.handleNodeJoinIntent(&join) + } + + // If we are doing a join, and eventJoinIgnore is set + // then we set the eventMinTime to the EventLTime. This + // prevents any of the incoming events from being processed + if isJoin && d.serf.eventJoinIgnore { + d.serf.eventLock.Lock() + if pp.EventLTime > d.serf.eventMinTime { + d.serf.eventMinTime = pp.EventLTime + } + d.serf.eventLock.Unlock() + } + + // Process all the events + userEvent := messageUserEvent{} + for _, events := range pp.Events { + if events == nil { + continue + } + userEvent.LTime = events.LTime + for _, e := range events.Events { + userEvent.Name = e.Name + userEvent.Payload = e.Payload + d.serf.handleUserEvent(&userEvent) + } + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/event.go b/vendor/github.com/hashicorp/serf/serf/event.go new file mode 100644 index 000000000..29211393f --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/event.go @@ -0,0 +1,174 @@ +package serf + +import ( + "fmt" + "net" + "sync" + "time" +) + +// EventType are all the types of events that may occur and be sent +// along the Serf channel. +type EventType int + +const ( + EventMemberJoin EventType = iota + EventMemberLeave + EventMemberFailed + EventMemberUpdate + EventMemberReap + EventUser + EventQuery +) + +func (t EventType) String() string { + switch t { + case EventMemberJoin: + return "member-join" + case EventMemberLeave: + return "member-leave" + case EventMemberFailed: + return "member-failed" + case EventMemberUpdate: + return "member-update" + case EventMemberReap: + return "member-reap" + case EventUser: + return "user" + case EventQuery: + return "query" + default: + panic(fmt.Sprintf("unknown event type: %d", t)) + } +} + +// Event is a generic interface for exposing Serf events +// Clients will usually need to use a type switches to get +// to a more useful type +type Event interface { + EventType() EventType + String() string +} + +// MemberEvent is the struct used for member related events +// Because Serf coalesces events, an event may contain multiple members. +type MemberEvent struct { + Type EventType + Members []Member +} + +func (m MemberEvent) EventType() EventType { + return m.Type +} + +func (m MemberEvent) String() string { + switch m.Type { + case EventMemberJoin: + return "member-join" + case EventMemberLeave: + return "member-leave" + case EventMemberFailed: + return "member-failed" + case EventMemberUpdate: + return "member-update" + case EventMemberReap: + return "member-reap" + default: + panic(fmt.Sprintf("unknown event type: %d", m.Type)) + } +} + +// UserEvent is the struct used for events that are triggered +// by the user and are not related to members +type UserEvent struct { + LTime LamportTime + Name string + Payload []byte + Coalesce bool +} + +func (u UserEvent) EventType() EventType { + return EventUser +} + +func (u UserEvent) String() string { + return fmt.Sprintf("user-event: %s", u.Name) +} + +// Query is the struct used by EventQuery type events +type Query struct { + LTime LamportTime + Name string + Payload []byte + + serf *Serf + id uint32 // ID is not exported, since it may change + addr []byte // Address to respond to + port uint16 // Port to respond to + deadline time.Time // Must respond by this deadline + relayFactor uint8 // Number of duplicate responses to relay back to sender + respLock sync.Mutex +} + +func (q *Query) EventType() EventType { + return EventQuery +} + +func (q *Query) String() string { + return fmt.Sprintf("query: %s", q.Name) +} + +// Deadline returns the time by which a response must be sent +func (q *Query) Deadline() time.Time { + return q.deadline +} + +// Respond is used to send a response to the user query +func (q *Query) Respond(buf []byte) error { + q.respLock.Lock() + defer q.respLock.Unlock() + + // Check if we've already responded + if q.deadline.IsZero() { + return fmt.Errorf("response already sent") + } + + // Ensure we aren't past our response deadline + if time.Now().After(q.deadline) { + return fmt.Errorf("response is past the deadline") + } + + // Create response + resp := messageQueryResponse{ + LTime: q.LTime, + ID: q.id, + From: q.serf.config.NodeName, + Payload: buf, + } + + // Send a direct response + raw, err := encodeMessage(messageQueryResponseType, &resp) + if err != nil { + return fmt.Errorf("failed to format response: %v", err) + } + + // Check the size limit + if len(raw) > q.serf.config.QueryResponseSizeLimit { + return fmt.Errorf("response exceeds limit of %d bytes", q.serf.config.QueryResponseSizeLimit) + } + + // Send the response directly to the originator + addr := net.UDPAddr{IP: q.addr, Port: int(q.port)} + if err := q.serf.memberlist.SendTo(&addr, raw); err != nil { + return err + } + + // Relay the response through up to relayFactor other nodes + if err := q.serf.relayResponse(q.relayFactor, addr, &resp); err != nil { + return err + } + + // Clear the deadline, responses sent + q.deadline = time.Time{} + return nil +} diff --git a/vendor/github.com/hashicorp/serf/serf/event_delegate.go b/vendor/github.com/hashicorp/serf/serf/event_delegate.go new file mode 100644 index 000000000..e20132281 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/event_delegate.go @@ -0,0 +1,21 @@ +package serf + +import ( + "github.com/hashicorp/memberlist" +) + +type eventDelegate struct { + serf *Serf +} + +func (e *eventDelegate) NotifyJoin(n *memberlist.Node) { + e.serf.handleNodeJoin(n) +} + +func (e *eventDelegate) NotifyLeave(n *memberlist.Node) { + e.serf.handleNodeLeave(n) +} + +func (e *eventDelegate) NotifyUpdate(n *memberlist.Node) { + e.serf.handleNodeUpdate(n) +} diff --git a/vendor/github.com/hashicorp/serf/serf/internal_query.go b/vendor/github.com/hashicorp/serf/serf/internal_query.go new file mode 100644 index 000000000..128b2cf21 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/internal_query.go @@ -0,0 +1,312 @@ +package serf + +import ( + "encoding/base64" + "log" + "strings" +) + +const ( + // This is the prefix we use for queries that are internal to Serf. + // They are handled internally, and not forwarded to a client. + InternalQueryPrefix = "_serf_" + + // pingQuery is run to check for reachability + pingQuery = "ping" + + // conflictQuery is run to resolve a name conflict + conflictQuery = "conflict" + + // installKeyQuery is used to install a new key + installKeyQuery = "install-key" + + // useKeyQuery is used to change the primary encryption key + useKeyQuery = "use-key" + + // removeKeyQuery is used to remove a key from the keyring + removeKeyQuery = "remove-key" + + // listKeysQuery is used to list all known keys in the cluster + listKeysQuery = "list-keys" +) + +// internalQueryName is used to generate a query name for an internal query +func internalQueryName(name string) string { + return InternalQueryPrefix + name +} + +// serfQueries is used to listen for queries that start with +// _serf and respond to them as appropriate. +type serfQueries struct { + inCh chan Event + logger *log.Logger + outCh chan<- Event + serf *Serf + shutdownCh <-chan struct{} +} + +// nodeKeyResponse is used to store the result from an individual node while +// replying to key modification queries +type nodeKeyResponse struct { + // Result indicates true/false if there were errors or not + Result bool + + // Message contains error messages or other information + Message string + + // Keys is used in listing queries to relay a list of installed keys + Keys []string +} + +// newSerfQueries is used to create a new serfQueries. We return an event +// channel that is ingested and forwarded to an outCh. Any Queries that +// have the InternalQueryPrefix are handled instead of forwarded. +func newSerfQueries(serf *Serf, logger *log.Logger, outCh chan<- Event, shutdownCh <-chan struct{}) (chan<- Event, error) { + inCh := make(chan Event, 1024) + q := &serfQueries{ + inCh: inCh, + logger: logger, + outCh: outCh, + serf: serf, + shutdownCh: shutdownCh, + } + go q.stream() + return inCh, nil +} + +// stream is a long running routine to ingest the event stream +func (s *serfQueries) stream() { + for { + select { + case e := <-s.inCh: + // Check if this is a query we should process + if q, ok := e.(*Query); ok && strings.HasPrefix(q.Name, InternalQueryPrefix) { + go s.handleQuery(q) + + } else if s.outCh != nil { + s.outCh <- e + } + + case <-s.shutdownCh: + return + } + } +} + +// handleQuery is invoked when we get an internal query +func (s *serfQueries) handleQuery(q *Query) { + // Get the queryName after the initial prefix + queryName := q.Name[len(InternalQueryPrefix):] + switch queryName { + case pingQuery: + // Nothing to do, we will ack the query + case conflictQuery: + s.handleConflict(q) + case installKeyQuery: + s.handleInstallKey(q) + case useKeyQuery: + s.handleUseKey(q) + case removeKeyQuery: + s.handleRemoveKey(q) + case listKeysQuery: + s.handleListKeys(q) + default: + s.logger.Printf("[WARN] serf: Unhandled internal query '%s'", queryName) + } +} + +// handleConflict is invoked when we get a query that is attempting to +// disambiguate a name conflict. They payload is a node name, and the response +// should the address we believe that node is at, if any. +func (s *serfQueries) handleConflict(q *Query) { + // The target node name is the payload + node := string(q.Payload) + + // Do not respond to the query if it is about us + if node == s.serf.config.NodeName { + return + } + s.logger.Printf("[DEBUG] serf: Got conflict resolution query for '%s'", node) + + // Look for the member info + var out *Member + s.serf.memberLock.Lock() + if member, ok := s.serf.members[node]; ok { + out = &member.Member + } + s.serf.memberLock.Unlock() + + // Encode the response + buf, err := encodeMessage(messageConflictResponseType, out) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to encode conflict query response: %v", err) + return + } + + // Send our answer + if err := q.Respond(buf); err != nil { + s.logger.Printf("[ERR] serf: Failed to respond to conflict query: %v", err) + } +} + +// sendKeyResponse handles responding to key-related queries. +func (s *serfQueries) sendKeyResponse(q *Query, resp *nodeKeyResponse) { + buf, err := encodeMessage(messageKeyResponseType, resp) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to encode key response: %v", err) + return + } + + if err := q.Respond(buf); err != nil { + s.logger.Printf("[ERR] serf: Failed to respond to key query: %v", err) + return + } +} + +// handleInstallKey is invoked whenever a new encryption key is received from +// another member in the cluster, and handles the process of installing it onto +// the memberlist keyring. This type of query may fail if the provided key does +// not fit the constraints that memberlist enforces. If the query fails, the +// response will contain the error message so that it may be relayed. +func (s *serfQueries) handleInstallKey(q *Query) { + response := nodeKeyResponse{Result: false} + keyring := s.serf.config.MemberlistConfig.Keyring + req := keyRequest{} + + err := decodeMessage(q.Payload[1:], &req) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to decode key request: %v", err) + goto SEND + } + + if !s.serf.EncryptionEnabled() { + response.Message = "No keyring to modify (encryption not enabled)" + s.logger.Printf("[ERR] serf: No keyring to modify (encryption not enabled)") + goto SEND + } + + s.logger.Printf("[INFO] serf: Received install-key query") + if err := keyring.AddKey(req.Key); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to install key: %s", err) + goto SEND + } + + if err := s.serf.writeKeyringFile(); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to write keyring file: %s", err) + goto SEND + } + + response.Result = true + +SEND: + s.sendKeyResponse(q, &response) +} + +// handleUseKey is invoked whenever a query is received to mark a different key +// in the internal keyring as the primary key. This type of query may fail due +// to operator error (requested key not in ring), and thus sends error messages +// back in the response. +func (s *serfQueries) handleUseKey(q *Query) { + response := nodeKeyResponse{Result: false} + keyring := s.serf.config.MemberlistConfig.Keyring + req := keyRequest{} + + err := decodeMessage(q.Payload[1:], &req) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to decode key request: %v", err) + goto SEND + } + + if !s.serf.EncryptionEnabled() { + response.Message = "No keyring to modify (encryption not enabled)" + s.logger.Printf("[ERR] serf: No keyring to modify (encryption not enabled)") + goto SEND + } + + s.logger.Printf("[INFO] serf: Received use-key query") + if err := keyring.UseKey(req.Key); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to change primary key: %s", err) + goto SEND + } + + if err := s.serf.writeKeyringFile(); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to write keyring file: %s", err) + goto SEND + } + + response.Result = true + +SEND: + s.sendKeyResponse(q, &response) +} + +// handleRemoveKey is invoked when a query is received to remove a particular +// key from the keyring. This type of query can fail if the key requested for +// deletion is currently the primary key in the keyring, so therefore it will +// reply to the query with any relevant errors from the operation. +func (s *serfQueries) handleRemoveKey(q *Query) { + response := nodeKeyResponse{Result: false} + keyring := s.serf.config.MemberlistConfig.Keyring + req := keyRequest{} + + err := decodeMessage(q.Payload[1:], &req) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to decode key request: %v", err) + goto SEND + } + + if !s.serf.EncryptionEnabled() { + response.Message = "No keyring to modify (encryption not enabled)" + s.logger.Printf("[ERR] serf: No keyring to modify (encryption not enabled)") + goto SEND + } + + s.logger.Printf("[INFO] serf: Received remove-key query") + if err := keyring.RemoveKey(req.Key); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to remove key: %s", err) + goto SEND + } + + if err := s.serf.writeKeyringFile(); err != nil { + response.Message = err.Error() + s.logger.Printf("[ERR] serf: Failed to write keyring file: %s", err) + goto SEND + } + + response.Result = true + +SEND: + s.sendKeyResponse(q, &response) +} + +// handleListKeys is invoked when a query is received to return a list of all +// installed keys the Serf instance knows of. For performance, the keys are +// encoded to base64 on each of the members to remove this burden from the +// node asking for the results. +func (s *serfQueries) handleListKeys(q *Query) { + response := nodeKeyResponse{Result: false} + keyring := s.serf.config.MemberlistConfig.Keyring + + if !s.serf.EncryptionEnabled() { + response.Message = "Keyring is empty (encryption not enabled)" + s.logger.Printf("[ERR] serf: Keyring is empty (encryption not enabled)") + goto SEND + } + + s.logger.Printf("[INFO] serf: Received list-keys query") + for _, keyBytes := range keyring.GetKeys() { + // Encode the keys before sending the response. This should help take + // some the burden of doing this off of the asking member. + key := base64.StdEncoding.EncodeToString(keyBytes) + response.Keys = append(response.Keys, key) + } + response.Result = true + +SEND: + s.sendKeyResponse(q, &response) +} diff --git a/vendor/github.com/hashicorp/serf/serf/keymanager.go b/vendor/github.com/hashicorp/serf/serf/keymanager.go new file mode 100644 index 000000000..fd53182fc --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/keymanager.go @@ -0,0 +1,192 @@ +package serf + +import ( + "encoding/base64" + "fmt" + "sync" +) + +// KeyManager encapsulates all functionality within Serf for handling +// encryption keyring changes across a cluster. +type KeyManager struct { + serf *Serf + + // Lock to protect read and write operations + l sync.RWMutex +} + +// keyRequest is used to contain input parameters which get broadcasted to all +// nodes as part of a key query operation. +type keyRequest struct { + Key []byte +} + +// KeyResponse is used to relay a query for a list of all keys in use. +type KeyResponse struct { + Messages map[string]string // Map of node name to response message + NumNodes int // Total nodes memberlist knows of + NumResp int // Total responses received + NumErr int // Total errors from request + + // Keys is a mapping of the base64-encoded value of the key bytes to the + // number of nodes that have the key installed. + Keys map[string]int +} + +// KeyRequestOptions is used to contain optional parameters for a keyring operation +type KeyRequestOptions struct { + // RelayFactor is the number of duplicate query responses to send by relaying through + // other nodes, for redundancy + RelayFactor uint8 +} + +// streamKeyResp takes care of reading responses from a channel and composing +// them into a KeyResponse. It will update a KeyResponse *in place* and +// therefore has nothing to return. +func (k *KeyManager) streamKeyResp(resp *KeyResponse, ch <-chan NodeResponse) { + for r := range ch { + var nodeResponse nodeKeyResponse + + resp.NumResp++ + + // Decode the response + if len(r.Payload) < 1 || messageType(r.Payload[0]) != messageKeyResponseType { + resp.Messages[r.From] = fmt.Sprintf( + "Invalid key query response type: %v", r.Payload) + resp.NumErr++ + goto NEXT + } + if err := decodeMessage(r.Payload[1:], &nodeResponse); err != nil { + resp.Messages[r.From] = fmt.Sprintf( + "Failed to decode key query response: %v", r.Payload) + resp.NumErr++ + goto NEXT + } + + if !nodeResponse.Result { + resp.Messages[r.From] = nodeResponse.Message + resp.NumErr++ + } + + // Currently only used for key list queries, this adds keys to a counter + // and increments them for each node response which contains them. + for _, key := range nodeResponse.Keys { + if _, ok := resp.Keys[key]; !ok { + resp.Keys[key] = 1 + } else { + resp.Keys[key]++ + } + } + + NEXT: + // Return early if all nodes have responded. This allows us to avoid + // waiting for the full timeout when there is nothing left to do. + if resp.NumResp == resp.NumNodes { + return + } + } +} + +// handleKeyRequest performs query broadcasting to all members for any type of +// key operation and manages gathering responses and packing them up into a +// KeyResponse for uniform response handling. +func (k *KeyManager) handleKeyRequest(key, query string, opts *KeyRequestOptions) (*KeyResponse, error) { + resp := &KeyResponse{ + Messages: make(map[string]string), + Keys: make(map[string]int), + } + qName := internalQueryName(query) + + // Decode the new key into raw bytes + rawKey, err := base64.StdEncoding.DecodeString(key) + if err != nil { + return resp, err + } + + // Encode the query request + req, err := encodeMessage(messageKeyRequestType, keyRequest{Key: rawKey}) + if err != nil { + return resp, err + } + + qParam := k.serf.DefaultQueryParams() + if opts != nil { + qParam.RelayFactor = opts.RelayFactor + } + queryResp, err := k.serf.Query(qName, req, qParam) + if err != nil { + return resp, err + } + + // Handle the response stream and populate the KeyResponse + resp.NumNodes = k.serf.memberlist.NumMembers() + k.streamKeyResp(resp, queryResp.respCh) + + // Check the response for any reported failure conditions + if resp.NumErr != 0 { + return resp, fmt.Errorf("%d/%d nodes reported failure", resp.NumErr, resp.NumNodes) + } + if resp.NumResp != resp.NumNodes { + return resp, fmt.Errorf("%d/%d nodes reported success", resp.NumResp, resp.NumNodes) + } + + return resp, nil +} + +// InstallKey handles broadcasting a query to all members and gathering +// responses from each of them, returning a list of messages from each node +// and any applicable error conditions. +func (k *KeyManager) InstallKey(key string) (*KeyResponse, error) { + return k.InstallKeyWithOptions(key, nil) +} + +func (k *KeyManager) InstallKeyWithOptions(key string, opts *KeyRequestOptions) (*KeyResponse, error) { + k.l.Lock() + defer k.l.Unlock() + + return k.handleKeyRequest(key, installKeyQuery, opts) +} + +// UseKey handles broadcasting a primary key change to all members in the +// cluster, and gathering any response messages. If successful, there should +// be an empty KeyResponse returned. +func (k *KeyManager) UseKey(key string) (*KeyResponse, error) { + return k.UseKeyWithOptions(key, nil) +} + +func (k *KeyManager) UseKeyWithOptions(key string, opts *KeyRequestOptions) (*KeyResponse, error) { + k.l.Lock() + defer k.l.Unlock() + + return k.handleKeyRequest(key, useKeyQuery, opts) +} + +// RemoveKey handles broadcasting a key to the cluster for removal. Each member +// will receive this event, and if they have the key in their keyring, remove +// it. If any errors are encountered, RemoveKey will collect and relay them. +func (k *KeyManager) RemoveKey(key string) (*KeyResponse, error) { + return k.RemoveKeyWithOptions(key, nil) +} + +func (k *KeyManager) RemoveKeyWithOptions(key string, opts *KeyRequestOptions) (*KeyResponse, error) { + k.l.Lock() + defer k.l.Unlock() + + return k.handleKeyRequest(key, removeKeyQuery, opts) +} + +// ListKeys is used to collect installed keys from members in a Serf cluster +// and return an aggregated list of all installed keys. This is useful to +// operators to ensure that there are no lingering keys installed on any agents. +// Since having multiple keys installed can cause performance penalties in some +// cases, it's important to verify this information and remove unneeded keys. +func (k *KeyManager) ListKeys() (*KeyResponse, error) { + return k.ListKeysWithOptions(nil) +} + +func (k *KeyManager) ListKeysWithOptions(opts *KeyRequestOptions) (*KeyResponse, error) { + k.l.RLock() + defer k.l.RUnlock() + + return k.handleKeyRequest("", listKeysQuery, opts) +} \ No newline at end of file diff --git a/vendor/github.com/hashicorp/serf/serf/lamport.go b/vendor/github.com/hashicorp/serf/serf/lamport.go new file mode 100644 index 000000000..08f4aa7a6 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/lamport.go @@ -0,0 +1,45 @@ +package serf + +import ( + "sync/atomic" +) + +// LamportClock is a thread safe implementation of a lamport clock. It +// uses efficient atomic operations for all of its functions, falling back +// to a heavy lock only if there are enough CAS failures. +type LamportClock struct { + counter uint64 +} + +// LamportTime is the value of a LamportClock. +type LamportTime uint64 + +// Time is used to return the current value of the lamport clock +func (l *LamportClock) Time() LamportTime { + return LamportTime(atomic.LoadUint64(&l.counter)) +} + +// Increment is used to increment and return the value of the lamport clock +func (l *LamportClock) Increment() LamportTime { + return LamportTime(atomic.AddUint64(&l.counter, 1)) +} + +// Witness is called to update our local clock if necessary after +// witnessing a clock value received from another process +func (l *LamportClock) Witness(v LamportTime) { +WITNESS: + // If the other value is old, we do not need to do anything + cur := atomic.LoadUint64(&l.counter) + other := uint64(v) + if other < cur { + return + } + + // Ensure that our local clock is at least one ahead. + if !atomic.CompareAndSwapUint64(&l.counter, cur, other+1) { + // The CAS failed, so we just retry. Eventually our CAS should + // succeed or a future witness will pass us by and our witness + // will end. + goto WITNESS + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/merge_delegate.go b/vendor/github.com/hashicorp/serf/serf/merge_delegate.go new file mode 100644 index 000000000..7fdc73288 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/merge_delegate.go @@ -0,0 +1,44 @@ +package serf + +import ( + "net" + + "github.com/hashicorp/memberlist" +) + +type MergeDelegate interface { + NotifyMerge([]*Member) error +} + +type mergeDelegate struct { + serf *Serf +} + +func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) error { + members := make([]*Member, len(nodes)) + for idx, n := range nodes { + members[idx] = m.nodeToMember(n) + } + return m.serf.config.Merge.NotifyMerge(members) +} + +func (m *mergeDelegate) NotifyAlive(peer *memberlist.Node) error { + member := m.nodeToMember(peer) + return m.serf.config.Merge.NotifyMerge([]*Member{member}) +} + +func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member { + return &Member{ + Name: n.Name, + Addr: net.IP(n.Addr), + Port: n.Port, + Tags: m.serf.decodeTags(n.Meta), + Status: StatusNone, + ProtocolMin: n.PMin, + ProtocolMax: n.PMax, + ProtocolCur: n.PCur, + DelegateMin: n.DMin, + DelegateMax: n.DMax, + DelegateCur: n.DCur, + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/messages.go b/vendor/github.com/hashicorp/serf/serf/messages.go new file mode 100644 index 000000000..20df5b8e8 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/messages.go @@ -0,0 +1,173 @@ +package serf + +import ( + "bytes" + "net" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +// messageType are the types of gossip messages Serf will send along +// memberlist. +type messageType uint8 + +const ( + messageLeaveType messageType = iota + messageJoinType + messagePushPullType + messageUserEventType + messageQueryType + messageQueryResponseType + messageConflictResponseType + messageKeyRequestType + messageKeyResponseType + messageRelayType +) + +const ( + // Ack flag is used to force receiver to send an ack back + queryFlagAck uint32 = 1 << iota + + // NoBroadcast is used to prevent re-broadcast of a query. + // this can be used to selectively send queries to individual members + queryFlagNoBroadcast +) + +// filterType is used with a queryFilter to specify the type of +// filter we are sending +type filterType uint8 + +const ( + filterNodeType filterType = iota + filterTagType +) + +// messageJoin is the message broadcasted after we join to +// associated the node with a lamport clock +type messageJoin struct { + LTime LamportTime + Node string +} + +// messageLeave is the message broadcasted to signal the intentional to +// leave. +type messageLeave struct { + LTime LamportTime + Node string +} + +// messagePushPullType is used when doing a state exchange. This +// is a relatively large message, but is sent infrequently +type messagePushPull struct { + LTime LamportTime // Current node lamport time + StatusLTimes map[string]LamportTime // Maps the node to its status time + LeftMembers []string // List of left nodes + EventLTime LamportTime // Lamport time for event clock + Events []*userEvents // Recent events + QueryLTime LamportTime // Lamport time for query clock +} + +// messageUserEvent is used for user-generated events +type messageUserEvent struct { + LTime LamportTime + Name string + Payload []byte + CC bool // "Can Coalesce". Zero value is compatible with Serf 0.1 +} + +// messageQuery is used for query events +type messageQuery struct { + LTime LamportTime // Event lamport time + ID uint32 // Query ID, randomly generated + Addr []byte // Source address, used for a direct reply + Port uint16 // Source port, used for a direct reply + Filters [][]byte // Potential query filters + Flags uint32 // Used to provide various flags + RelayFactor uint8 // Used to set the number of duplicate relayed responses + Timeout time.Duration // Maximum time between delivery and response + Name string // Query name + Payload []byte // Query payload +} + +// Ack checks if the ack flag is set +func (m *messageQuery) Ack() bool { + return (m.Flags & queryFlagAck) != 0 +} + +// NoBroadcast checks if the no broadcast flag is set +func (m *messageQuery) NoBroadcast() bool { + return (m.Flags & queryFlagNoBroadcast) != 0 +} + +// filterNode is used with the filterNodeType, and is a list +// of node names +type filterNode []string + +// filterTag is used with the filterTagType and is a regular +// expression to apply to a tag +type filterTag struct { + Tag string + Expr string +} + +// messageQueryResponse is used to respond to a query +type messageQueryResponse struct { + LTime LamportTime // Event lamport time + ID uint32 // Query ID + From string // Node name + Flags uint32 // Used to provide various flags + Payload []byte // Optional response payload +} + +// Ack checks if the ack flag is set +func (m *messageQueryResponse) Ack() bool { + return (m.Flags & queryFlagAck) != 0 +} + +func decodeMessage(buf []byte, out interface{}) error { + var handle codec.MsgpackHandle + return codec.NewDecoder(bytes.NewReader(buf), &handle).Decode(out) +} + +func encodeMessage(t messageType, msg interface{}) ([]byte, error) { + buf := bytes.NewBuffer(nil) + buf.WriteByte(uint8(t)) + + handle := codec.MsgpackHandle{} + encoder := codec.NewEncoder(buf, &handle) + err := encoder.Encode(msg) + return buf.Bytes(), err +} + +// relayHeader is used to store the end destination of a relayed message +type relayHeader struct { + DestAddr net.UDPAddr +} + +// encodeRelayMessage wraps a message in the messageRelayType, adding the length and +// address of the end recipient to the front of the message +func encodeRelayMessage(t messageType, addr net.UDPAddr, msg interface{}) ([]byte, error) { + buf := bytes.NewBuffer(nil) + handle := codec.MsgpackHandle{} + encoder := codec.NewEncoder(buf, &handle) + + buf.WriteByte(uint8(messageRelayType)) + if err := encoder.Encode(relayHeader{DestAddr: addr}); err != nil { + return nil, err + } + + buf.WriteByte(uint8(t)) + err := encoder.Encode(msg) + return buf.Bytes(), err +} + +func encodeFilter(f filterType, filt interface{}) ([]byte, error) { + buf := bytes.NewBuffer(nil) + buf.WriteByte(uint8(f)) + + handle := codec.MsgpackHandle{} + encoder := codec.NewEncoder(buf, &handle) + err := encoder.Encode(filt) + return buf.Bytes(), err +} diff --git a/vendor/github.com/hashicorp/serf/serf/ping_delegate.go b/vendor/github.com/hashicorp/serf/serf/ping_delegate.go new file mode 100644 index 000000000..a482685a2 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/ping_delegate.go @@ -0,0 +1,89 @@ +package serf + +import ( + "bytes" + "log" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/memberlist" + "github.com/hashicorp/serf/coordinate" +) + +// pingDelegate is notified when memberlist successfully completes a direct ping +// of a peer node. We use this to update our estimated network coordinate, as +// well as cache the coordinate of the peer. +type pingDelegate struct { + serf *Serf +} + +const ( + // PingVersion is an internal version for the ping message, above the normal + // versioning we get from the protocol version. This enables small updates + // to the ping message without a full protocol bump. + PingVersion = 1 +) + +// AckPayload is called to produce a payload to send back in response to a ping +// request. +func (p *pingDelegate) AckPayload() []byte { + var buf bytes.Buffer + + // The first byte is the version number, forming a simple header. + version := []byte{PingVersion} + buf.Write(version) + + // The rest of the message is the serialized coordinate. + enc := codec.NewEncoder(&buf, &codec.MsgpackHandle{}) + if err := enc.Encode(p.serf.coordClient.GetCoordinate()); err != nil { + log.Printf("[ERR] serf: Failed to encode coordinate: %v\n", err) + } + return buf.Bytes() +} + +// NotifyPingComplete is called when this node successfully completes a direct ping +// of a peer node. +func (p *pingDelegate) NotifyPingComplete(other *memberlist.Node, rtt time.Duration, payload []byte) { + if payload == nil || len(payload) == 0 { + return + } + + // Verify ping version in the header. + version := payload[0] + if version != PingVersion { + log.Printf("[ERR] serf: Unsupported ping version: %v", version) + return + } + + // Process the remainder of the message as a coordinate. + r := bytes.NewReader(payload[1:]) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + var coord coordinate.Coordinate + if err := dec.Decode(&coord); err != nil { + log.Printf("[ERR] serf: Failed to decode coordinate from ping: %v", err) + } + + // Apply the update. Since this is a coordinate coming from some place + // else we harden this and look for dimensionality problems proactively. + before := p.serf.coordClient.GetCoordinate() + if before.IsCompatibleWith(&coord) { + after := p.serf.coordClient.Update(other.Name, &coord, rtt) + + // Publish some metrics to give us an idea of how much we are + // adjusting each time we update. + d := float32(before.DistanceTo(after).Seconds() * 1.0e3) + metrics.AddSample([]string{"serf", "coordinate", "adjustment-ms"}, d) + + // Cache the coordinate for the other node, and add our own + // to the cache as well since it just got updated. This lets + // users call GetCachedCoordinate with our node name, which is + // more friendly. + p.serf.coordCacheLock.Lock() + p.serf.coordCache[other.Name] = &coord + p.serf.coordCache[p.serf.config.NodeName] = p.serf.coordClient.GetCoordinate() + p.serf.coordCacheLock.Unlock() + } else { + log.Printf("[ERR] serf: Rejected bad coordinate: %v\n", coord) + } +} diff --git a/vendor/github.com/hashicorp/serf/serf/query.go b/vendor/github.com/hashicorp/serf/serf/query.go new file mode 100644 index 000000000..5412821e3 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/query.go @@ -0,0 +1,294 @@ +package serf + +import ( + "fmt" + "math" + "math/rand" + "net" + "regexp" + "sync" + "time" +) + +// QueryParam is provided to Query() to configure the parameters of the +// query. If not provided, sane defaults will be used. +type QueryParam struct { + // If provided, we restrict the nodes that should respond to those + // with names in this list + FilterNodes []string + + // FilterTags maps a tag name to a regular expression that is applied + // to restrict the nodes that should respond + FilterTags map[string]string + + // If true, we are requesting an delivery acknowledgement from + // every node that meets the filter requirement. This means nodes + // the receive the message but do not pass the filters, will not + // send an ack. + RequestAck bool + + // RelayFactor controls the number of duplicate responses to relay + // back to the sender through other nodes for redundancy. + RelayFactor uint8 + + // The timeout limits how long the query is left open. If not provided, + // then a default timeout is used based on the configuration of Serf + Timeout time.Duration +} + +// DefaultQueryTimeout returns the default timeout value for a query +// Computed as GossipInterval * QueryTimeoutMult * log(N+1) +func (s *Serf) DefaultQueryTimeout() time.Duration { + n := s.memberlist.NumMembers() + timeout := s.config.MemberlistConfig.GossipInterval + timeout *= time.Duration(s.config.QueryTimeoutMult) + timeout *= time.Duration(math.Ceil(math.Log10(float64(n + 1)))) + return timeout +} + +// DefaultQueryParam is used to return the default query parameters +func (s *Serf) DefaultQueryParams() *QueryParam { + return &QueryParam{ + FilterNodes: nil, + FilterTags: nil, + RequestAck: false, + Timeout: s.DefaultQueryTimeout(), + } +} + +// encodeFilters is used to convert the filters into the wire format +func (q *QueryParam) encodeFilters() ([][]byte, error) { + var filters [][]byte + + // Add the node filter + if len(q.FilterNodes) > 0 { + if buf, err := encodeFilter(filterNodeType, q.FilterNodes); err != nil { + return nil, err + } else { + filters = append(filters, buf) + } + } + + // Add the tag filters + for tag, expr := range q.FilterTags { + filt := filterTag{tag, expr} + if buf, err := encodeFilter(filterTagType, &filt); err != nil { + return nil, err + } else { + filters = append(filters, buf) + } + } + + return filters, nil +} + +// QueryResponse is returned for each new Query. It is used to collect +// Ack's as well as responses and to provide those back to a client. +type QueryResponse struct { + // ackCh is used to send the name of a node for which we've received an ack + ackCh chan string + + // deadline is the query end time (start + query timeout) + deadline time.Time + + // Query ID + id uint32 + + // Stores the LTime of the query + lTime LamportTime + + // respCh is used to send a response from a node + respCh chan NodeResponse + + // acks/responses are used to track the nodes that have sent an ack/response + acks map[string]struct{} + responses map[string]struct{} + + closed bool + closeLock sync.Mutex +} + +// newQueryResponse is used to construct a new query response +func newQueryResponse(n int, q *messageQuery) *QueryResponse { + resp := &QueryResponse{ + deadline: time.Now().Add(q.Timeout), + id: q.ID, + lTime: q.LTime, + respCh: make(chan NodeResponse, n), + responses: make(map[string]struct{}), + } + if q.Ack() { + resp.ackCh = make(chan string, n) + resp.acks = make(map[string]struct{}) + } + return resp +} + +// Close is used to close the query, which will close the underlying +// channels and prevent further deliveries +func (r *QueryResponse) Close() { + r.closeLock.Lock() + defer r.closeLock.Unlock() + if r.closed { + return + } + r.closed = true + if r.ackCh != nil { + close(r.ackCh) + } + if r.respCh != nil { + close(r.respCh) + } +} + +// Deadline returns the ending deadline of the query +func (r *QueryResponse) Deadline() time.Time { + return r.deadline +} + +// Finished returns if the query is finished running +func (r *QueryResponse) Finished() bool { + return r.closed || time.Now().After(r.deadline) +} + +// AckCh returns a channel that can be used to listen for acks +// Channel will be closed when the query is finished. This is nil, +// if the query did not specify RequestAck. +func (r *QueryResponse) AckCh() <-chan string { + return r.ackCh +} + +// ResponseCh returns a channel that can be used to listen for responses. +// Channel will be closed when the query is finished. +func (r *QueryResponse) ResponseCh() <-chan NodeResponse { + return r.respCh +} + +// NodeResponse is used to represent a single response from a node +type NodeResponse struct { + From string + Payload []byte +} + +// shouldProcessQuery checks if a query should be proceeded given +// a set of filers. +func (s *Serf) shouldProcessQuery(filters [][]byte) bool { + for _, filter := range filters { + switch filterType(filter[0]) { + case filterNodeType: + // Decode the filter + var nodes filterNode + if err := decodeMessage(filter[1:], &nodes); err != nil { + s.logger.Printf("[WARN] serf: failed to decode filterNodeType: %v", err) + return false + } + + // Check if we are being targeted + found := false + for _, n := range nodes { + if n == s.config.NodeName { + found = true + break + } + } + if !found { + return false + } + + case filterTagType: + // Decode the filter + var filt filterTag + if err := decodeMessage(filter[1:], &filt); err != nil { + s.logger.Printf("[WARN] serf: failed to decode filterTagType: %v", err) + return false + } + + // Check if we match this regex + tags := s.config.Tags + matched, err := regexp.MatchString(filt.Expr, tags[filt.Tag]) + if err != nil { + s.logger.Printf("[WARN] serf: failed to compile filter regex (%s): %v", filt.Expr, err) + return false + } + if !matched { + return false + } + + default: + s.logger.Printf("[WARN] serf: query has unrecognized filter type: %d", filter[0]) + return false + } + } + return true +} + +// relayResponse will relay a copy of the given response to up to relayFactor +// other members. +func (s *Serf) relayResponse(relayFactor uint8, addr net.UDPAddr, resp *messageQueryResponse) error { + if relayFactor == 0 { + return nil + } + + // Needs to be worth it; we need to have at least relayFactor *other* + // nodes. If you have a tiny cluster then the relayFactor shouldn't + // be needed. + members := s.Members() + if len(members) < int(relayFactor)+1 { + return nil + } + + // Prep the relay message, which is a wrapped version of the original. + raw, err := encodeRelayMessage(messageQueryResponseType, addr, &resp) + if err != nil { + return fmt.Errorf("failed to format relayed response: %v", err) + } + if len(raw) > s.config.QueryResponseSizeLimit { + return fmt.Errorf("relayed response exceeds limit of %d bytes", s.config.QueryResponseSizeLimit) + } + + // Relay to a random set of peers. + localName := s.LocalMember().Name + relayMembers := kRandomMembers(int(relayFactor), members, func(m Member) bool { + return m.Status != StatusAlive || m.ProtocolMax < 5 || m.Name == localName + }) + for _, m := range relayMembers { + relayAddr := net.UDPAddr{IP: m.Addr, Port: int(m.Port)} + if err := s.memberlist.SendTo(&relayAddr, raw); err != nil { + return fmt.Errorf("failed to send relay response: %v", err) + } + } + return nil +} + +// kRandomMembers selects up to k members from a given list, optionally +// filtering by the given filterFunc +func kRandomMembers(k int, members []Member, filterFunc func(Member) bool) []Member { + n := len(members) + kMembers := make([]Member, 0, k) +OUTER: + // Probe up to 3*n times, with large n this is not necessary + // since k << n, but with small n we want search to be + // exhaustive + for i := 0; i < 3*n && len(kMembers) < k; i++ { + // Get random member + idx := rand.Intn(n) + member := members[idx] + + // Give the filter a shot at it. + if filterFunc != nil && filterFunc(member) { + continue OUTER + } + + // Check if we have this member already + for j := 0; j < len(kMembers); j++ { + if member.Name == kMembers[j].Name { + continue OUTER + } + } + + // Append the member + kMembers = append(kMembers, member) + } + + return kMembers +} diff --git a/vendor/github.com/hashicorp/serf/serf/serf.go b/vendor/github.com/hashicorp/serf/serf/serf.go new file mode 100644 index 000000000..af2d17ede --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/serf.go @@ -0,0 +1,1739 @@ +package serf + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "log" + "math/rand" + "net" + "os" + "strconv" + "sync" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/memberlist" + "github.com/hashicorp/serf/coordinate" +) + +// These are the protocol versions that Serf can _understand_. These are +// Serf-level protocol versions that are passed down as the delegate +// version to memberlist below. +const ( + ProtocolVersionMin uint8 = 2 + ProtocolVersionMax = 5 +) + +const ( + // Used to detect if the meta data is tags + // or if it is a raw role + tagMagicByte uint8 = 255 +) + +var ( + // FeatureNotSupported is returned if a feature cannot be used + // due to an older protocol version being used. + FeatureNotSupported = fmt.Errorf("Feature not supported") +) + +func init() { + // Seed the random number generator + rand.Seed(time.Now().UnixNano()) +} + +// Serf is a single node that is part of a single cluster that gets +// events about joins/leaves/failures/etc. It is created with the Create +// method. +// +// All functions on the Serf structure are safe to call concurrently. +type Serf struct { + // The clocks for different purposes. These MUST be the first things + // in this struct due to Golang issue #599. + clock LamportClock + eventClock LamportClock + queryClock LamportClock + + broadcasts *memberlist.TransmitLimitedQueue + config *Config + failedMembers []*memberState + leftMembers []*memberState + memberlist *memberlist.Memberlist + memberLock sync.RWMutex + members map[string]*memberState + + // recentIntents the lamport time and type of intent for a given node in + // case we get an intent before the relevant memberlist event. This is + // indexed by node, and always store the latest lamport time / intent + // we've seen. The memberLock protects this structure. + recentIntents map[string]nodeIntent + + eventBroadcasts *memberlist.TransmitLimitedQueue + eventBuffer []*userEvents + eventJoinIgnore bool + eventMinTime LamportTime + eventLock sync.RWMutex + + queryBroadcasts *memberlist.TransmitLimitedQueue + queryBuffer []*queries + queryMinTime LamportTime + queryResponse map[LamportTime]*QueryResponse + queryLock sync.RWMutex + + logger *log.Logger + joinLock sync.Mutex + stateLock sync.Mutex + state SerfState + shutdownCh chan struct{} + + snapshotter *Snapshotter + keyManager *KeyManager + + coordClient *coordinate.Client + coordCache map[string]*coordinate.Coordinate + coordCacheLock sync.RWMutex +} + +// SerfState is the state of the Serf instance. +type SerfState int + +const ( + SerfAlive SerfState = iota + SerfLeaving + SerfLeft + SerfShutdown +) + +func (s SerfState) String() string { + switch s { + case SerfAlive: + return "alive" + case SerfLeaving: + return "leaving" + case SerfLeft: + return "left" + case SerfShutdown: + return "shutdown" + default: + return "unknown" + } +} + +// Member is a single member of the Serf cluster. +type Member struct { + Name string + Addr net.IP + Port uint16 + Tags map[string]string + Status MemberStatus + + // The minimum, maximum, and current values of the protocol versions + // and delegate (Serf) protocol versions that each member can understand + // or is speaking. + ProtocolMin uint8 + ProtocolMax uint8 + ProtocolCur uint8 + DelegateMin uint8 + DelegateMax uint8 + DelegateCur uint8 +} + +// MemberStatus is the state that a member is in. +type MemberStatus int + +const ( + StatusNone MemberStatus = iota + StatusAlive + StatusLeaving + StatusLeft + StatusFailed +) + +func (s MemberStatus) String() string { + switch s { + case StatusNone: + return "none" + case StatusAlive: + return "alive" + case StatusLeaving: + return "leaving" + case StatusLeft: + return "left" + case StatusFailed: + return "failed" + default: + panic(fmt.Sprintf("unknown MemberStatus: %d", s)) + } +} + +// memberState is used to track members that are no longer active due to +// leaving, failing, partitioning, etc. It tracks the member along with +// when that member was marked as leaving. +type memberState struct { + Member + statusLTime LamportTime // lamport clock time of last received message + leaveTime time.Time // wall clock time of leave +} + +// nodeIntent is used to buffer intents for out-of-order deliveries. +type nodeIntent struct { + // Type is the intent being tracked. Only messageJoinType and + // messageLeaveType are tracked. + Type messageType + + // WallTime is the wall clock time we saw this intent in order to + // expire it from the buffer. + WallTime time.Time + + // LTime is the Lamport time, used for cluster-wide ordering of events. + LTime LamportTime +} + +// userEvent is used to buffer events to prevent re-delivery +type userEvent struct { + Name string + Payload []byte +} + +func (ue *userEvent) Equals(other *userEvent) bool { + if ue.Name != other.Name { + return false + } + if bytes.Compare(ue.Payload, other.Payload) != 0 { + return false + } + return true +} + +// userEvents stores all the user events at a specific time +type userEvents struct { + LTime LamportTime + Events []userEvent +} + +// queries stores all the query ids at a specific time +type queries struct { + LTime LamportTime + QueryIDs []uint32 +} + +const ( + UserEventSizeLimit = 512 // Maximum byte size for event name and payload + snapshotSizeLimit = 128 * 1024 // Maximum 128 KB snapshot +) + +// Create creates a new Serf instance, starting all the background tasks +// to maintain cluster membership information. +// +// After calling this function, the configuration should no longer be used +// or modified by the caller. +func Create(conf *Config) (*Serf, error) { + conf.Init() + if conf.ProtocolVersion < ProtocolVersionMin { + return nil, fmt.Errorf("Protocol version '%d' too low. Must be in range: [%d, %d]", + conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) + } else if conf.ProtocolVersion > ProtocolVersionMax { + return nil, fmt.Errorf("Protocol version '%d' too high. Must be in range: [%d, %d]", + conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) + } + + if conf.LogOutput != nil && conf.Logger != nil { + return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.") + } + + logDest := conf.LogOutput + if logDest == nil { + logDest = os.Stderr + } + + logger := conf.Logger + if logger == nil { + logger = log.New(logDest, "", log.LstdFlags) + } + + serf := &Serf{ + config: conf, + logger: logger, + members: make(map[string]*memberState), + queryResponse: make(map[LamportTime]*QueryResponse), + shutdownCh: make(chan struct{}), + state: SerfAlive, + } + + // Check that the meta data length is okay + if len(serf.encodeTags(conf.Tags)) > memberlist.MetaMaxSize { + return nil, fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", memberlist.MetaMaxSize) + } + + // Check if serf member event coalescing is enabled + if conf.CoalescePeriod > 0 && conf.QuiescentPeriod > 0 && conf.EventCh != nil { + c := &memberEventCoalescer{ + lastEvents: make(map[string]EventType), + latestEvents: make(map[string]coalesceEvent), + } + + conf.EventCh = coalescedEventCh(conf.EventCh, serf.shutdownCh, + conf.CoalescePeriod, conf.QuiescentPeriod, c) + } + + // Check if user event coalescing is enabled + if conf.UserCoalescePeriod > 0 && conf.UserQuiescentPeriod > 0 && conf.EventCh != nil { + c := &userEventCoalescer{ + events: make(map[string]*latestUserEvents), + } + + conf.EventCh = coalescedEventCh(conf.EventCh, serf.shutdownCh, + conf.UserCoalescePeriod, conf.UserQuiescentPeriod, c) + } + + // Listen for internal Serf queries. This is setup before the snapshotter, since + // we want to capture the query-time, but the internal listener does not passthrough + // the queries + outCh, err := newSerfQueries(serf, serf.logger, conf.EventCh, serf.shutdownCh) + if err != nil { + return nil, fmt.Errorf("Failed to setup serf query handler: %v", err) + } + conf.EventCh = outCh + + // Set up network coordinate client. + if !conf.DisableCoordinates { + serf.coordClient, err = coordinate.NewClient(coordinate.DefaultConfig()) + if err != nil { + return nil, fmt.Errorf("Failed to create coordinate client: %v", err) + } + } + + // Try access the snapshot + var oldClock, oldEventClock, oldQueryClock LamportTime + var prev []*PreviousNode + if conf.SnapshotPath != "" { + eventCh, snap, err := NewSnapshotter( + conf.SnapshotPath, + snapshotSizeLimit, + conf.RejoinAfterLeave, + serf.logger, + &serf.clock, + serf.coordClient, + conf.EventCh, + serf.shutdownCh) + if err != nil { + return nil, fmt.Errorf("Failed to setup snapshot: %v", err) + } + serf.snapshotter = snap + conf.EventCh = eventCh + prev = snap.AliveNodes() + oldClock = snap.LastClock() + oldEventClock = snap.LastEventClock() + oldQueryClock = snap.LastQueryClock() + serf.eventMinTime = oldEventClock + 1 + serf.queryMinTime = oldQueryClock + 1 + } + + // Set up the coordinate cache. We do this after we read the snapshot to + // make sure we get a good initial value from there, if we got one. + if !conf.DisableCoordinates { + serf.coordCache = make(map[string]*coordinate.Coordinate) + serf.coordCache[conf.NodeName] = serf.coordClient.GetCoordinate() + } + + // Setup the various broadcast queues, which we use to send our own + // custom broadcasts along the gossip channel. + serf.broadcasts = &memberlist.TransmitLimitedQueue{ + NumNodes: serf.NumNodes, + RetransmitMult: conf.MemberlistConfig.RetransmitMult, + } + serf.eventBroadcasts = &memberlist.TransmitLimitedQueue{ + NumNodes: serf.NumNodes, + RetransmitMult: conf.MemberlistConfig.RetransmitMult, + } + serf.queryBroadcasts = &memberlist.TransmitLimitedQueue{ + NumNodes: serf.NumNodes, + RetransmitMult: conf.MemberlistConfig.RetransmitMult, + } + + // Create the buffer for recent intents + serf.recentIntents = make(map[string]nodeIntent) + + // Create a buffer for events and queries + serf.eventBuffer = make([]*userEvents, conf.EventBuffer) + serf.queryBuffer = make([]*queries, conf.QueryBuffer) + + // Ensure our lamport clock is at least 1, so that the default + // join LTime of 0 does not cause issues + serf.clock.Increment() + serf.eventClock.Increment() + serf.queryClock.Increment() + + // Restore the clock from snap if we have one + serf.clock.Witness(oldClock) + serf.eventClock.Witness(oldEventClock) + serf.queryClock.Witness(oldQueryClock) + + // Modify the memberlist configuration with keys that we set + conf.MemberlistConfig.Events = &eventDelegate{serf: serf} + conf.MemberlistConfig.Conflict = &conflictDelegate{serf: serf} + conf.MemberlistConfig.Delegate = &delegate{serf: serf} + conf.MemberlistConfig.DelegateProtocolVersion = conf.ProtocolVersion + conf.MemberlistConfig.DelegateProtocolMin = ProtocolVersionMin + conf.MemberlistConfig.DelegateProtocolMax = ProtocolVersionMax + conf.MemberlistConfig.Name = conf.NodeName + conf.MemberlistConfig.ProtocolVersion = ProtocolVersionMap[conf.ProtocolVersion] + if !conf.DisableCoordinates { + conf.MemberlistConfig.Ping = &pingDelegate{serf: serf} + } + + // Setup a merge delegate if necessary + if conf.Merge != nil { + md := &mergeDelegate{serf: serf} + conf.MemberlistConfig.Merge = md + conf.MemberlistConfig.Alive = md + } + + // Create the underlying memberlist that will manage membership + // and failure detection for the Serf instance. + memberlist, err := memberlist.Create(conf.MemberlistConfig) + if err != nil { + return nil, fmt.Errorf("Failed to create memberlist: %v", err) + } + + serf.memberlist = memberlist + + // Create a key manager for handling all encryption key changes + serf.keyManager = &KeyManager{serf: serf} + + // Start the background tasks. See the documentation above each method + // for more information on their role. + go serf.handleReap() + go serf.handleReconnect() + go serf.checkQueueDepth("Intent", serf.broadcasts) + go serf.checkQueueDepth("Event", serf.eventBroadcasts) + go serf.checkQueueDepth("Query", serf.queryBroadcasts) + + // Attempt to re-join the cluster if we have known nodes + if len(prev) != 0 { + go serf.handleRejoin(prev) + } + + return serf, nil +} + +// ProtocolVersion returns the current protocol version in use by Serf. +// This is the Serf protocol version, not the memberlist protocol version. +func (s *Serf) ProtocolVersion() uint8 { + return s.config.ProtocolVersion +} + +// EncryptionEnabled is a predicate that determines whether or not encryption +// is enabled, which can be possible in one of 2 cases: +// - Single encryption key passed at agent start (no persistence) +// - Keyring file provided at agent start +func (s *Serf) EncryptionEnabled() bool { + return s.config.MemberlistConfig.Keyring != nil +} + +// KeyManager returns the key manager for the current Serf instance. +func (s *Serf) KeyManager() *KeyManager { + return s.keyManager +} + +// UserEvent is used to broadcast a custom user event with a given +// name and payload. The events must be fairly small, and if the +// size limit is exceeded and error will be returned. If coalesce is enabled, +// nodes are allowed to coalesce this event. Coalescing is only available +// starting in v0.2 +func (s *Serf) UserEvent(name string, payload []byte, coalesce bool) error { + // Check the size limit + if len(name)+len(payload) > UserEventSizeLimit { + return fmt.Errorf("user event exceeds limit of %d bytes", UserEventSizeLimit) + } + + // Create a message + msg := messageUserEvent{ + LTime: s.eventClock.Time(), + Name: name, + Payload: payload, + CC: coalesce, + } + s.eventClock.Increment() + + // Process update locally + s.handleUserEvent(&msg) + + // Start broadcasting the event + raw, err := encodeMessage(messageUserEventType, &msg) + if err != nil { + return err + } + s.eventBroadcasts.QueueBroadcast(&broadcast{ + msg: raw, + }) + return nil +} + +// Query is used to broadcast a new query. The query must be fairly small, +// and an error will be returned if the size limit is exceeded. This is only +// available with protocol version 4 and newer. Query parameters are optional, +// and if not provided, a sane set of defaults will be used. +func (s *Serf) Query(name string, payload []byte, params *QueryParam) (*QueryResponse, error) { + // Check that the latest protocol is in use + if s.ProtocolVersion() < 4 { + return nil, FeatureNotSupported + } + + // Provide default parameters if none given + if params == nil { + params = s.DefaultQueryParams() + } else if params.Timeout == 0 { + params.Timeout = s.DefaultQueryTimeout() + } + + // Get the local node + local := s.memberlist.LocalNode() + + // Encode the filters + filters, err := params.encodeFilters() + if err != nil { + return nil, fmt.Errorf("Failed to format filters: %v", err) + } + + // Setup the flags + var flags uint32 + if params.RequestAck { + flags |= queryFlagAck + } + + // Create a message + q := messageQuery{ + LTime: s.queryClock.Time(), + ID: uint32(rand.Int31()), + Addr: local.Addr, + Port: local.Port, + Filters: filters, + Flags: flags, + RelayFactor: params.RelayFactor, + Timeout: params.Timeout, + Name: name, + Payload: payload, + } + + // Encode the query + raw, err := encodeMessage(messageQueryType, &q) + if err != nil { + return nil, err + } + + // Check the size + if len(raw) > s.config.QuerySizeLimit { + return nil, fmt.Errorf("query exceeds limit of %d bytes", s.config.QuerySizeLimit) + } + + // Register QueryResponse to track acks and responses + resp := newQueryResponse(s.memberlist.NumMembers(), &q) + s.registerQueryResponse(params.Timeout, resp) + + // Process query locally + s.handleQuery(&q) + + // Start broadcasting the event + s.queryBroadcasts.QueueBroadcast(&broadcast{ + msg: raw, + }) + return resp, nil +} + +// registerQueryResponse is used to setup the listeners for the query, +// and to schedule closing the query after the timeout. +func (s *Serf) registerQueryResponse(timeout time.Duration, resp *QueryResponse) { + s.queryLock.Lock() + defer s.queryLock.Unlock() + + // Map the LTime to the QueryResponse. This is necessarily 1-to-1, + // since we increment the time for each new query. + s.queryResponse[resp.lTime] = resp + + // Setup a timer to close the response and deregister after the timeout + time.AfterFunc(timeout, func() { + s.queryLock.Lock() + delete(s.queryResponse, resp.lTime) + resp.Close() + s.queryLock.Unlock() + }) +} + +// SetTags is used to dynamically update the tags associated with +// the local node. This will propagate the change to the rest of +// the cluster. Blocks until a the message is broadcast out. +func (s *Serf) SetTags(tags map[string]string) error { + // Check that the meta data length is okay + if len(s.encodeTags(tags)) > memberlist.MetaMaxSize { + return fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", + memberlist.MetaMaxSize) + } + + // Update the config + s.config.Tags = tags + + // Trigger a memberlist update + return s.memberlist.UpdateNode(s.config.BroadcastTimeout) +} + +// Join joins an existing Serf cluster. Returns the number of nodes +// successfully contacted. The returned error will be non-nil only in the +// case that no nodes could be contacted. If ignoreOld is true, then any +// user messages sent prior to the join will be ignored. +func (s *Serf) Join(existing []string, ignoreOld bool) (int, error) { + // Do a quick state check + if s.State() != SerfAlive { + return 0, fmt.Errorf("Serf can't Join after Leave or Shutdown") + } + + // Hold the joinLock, this is to make eventJoinIgnore safe + s.joinLock.Lock() + defer s.joinLock.Unlock() + + // Ignore any events from a potential join. This is safe since we hold + // the joinLock and nobody else can be doing a Join + if ignoreOld { + s.eventJoinIgnore = true + defer func() { + s.eventJoinIgnore = false + }() + } + + // Have memberlist attempt to join + num, err := s.memberlist.Join(existing) + + // If we joined any nodes, broadcast the join message + if num > 0 { + // Start broadcasting the update + if err := s.broadcastJoin(s.clock.Time()); err != nil { + return num, err + } + } + + return num, err +} + +// broadcastJoin broadcasts a new join intent with a +// given clock value. It is used on either join, or if +// we need to refute an older leave intent. Cannot be called +// with the memberLock held. +func (s *Serf) broadcastJoin(ltime LamportTime) error { + // Construct message to update our lamport clock + msg := messageJoin{ + LTime: ltime, + Node: s.config.NodeName, + } + s.clock.Witness(ltime) + + // Process update locally + s.handleNodeJoinIntent(&msg) + + // Start broadcasting the update + if err := s.broadcast(messageJoinType, &msg, nil); err != nil { + s.logger.Printf("[WARN] serf: Failed to broadcast join intent: %v", err) + return err + } + return nil +} + +// Leave gracefully exits the cluster. It is safe to call this multiple +// times. +func (s *Serf) Leave() error { + // Check the current state + s.stateLock.Lock() + if s.state == SerfLeft { + s.stateLock.Unlock() + return nil + } else if s.state == SerfLeaving { + s.stateLock.Unlock() + return fmt.Errorf("Leave already in progress") + } else if s.state == SerfShutdown { + s.stateLock.Unlock() + return fmt.Errorf("Leave called after Shutdown") + } + s.state = SerfLeaving + s.stateLock.Unlock() + + // If we have a snapshot, mark we are leaving + if s.snapshotter != nil { + s.snapshotter.Leave() + } + + // Construct the message for the graceful leave + msg := messageLeave{ + LTime: s.clock.Time(), + Node: s.config.NodeName, + } + s.clock.Increment() + + // Process the leave locally + s.handleNodeLeaveIntent(&msg) + + // Only broadcast the leave message if there is at least one + // other node alive. + if s.hasAliveMembers() { + notifyCh := make(chan struct{}) + if err := s.broadcast(messageLeaveType, &msg, notifyCh); err != nil { + return err + } + + select { + case <-notifyCh: + case <-time.After(s.config.BroadcastTimeout): + return errors.New("timeout while waiting for graceful leave") + } + } + + // Attempt the memberlist leave + err := s.memberlist.Leave(s.config.BroadcastTimeout) + if err != nil { + return err + } + + // Transition to Left only if we not already shutdown + s.stateLock.Lock() + if s.state != SerfShutdown { + s.state = SerfLeft + } + s.stateLock.Unlock() + return nil +} + +// hasAliveMembers is called to check for any alive members other than +// ourself. +func (s *Serf) hasAliveMembers() bool { + s.memberLock.RLock() + defer s.memberLock.RUnlock() + + hasAlive := false + for _, m := range s.members { + // Skip ourself, we want to know if OTHER members are alive + if m.Name == s.config.NodeName { + continue + } + + if m.Status == StatusAlive { + hasAlive = true + break + } + } + return hasAlive +} + +// LocalMember returns the Member information for the local node +func (s *Serf) LocalMember() Member { + s.memberLock.RLock() + defer s.memberLock.RUnlock() + return s.members[s.config.NodeName].Member +} + +// Members returns a point-in-time snapshot of the members of this cluster. +func (s *Serf) Members() []Member { + s.memberLock.RLock() + defer s.memberLock.RUnlock() + + members := make([]Member, 0, len(s.members)) + for _, m := range s.members { + members = append(members, m.Member) + } + + return members +} + +// RemoveFailedNode forcibly removes a failed node from the cluster +// immediately, instead of waiting for the reaper to eventually reclaim it. +// This also has the effect that Serf will no longer attempt to reconnect +// to this node. +func (s *Serf) RemoveFailedNode(node string) error { + // Construct the message to broadcast + msg := messageLeave{ + LTime: s.clock.Time(), + Node: node, + } + s.clock.Increment() + + // Process our own event + s.handleNodeLeaveIntent(&msg) + + // If we have no members, then we don't need to broadcast + if !s.hasAliveMembers() { + return nil + } + + // Broadcast the remove + notifyCh := make(chan struct{}) + if err := s.broadcast(messageLeaveType, &msg, notifyCh); err != nil { + return err + } + + // Wait for the broadcast + select { + case <-notifyCh: + case <-time.After(s.config.BroadcastTimeout): + return fmt.Errorf("timed out broadcasting node removal") + } + + return nil +} + +// Shutdown forcefully shuts down the Serf instance, stopping all network +// activity and background maintenance associated with the instance. +// +// This is not a graceful shutdown, and should be preceded by a call +// to Leave. Otherwise, other nodes in the cluster will detect this node's +// exit as a node failure. +// +// It is safe to call this method multiple times. +func (s *Serf) Shutdown() error { + s.stateLock.Lock() + defer s.stateLock.Unlock() + + if s.state == SerfShutdown { + return nil + } + + if s.state != SerfLeft { + s.logger.Printf("[WARN] serf: Shutdown without a Leave") + } + + // Wait to close the shutdown channel until after we've shut down the + // memberlist and its associated network resources, since the shutdown + // channel signals that we are cleaned up outside of Serf. + s.state = SerfShutdown + err := s.memberlist.Shutdown() + if err != nil { + return err + } + close(s.shutdownCh) + + // Wait for the snapshoter to finish if we have one + if s.snapshotter != nil { + s.snapshotter.Wait() + } + + return nil +} + +// ShutdownCh returns a channel that can be used to wait for +// Serf to shutdown. +func (s *Serf) ShutdownCh() <-chan struct{} { + return s.shutdownCh +} + +// Memberlist is used to get access to the underlying Memberlist instance +func (s *Serf) Memberlist() *memberlist.Memberlist { + return s.memberlist +} + +// State is the current state of this Serf instance. +func (s *Serf) State() SerfState { + s.stateLock.Lock() + defer s.stateLock.Unlock() + return s.state +} + +// broadcast takes a Serf message type, encodes it for the wire, and queues +// the broadcast. If a notify channel is given, this channel will be closed +// when the broadcast is sent. +func (s *Serf) broadcast(t messageType, msg interface{}, notify chan<- struct{}) error { + raw, err := encodeMessage(t, msg) + if err != nil { + return err + } + + s.broadcasts.QueueBroadcast(&broadcast{ + msg: raw, + notify: notify, + }) + return nil +} + +// handleNodeJoin is called when a node join event is received +// from memberlist. +func (s *Serf) handleNodeJoin(n *memberlist.Node) { + s.memberLock.Lock() + defer s.memberLock.Unlock() + + var oldStatus MemberStatus + member, ok := s.members[n.Name] + if !ok { + oldStatus = StatusNone + member = &memberState{ + Member: Member{ + Name: n.Name, + Addr: net.IP(n.Addr), + Port: n.Port, + Tags: s.decodeTags(n.Meta), + Status: StatusAlive, + }, + } + + // Check if we have a join or leave intent. The intent buffer + // will only hold one event for this node, so the more recent + // one will take effect. + if join, ok := recentIntent(s.recentIntents, n.Name, messageJoinType); ok { + member.statusLTime = join + } + if leave, ok := recentIntent(s.recentIntents, n.Name, messageLeaveType); ok { + member.Status = StatusLeaving + member.statusLTime = leave + } + + s.members[n.Name] = member + } else { + oldStatus = member.Status + deadTime := time.Now().Sub(member.leaveTime) + if oldStatus == StatusFailed && deadTime < s.config.FlapTimeout { + metrics.IncrCounter([]string{"serf", "member", "flap"}, 1) + } + + member.Status = StatusAlive + member.leaveTime = time.Time{} + member.Addr = net.IP(n.Addr) + member.Port = n.Port + member.Tags = s.decodeTags(n.Meta) + } + + // Update the protocol versions every time we get an event + member.ProtocolMin = n.PMin + member.ProtocolMax = n.PMax + member.ProtocolCur = n.PCur + member.DelegateMin = n.DMin + member.DelegateMax = n.DMax + member.DelegateCur = n.DCur + + // If node was previously in a failed state, then clean up some + // internal accounting. + // TODO(mitchellh): needs tests to verify not reaped + if oldStatus == StatusFailed || oldStatus == StatusLeft { + s.failedMembers = removeOldMember(s.failedMembers, member.Name) + s.leftMembers = removeOldMember(s.leftMembers, member.Name) + } + + // Update some metrics + metrics.IncrCounter([]string{"serf", "member", "join"}, 1) + + // Send an event along + s.logger.Printf("[INFO] serf: EventMemberJoin: %s %s", + member.Member.Name, member.Member.Addr) + if s.config.EventCh != nil { + s.config.EventCh <- MemberEvent{ + Type: EventMemberJoin, + Members: []Member{member.Member}, + } + } +} + +// handleNodeLeave is called when a node leave event is received +// from memberlist. +func (s *Serf) handleNodeLeave(n *memberlist.Node) { + s.memberLock.Lock() + defer s.memberLock.Unlock() + + member, ok := s.members[n.Name] + if !ok { + // We've never even heard of this node that is supposedly + // leaving. Just ignore it completely. + return + } + + switch member.Status { + case StatusLeaving: + member.Status = StatusLeft + member.leaveTime = time.Now() + s.leftMembers = append(s.leftMembers, member) + case StatusAlive: + member.Status = StatusFailed + member.leaveTime = time.Now() + s.failedMembers = append(s.failedMembers, member) + default: + // Unknown state that it was in? Just don't do anything + s.logger.Printf("[WARN] serf: Bad state when leave: %d", member.Status) + return + } + + // Send an event along + event := EventMemberLeave + eventStr := "EventMemberLeave" + if member.Status != StatusLeft { + event = EventMemberFailed + eventStr = "EventMemberFailed" + } + + // Update some metrics + metrics.IncrCounter([]string{"serf", "member", member.Status.String()}, 1) + + s.logger.Printf("[INFO] serf: %s: %s %s", + eventStr, member.Member.Name, member.Member.Addr) + if s.config.EventCh != nil { + s.config.EventCh <- MemberEvent{ + Type: event, + Members: []Member{member.Member}, + } + } +} + +// handleNodeUpdate is called when a node meta data update +// has taken place +func (s *Serf) handleNodeUpdate(n *memberlist.Node) { + s.memberLock.Lock() + defer s.memberLock.Unlock() + + member, ok := s.members[n.Name] + if !ok { + // We've never even heard of this node that is updating. + // Just ignore it completely. + return + } + + // Update the member attributes + member.Addr = net.IP(n.Addr) + member.Port = n.Port + member.Tags = s.decodeTags(n.Meta) + + // Snag the latest versions. NOTE - the current memberlist code will NOT + // fire an update event if the metadata (for Serf, tags) stays the same + // and only the protocol versions change. If we wake any Serf-level + // protocol changes where we want to get this event under those + // circumstances, we will need to update memberlist to do a check of + // versions as well as the metadata. + member.ProtocolMin = n.PMin + member.ProtocolMax = n.PMax + member.ProtocolCur = n.PCur + member.DelegateMin = n.DMin + member.DelegateMax = n.DMax + member.DelegateCur = n.DCur + + // Update some metrics + metrics.IncrCounter([]string{"serf", "member", "update"}, 1) + + // Send an event along + s.logger.Printf("[INFO] serf: EventMemberUpdate: %s", member.Member.Name) + if s.config.EventCh != nil { + s.config.EventCh <- MemberEvent{ + Type: EventMemberUpdate, + Members: []Member{member.Member}, + } + } +} + +// handleNodeLeaveIntent is called when an intent to leave is received. +func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool { + // Witness a potentially newer time + s.clock.Witness(leaveMsg.LTime) + + s.memberLock.Lock() + defer s.memberLock.Unlock() + + member, ok := s.members[leaveMsg.Node] + if !ok { + // Rebroadcast only if this was an update we hadn't seen before. + return upsertIntent(s.recentIntents, leaveMsg.Node, messageLeaveType, leaveMsg.LTime, time.Now) + } + + // If the message is old, then it is irrelevant and we can skip it + if leaveMsg.LTime <= member.statusLTime { + return false + } + + // Refute us leaving if we are in the alive state + // Must be done in another goroutine since we have the memberLock + if leaveMsg.Node == s.config.NodeName && s.state == SerfAlive { + s.logger.Printf("[DEBUG] serf: Refuting an older leave intent") + go s.broadcastJoin(s.clock.Time()) + return false + } + + // State transition depends on current state + switch member.Status { + case StatusAlive: + member.Status = StatusLeaving + member.statusLTime = leaveMsg.LTime + return true + case StatusFailed: + member.Status = StatusLeft + member.statusLTime = leaveMsg.LTime + + // Remove from the failed list and add to the left list. We add + // to the left list so that when we do a sync, other nodes will + // remove it from their failed list. + s.failedMembers = removeOldMember(s.failedMembers, member.Name) + s.leftMembers = append(s.leftMembers, member) + + // We must push a message indicating the node has now + // left to allow higher-level applications to handle the + // graceful leave. + s.logger.Printf("[INFO] serf: EventMemberLeave (forced): %s %s", + member.Member.Name, member.Member.Addr) + if s.config.EventCh != nil { + s.config.EventCh <- MemberEvent{ + Type: EventMemberLeave, + Members: []Member{member.Member}, + } + } + return true + default: + return false + } +} + +// handleNodeJoinIntent is called when a node broadcasts a +// join message to set the lamport time of its join +func (s *Serf) handleNodeJoinIntent(joinMsg *messageJoin) bool { + // Witness a potentially newer time + s.clock.Witness(joinMsg.LTime) + + s.memberLock.Lock() + defer s.memberLock.Unlock() + + member, ok := s.members[joinMsg.Node] + if !ok { + // Rebroadcast only if this was an update we hadn't seen before. + return upsertIntent(s.recentIntents, joinMsg.Node, messageJoinType, joinMsg.LTime, time.Now) + } + + // Check if this time is newer than what we have + if joinMsg.LTime <= member.statusLTime { + return false + } + + // Update the LTime + member.statusLTime = joinMsg.LTime + + // If we are in the leaving state, we should go back to alive, + // since the leaving message must have been for an older time + if member.Status == StatusLeaving { + member.Status = StatusAlive + } + return true +} + +// handleUserEvent is called when a user event broadcast is +// received. Returns if the message should be rebroadcast. +func (s *Serf) handleUserEvent(eventMsg *messageUserEvent) bool { + // Witness a potentially newer time + s.eventClock.Witness(eventMsg.LTime) + + s.eventLock.Lock() + defer s.eventLock.Unlock() + + // Ignore if it is before our minimum event time + if eventMsg.LTime < s.eventMinTime { + return false + } + + // Check if this message is too old + curTime := s.eventClock.Time() + if curTime > LamportTime(len(s.eventBuffer)) && + eventMsg.LTime < curTime-LamportTime(len(s.eventBuffer)) { + s.logger.Printf( + "[WARN] serf: received old event %s from time %d (current: %d)", + eventMsg.Name, + eventMsg.LTime, + s.eventClock.Time()) + return false + } + + // Check if we've already seen this + idx := eventMsg.LTime % LamportTime(len(s.eventBuffer)) + seen := s.eventBuffer[idx] + userEvent := userEvent{Name: eventMsg.Name, Payload: eventMsg.Payload} + if seen != nil && seen.LTime == eventMsg.LTime { + for _, previous := range seen.Events { + if previous.Equals(&userEvent) { + return false + } + } + } else { + seen = &userEvents{LTime: eventMsg.LTime} + s.eventBuffer[idx] = seen + } + + // Add to recent events + seen.Events = append(seen.Events, userEvent) + + // Update some metrics + metrics.IncrCounter([]string{"serf", "events"}, 1) + metrics.IncrCounter([]string{"serf", "events", eventMsg.Name}, 1) + + if s.config.EventCh != nil { + s.config.EventCh <- UserEvent{ + LTime: eventMsg.LTime, + Name: eventMsg.Name, + Payload: eventMsg.Payload, + Coalesce: eventMsg.CC, + } + } + return true +} + +// handleQuery is called when a query broadcast is +// received. Returns if the message should be rebroadcast. +func (s *Serf) handleQuery(query *messageQuery) bool { + // Witness a potentially newer time + s.queryClock.Witness(query.LTime) + + s.queryLock.Lock() + defer s.queryLock.Unlock() + + // Ignore if it is before our minimum query time + if query.LTime < s.queryMinTime { + return false + } + + // Check if this message is too old + curTime := s.queryClock.Time() + if curTime > LamportTime(len(s.queryBuffer)) && + query.LTime < curTime-LamportTime(len(s.queryBuffer)) { + s.logger.Printf( + "[WARN] serf: received old query %s from time %d (current: %d)", + query.Name, + query.LTime, + s.queryClock.Time()) + return false + } + + // Check if we've already seen this + idx := query.LTime % LamportTime(len(s.queryBuffer)) + seen := s.queryBuffer[idx] + if seen != nil && seen.LTime == query.LTime { + for _, previous := range seen.QueryIDs { + if previous == query.ID { + // Seen this ID already + return false + } + } + } else { + seen = &queries{LTime: query.LTime} + s.queryBuffer[idx] = seen + } + + // Add to recent queries + seen.QueryIDs = append(seen.QueryIDs, query.ID) + + // Update some metrics + metrics.IncrCounter([]string{"serf", "queries"}, 1) + metrics.IncrCounter([]string{"serf", "queries", query.Name}, 1) + + // Check if we should rebroadcast, this may be disabled by a flag + rebroadcast := true + if query.NoBroadcast() { + rebroadcast = false + } + + // Filter the query + if !s.shouldProcessQuery(query.Filters) { + // Even if we don't process it further, we should rebroadcast, + // since it is the first time we've seen this. + return rebroadcast + } + + // Send ack if requested, without waiting for client to Respond() + if query.Ack() { + ack := messageQueryResponse{ + LTime: query.LTime, + ID: query.ID, + From: s.config.NodeName, + Flags: queryFlagAck, + } + raw, err := encodeMessage(messageQueryResponseType, &ack) + if err != nil { + s.logger.Printf("[ERR] serf: failed to format ack: %v", err) + } else { + addr := net.UDPAddr{IP: query.Addr, Port: int(query.Port)} + if err := s.memberlist.SendTo(&addr, raw); err != nil { + s.logger.Printf("[ERR] serf: failed to send ack: %v", err) + } + if err := s.relayResponse(query.RelayFactor, addr, &ack); err != nil { + s.logger.Printf("[ERR] serf: failed to relay ack: %v", err) + } + } + } + + if s.config.EventCh != nil { + s.config.EventCh <- &Query{ + LTime: query.LTime, + Name: query.Name, + Payload: query.Payload, + serf: s, + id: query.ID, + addr: query.Addr, + port: query.Port, + deadline: time.Now().Add(query.Timeout), + relayFactor: query.RelayFactor, + } + } + return rebroadcast +} + +// handleResponse is called when a query response is +// received. +func (s *Serf) handleQueryResponse(resp *messageQueryResponse) { + // Look for a corresponding QueryResponse + s.queryLock.RLock() + query, ok := s.queryResponse[resp.LTime] + s.queryLock.RUnlock() + if !ok { + s.logger.Printf("[WARN] serf: reply for non-running query (LTime: %d, ID: %d) From: %s", + resp.LTime, resp.ID, resp.From) + return + } + + // Verify the ID matches + if query.id != resp.ID { + s.logger.Printf("[WARN] serf: query reply ID mismatch (Local: %d, Response: %d)", + query.id, resp.ID) + return + } + + // Check if the query is closed + if query.Finished() { + return + } + + // Process each type of response + if resp.Ack() { + // Exit early if this is a duplicate ack + if _, ok := query.acks[resp.From]; ok { + metrics.IncrCounter([]string{"serf", "query_duplicate_acks"}, 1) + return + } + + metrics.IncrCounter([]string{"serf", "query_acks"}, 1) + select { + case query.ackCh <- resp.From: + query.acks[resp.From] = struct{}{} + default: + s.logger.Printf("[WARN] serf: Failed to deliver query ack, dropping") + } + } else { + // Exit early if this is a duplicate response + if _, ok := query.responses[resp.From]; ok { + metrics.IncrCounter([]string{"serf", "query_duplicate_responses"}, 1) + return + } + + metrics.IncrCounter([]string{"serf", "query_responses"}, 1) + select { + case query.respCh <- NodeResponse{From: resp.From, Payload: resp.Payload}: + query.responses[resp.From] = struct{}{} + default: + s.logger.Printf("[WARN] serf: Failed to deliver query response, dropping") + } + } +} + +// handleNodeConflict is invoked when a join detects a conflict over a name. +// This means two different nodes (IP/Port) are claiming the same name. Memberlist +// will reject the "new" node mapping, but we can still be notified +func (s *Serf) handleNodeConflict(existing, other *memberlist.Node) { + // Log a basic warning if the node is not us... + if existing.Name != s.config.NodeName { + s.logger.Printf("[WARN] serf: Name conflict for '%s' both %s:%d and %s:%d are claiming", + existing.Name, existing.Addr, existing.Port, other.Addr, other.Port) + return + } + + // The current node is conflicting! This is an error + s.logger.Printf("[ERR] serf: Node name conflicts with another node at %s:%d. Names must be unique! (Resolution enabled: %v)", + other.Addr, other.Port, s.config.EnableNameConflictResolution) + + // If automatic resolution is enabled, kick off the resolution + if s.config.EnableNameConflictResolution { + go s.resolveNodeConflict() + } +} + +// resolveNodeConflict is used to determine which node should remain during +// a name conflict. This is done by running an internal query. +func (s *Serf) resolveNodeConflict() { + // Get the local node + local := s.memberlist.LocalNode() + + // Start a name resolution query + qName := internalQueryName(conflictQuery) + payload := []byte(s.config.NodeName) + resp, err := s.Query(qName, payload, nil) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to start name resolution query: %v", err) + return + } + + // Counter to determine winner + var responses, matching int + + // Gather responses + respCh := resp.ResponseCh() + for r := range respCh { + // Decode the response + if len(r.Payload) < 1 || messageType(r.Payload[0]) != messageConflictResponseType { + s.logger.Printf("[ERR] serf: Invalid conflict query response type: %v", r.Payload) + continue + } + var member Member + if err := decodeMessage(r.Payload[1:], &member); err != nil { + s.logger.Printf("[ERR] serf: Failed to decode conflict query response: %v", err) + continue + } + + // Update the counters + responses++ + if member.Addr.Equal(local.Addr) && member.Port == local.Port { + matching++ + } + } + + // Query over, determine if we should live + majority := (responses / 2) + 1 + if matching >= majority { + s.logger.Printf("[INFO] serf: majority in name conflict resolution [%d / %d]", + matching, responses) + return + } + + // Since we lost the vote, we need to exit + s.logger.Printf("[WARN] serf: minority in name conflict resolution, quiting [%d / %d]", + matching, responses) + if err := s.Shutdown(); err != nil { + s.logger.Printf("[ERR] serf: Failed to shutdown: %v", err) + } +} + +// handleReap periodically reaps the list of failed and left members, as well +// as old buffered intents. +func (s *Serf) handleReap() { + for { + select { + case <-time.After(s.config.ReapInterval): + s.memberLock.Lock() + now := time.Now() + s.failedMembers = s.reap(s.failedMembers, now, s.config.ReconnectTimeout) + s.leftMembers = s.reap(s.leftMembers, now, s.config.TombstoneTimeout) + reapIntents(s.recentIntents, now, s.config.RecentIntentTimeout) + s.memberLock.Unlock() + case <-s.shutdownCh: + return + } + } +} + +// handleReconnect attempts to reconnect to recently failed nodes +// on configured intervals. +func (s *Serf) handleReconnect() { + for { + select { + case <-time.After(s.config.ReconnectInterval): + s.reconnect() + case <-s.shutdownCh: + return + } + } +} + +// reap is called with a list of old members and a timeout, and removes +// members that have exceeded the timeout. The members are removed from +// both the old list and the members itself. Locking is left to the caller. +func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) []*memberState { + n := len(old) + for i := 0; i < n; i++ { + m := old[i] + + // Skip if the timeout is not yet reached + if now.Sub(m.leaveTime) <= timeout { + continue + } + + // Delete from the list + old[i], old[n-1] = old[n-1], nil + old = old[:n-1] + n-- + i-- + + // Delete from members + delete(s.members, m.Name) + + // Tell the coordinate client the node has gone away and delete + // its cached coordinates. + if !s.config.DisableCoordinates { + s.coordClient.ForgetNode(m.Name) + + s.coordCacheLock.Lock() + delete(s.coordCache, m.Name) + s.coordCacheLock.Unlock() + } + + // Send an event along + s.logger.Printf("[INFO] serf: EventMemberReap: %s", m.Name) + if s.config.EventCh != nil { + s.config.EventCh <- MemberEvent{ + Type: EventMemberReap, + Members: []Member{m.Member}, + } + } + } + + return old +} + +// reconnect attempts to reconnect to recently fail nodes. +func (s *Serf) reconnect() { + s.memberLock.RLock() + + // Nothing to do if there are no failed members + n := len(s.failedMembers) + if n == 0 { + s.memberLock.RUnlock() + return + } + + // Probability we should attempt to reconect is given + // by num failed / (num members - num failed - num left) + // This means that we probabilistically expect the cluster + // to attempt to connect to each failed member once per + // reconnect interval + numFailed := float32(len(s.failedMembers)) + numAlive := float32(len(s.members) - len(s.failedMembers) - len(s.leftMembers)) + if numAlive == 0 { + numAlive = 1 // guard against zero divide + } + prob := numFailed / numAlive + if rand.Float32() > prob { + s.memberLock.RUnlock() + s.logger.Printf("[DEBUG] serf: forgoing reconnect for random throttling") + return + } + + // Select a random member to try and join + idx := rand.Int31n(int32(n)) + mem := s.failedMembers[idx] + s.memberLock.RUnlock() + + // Format the addr + addr := net.UDPAddr{IP: mem.Addr, Port: int(mem.Port)} + s.logger.Printf("[INFO] serf: attempting reconnect to %v %s", mem.Name, addr.String()) + + // Attempt to join at the memberlist level + s.memberlist.Join([]string{addr.String()}) +} + +// checkQueueDepth periodically checks the size of a queue to see if +// it is too large +func (s *Serf) checkQueueDepth(name string, queue *memberlist.TransmitLimitedQueue) { + for { + select { + case <-time.After(time.Second): + numq := queue.NumQueued() + metrics.AddSample([]string{"serf", "queue", name}, float32(numq)) + if numq >= s.config.QueueDepthWarning { + s.logger.Printf("[WARN] serf: %s queue depth: %d", name, numq) + } + if numq > s.config.MaxQueueDepth { + s.logger.Printf("[WARN] serf: %s queue depth (%d) exceeds limit (%d), dropping messages!", + name, numq, s.config.MaxQueueDepth) + queue.Prune(s.config.MaxQueueDepth) + } + case <-s.shutdownCh: + return + } + } +} + +// removeOldMember is used to remove an old member from a list of old +// members. +func removeOldMember(old []*memberState, name string) []*memberState { + for i, m := range old { + if m.Name == name { + n := len(old) + old[i], old[n-1] = old[n-1], nil + return old[:n-1] + } + } + + return old +} + +// reapIntents clears out any intents that are older than the timeout. Make sure +// the memberLock is held when passing in the Serf instance's recentIntents +// member. +func reapIntents(intents map[string]nodeIntent, now time.Time, timeout time.Duration) { + for node, intent := range intents { + if now.Sub(intent.WallTime) > timeout { + delete(intents, node) + } + } +} + +// upsertIntent will update an existing intent with the supplied Lamport time, +// or create a new entry. This will return true if a new entry was added. The +// stamper is used to capture the wall clock time for expiring these buffered +// intents. Make sure the memberLock is held when passing in the Serf instance's +// recentIntents member. +func upsertIntent(intents map[string]nodeIntent, node string, itype messageType, + ltime LamportTime, stamper func() time.Time) bool { + if intent, ok := intents[node]; !ok || ltime > intent.LTime { + intents[node] = nodeIntent{ + Type: itype, + WallTime: stamper(), + LTime: ltime, + } + return true + } + + return false +} + +// recentIntent checks the recent intent buffer for a matching entry for a given +// node, and returns the Lamport time, if an intent is present, indicated by the +// returned boolean. Make sure the memberLock is held for read when passing in +// the Serf instance's recentIntents member. +func recentIntent(intents map[string]nodeIntent, node string, itype messageType) (LamportTime, bool) { + if intent, ok := intents[node]; ok && intent.Type == itype { + return intent.LTime, true + } + + return LamportTime(0), false +} + +// handleRejoin attempts to reconnect to previously known alive nodes +func (s *Serf) handleRejoin(previous []*PreviousNode) { + for _, prev := range previous { + // Do not attempt to join ourself + if prev.Name == s.config.NodeName { + continue + } + + s.logger.Printf("[INFO] serf: Attempting re-join to previously known node: %s", prev) + _, err := s.memberlist.Join([]string{prev.Addr}) + if err == nil { + s.logger.Printf("[INFO] serf: Re-joined to previously known node: %s", prev) + return + } + } + s.logger.Printf("[WARN] serf: Failed to re-join any previously known node") +} + +// encodeTags is used to encode a tag map +func (s *Serf) encodeTags(tags map[string]string) []byte { + // Support role-only backwards compatibility + if s.ProtocolVersion() < 3 { + role := tags["role"] + return []byte(role) + } + + // Use a magic byte prefix and msgpack encode the tags + var buf bytes.Buffer + buf.WriteByte(tagMagicByte) + enc := codec.NewEncoder(&buf, &codec.MsgpackHandle{}) + if err := enc.Encode(tags); err != nil { + panic(fmt.Sprintf("Failed to encode tags: %v", err)) + } + return buf.Bytes() +} + +// decodeTags is used to decode a tag map +func (s *Serf) decodeTags(buf []byte) map[string]string { + tags := make(map[string]string) + + // Backwards compatibility mode + if len(buf) == 0 || buf[0] != tagMagicByte { + tags["role"] = string(buf) + return tags + } + + // Decode the tags + r := bytes.NewReader(buf[1:]) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + if err := dec.Decode(&tags); err != nil { + s.logger.Printf("[ERR] serf: Failed to decode tags: %v", err) + } + return tags +} + +// Stats is used to provide operator debugging information +func (s *Serf) Stats() map[string]string { + toString := func(v uint64) string { + return strconv.FormatUint(v, 10) + } + stats := map[string]string{ + "members": toString(uint64(len(s.members))), + "failed": toString(uint64(len(s.failedMembers))), + "left": toString(uint64(len(s.leftMembers))), + "health_score": toString(uint64(s.memberlist.GetHealthScore())), + "member_time": toString(uint64(s.clock.Time())), + "event_time": toString(uint64(s.eventClock.Time())), + "query_time": toString(uint64(s.queryClock.Time())), + "intent_queue": toString(uint64(s.broadcasts.NumQueued())), + "event_queue": toString(uint64(s.eventBroadcasts.NumQueued())), + "query_queue": toString(uint64(s.queryBroadcasts.NumQueued())), + "encrypted": fmt.Sprintf("%v", s.EncryptionEnabled()), + } + return stats +} + +// WriteKeyringFile will serialize the current keyring and save it to a file. +func (s *Serf) writeKeyringFile() error { + if len(s.config.KeyringFile) == 0 { + return nil + } + + keyring := s.config.MemberlistConfig.Keyring + keysRaw := keyring.GetKeys() + keysEncoded := make([]string, len(keysRaw)) + + for i, key := range keysRaw { + keysEncoded[i] = base64.StdEncoding.EncodeToString(key) + } + + encodedKeys, err := json.MarshalIndent(keysEncoded, "", " ") + if err != nil { + return fmt.Errorf("Failed to encode keys: %s", err) + } + + // Use 0600 for permissions because key data is sensitive + if err = ioutil.WriteFile(s.config.KeyringFile, encodedKeys, 0600); err != nil { + return fmt.Errorf("Failed to write keyring file: %s", err) + } + + // Success! + return nil +} + +// GetCoordinate returns the network coordinate of the local node. +func (s *Serf) GetCoordinate() (*coordinate.Coordinate, error) { + if !s.config.DisableCoordinates { + return s.coordClient.GetCoordinate(), nil + } + + return nil, fmt.Errorf("Coordinates are disabled") +} + +// GetCachedCoordinate returns the network coordinate for the node with the given +// name. This will only be valid if DisableCoordinates is set to false. +func (s *Serf) GetCachedCoordinate(name string) (coord *coordinate.Coordinate, ok bool) { + if !s.config.DisableCoordinates { + s.coordCacheLock.RLock() + defer s.coordCacheLock.RUnlock() + if coord, ok = s.coordCache[name]; ok { + return coord, true + } + + return nil, false + } + + return nil, false +} + +// NumNodes returns the number of nodes in the serf cluster, regardless of +// their health or status. +func (s *Serf) NumNodes() (numNodes int) { + s.memberLock.RLock() + numNodes = len(s.members) + s.memberLock.RUnlock() + + return numNodes +} diff --git a/vendor/github.com/hashicorp/serf/serf/snapshot.go b/vendor/github.com/hashicorp/serf/serf/snapshot.go new file mode 100644 index 000000000..6e1fbd596 --- /dev/null +++ b/vendor/github.com/hashicorp/serf/serf/snapshot.go @@ -0,0 +1,560 @@ +package serf + +import ( + "bufio" + "encoding/json" + "fmt" + "log" + "math/rand" + "net" + "os" + "strconv" + "strings" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/serf/coordinate" +) + +/* +Serf supports using a "snapshot" file that contains various +transactional data that is used to help Serf recover quickly +and gracefully from a failure. We append member events, as well +as the latest clock values to the file during normal operation, +and periodically checkpoint and roll over the file. During a restore, +we can replay the various member events to recall a list of known +nodes to re-join, as well as restore our clock values to avoid replaying +old events. +*/ + +const flushInterval = 500 * time.Millisecond +const clockUpdateInterval = 500 * time.Millisecond +const coordinateUpdateInterval = 60 * time.Second +const tmpExt = ".compact" + +// Snapshotter is responsible for ingesting events and persisting +// them to disk, and providing a recovery mechanism at start time. +type Snapshotter struct { + aliveNodes map[string]string + clock *LamportClock + coordClient *coordinate.Client + fh *os.File + buffered *bufio.Writer + inCh <-chan Event + lastFlush time.Time + lastClock LamportTime + lastEventClock LamportTime + lastQueryClock LamportTime + leaveCh chan struct{} + leaving bool + logger *log.Logger + maxSize int64 + path string + offset int64 + outCh chan<- Event + rejoinAfterLeave bool + shutdownCh <-chan struct{} + waitCh chan struct{} +} + +// PreviousNode is used to represent the previously known alive nodes +type PreviousNode struct { + Name string + Addr string +} + +func (p PreviousNode) String() string { + return fmt.Sprintf("%s: %s", p.Name, p.Addr) +} + +// NewSnapshotter creates a new Snapshotter that records events up to a +// max byte size before rotating the file. It can also be used to +// recover old state. Snapshotter works by reading an event channel it returns, +// passing through to an output channel, and persisting relevant events to disk. +// Setting rejoinAfterLeave makes leave not clear the state, and can be used +// if you intend to rejoin the same cluster after a leave. +func NewSnapshotter(path string, + maxSize int, + rejoinAfterLeave bool, + logger *log.Logger, + clock *LamportClock, + coordClient *coordinate.Client, + outCh chan<- Event, + shutdownCh <-chan struct{}) (chan<- Event, *Snapshotter, error) { + inCh := make(chan Event, 1024) + + // Try to open the file + fh, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0644) + if err != nil { + return nil, nil, fmt.Errorf("failed to open snapshot: %v", err) + } + + // Determine the offset + info, err := fh.Stat() + if err != nil { + fh.Close() + return nil, nil, fmt.Errorf("failed to stat snapshot: %v", err) + } + offset := info.Size() + + // Create the snapshotter + snap := &Snapshotter{ + aliveNodes: make(map[string]string), + clock: clock, + coordClient: coordClient, + fh: fh, + buffered: bufio.NewWriter(fh), + inCh: inCh, + lastClock: 0, + lastEventClock: 0, + lastQueryClock: 0, + leaveCh: make(chan struct{}), + logger: logger, + maxSize: int64(maxSize), + path: path, + offset: offset, + outCh: outCh, + rejoinAfterLeave: rejoinAfterLeave, + shutdownCh: shutdownCh, + waitCh: make(chan struct{}), + } + + // Recover the last known state + if err := snap.replay(); err != nil { + fh.Close() + return nil, nil, err + } + + // Start handling new commands + go snap.stream() + return inCh, snap, nil +} + +// LastClock returns the last known clock time +func (s *Snapshotter) LastClock() LamportTime { + return s.lastClock +} + +// LastEventClock returns the last known event clock time +func (s *Snapshotter) LastEventClock() LamportTime { + return s.lastEventClock +} + +// LastQueryClock returns the last known query clock time +func (s *Snapshotter) LastQueryClock() LamportTime { + return s.lastQueryClock +} + +// AliveNodes returns the last known alive nodes +func (s *Snapshotter) AliveNodes() []*PreviousNode { + // Copy the previously known + previous := make([]*PreviousNode, 0, len(s.aliveNodes)) + for name, addr := range s.aliveNodes { + previous = append(previous, &PreviousNode{name, addr}) + } + + // Randomize the order, prevents hot shards + for i := range previous { + j := rand.Intn(i + 1) + previous[i], previous[j] = previous[j], previous[i] + } + return previous +} + +// Wait is used to wait until the snapshotter finishes shut down +func (s *Snapshotter) Wait() { + <-s.waitCh +} + +// Leave is used to remove known nodes to prevent a restart from +// causing a join. Otherwise nodes will re-join after leaving! +func (s *Snapshotter) Leave() { + select { + case s.leaveCh <- struct{}{}: + case <-s.shutdownCh: + } +} + +// stream is a long running routine that is used to handle events +func (s *Snapshotter) stream() { + clockTicker := time.NewTicker(clockUpdateInterval) + defer clockTicker.Stop() + + coordinateTicker := time.NewTicker(coordinateUpdateInterval) + defer coordinateTicker.Stop() + + for { + select { + case <-s.leaveCh: + s.leaving = true + + // If we plan to re-join, keep our state + if !s.rejoinAfterLeave { + s.aliveNodes = make(map[string]string) + } + s.tryAppend("leave\n") + if err := s.buffered.Flush(); err != nil { + s.logger.Printf("[ERR] serf: failed to flush leave to snapshot: %v", err) + } + if err := s.fh.Sync(); err != nil { + s.logger.Printf("[ERR] serf: failed to sync leave to snapshot: %v", err) + } + + case e := <-s.inCh: + // Forward the event immediately + if s.outCh != nil { + s.outCh <- e + } + + // Stop recording events after a leave is issued + if s.leaving { + continue + } + switch typed := e.(type) { + case MemberEvent: + s.processMemberEvent(typed) + case UserEvent: + s.processUserEvent(typed) + case *Query: + s.processQuery(typed) + default: + s.logger.Printf("[ERR] serf: Unknown event to snapshot: %#v", e) + } + + case <-clockTicker.C: + s.updateClock() + + case <-coordinateTicker.C: + s.updateCoordinate() + + case <-s.shutdownCh: + if err := s.buffered.Flush(); err != nil { + s.logger.Printf("[ERR] serf: failed to flush snapshot: %v", err) + } + if err := s.fh.Sync(); err != nil { + s.logger.Printf("[ERR] serf: failed to sync snapshot: %v", err) + } + s.fh.Close() + close(s.waitCh) + return + } + } +} + +// processMemberEvent is used to handle a single member event +func (s *Snapshotter) processMemberEvent(e MemberEvent) { + switch e.Type { + case EventMemberJoin: + for _, mem := range e.Members { + addr := net.TCPAddr{IP: mem.Addr, Port: int(mem.Port)} + s.aliveNodes[mem.Name] = addr.String() + s.tryAppend(fmt.Sprintf("alive: %s %s\n", mem.Name, addr.String())) + } + + case EventMemberLeave: + fallthrough + case EventMemberFailed: + for _, mem := range e.Members { + delete(s.aliveNodes, mem.Name) + s.tryAppend(fmt.Sprintf("not-alive: %s\n", mem.Name)) + } + } + s.updateClock() +} + +// updateClock is called periodically to check if we should udpate our +// clock value. This is done after member events but should also be done +// periodically due to race conditions with join and leave intents +func (s *Snapshotter) updateClock() { + lastSeen := s.clock.Time() - 1 + if lastSeen > s.lastClock { + s.lastClock = lastSeen + s.tryAppend(fmt.Sprintf("clock: %d\n", s.lastClock)) + } +} + +// updateCoordinate is called periodically to write out the current local +// coordinate. It's safe to call this if coordinates aren't enabled (nil +// client) and it will be a no-op. +func (s *Snapshotter) updateCoordinate() { + if s.coordClient != nil { + encoded, err := json.Marshal(s.coordClient.GetCoordinate()) + if err != nil { + s.logger.Printf("[ERR] serf: Failed to encode coordinate: %v", err) + } else { + s.tryAppend(fmt.Sprintf("coordinate: %s\n", encoded)) + } + } +} + +// processUserEvent is used to handle a single user event +func (s *Snapshotter) processUserEvent(e UserEvent) { + // Ignore old clocks + if e.LTime <= s.lastEventClock { + return + } + s.lastEventClock = e.LTime + s.tryAppend(fmt.Sprintf("event-clock: %d\n", e.LTime)) +} + +// processQuery is used to handle a single query event +func (s *Snapshotter) processQuery(q *Query) { + // Ignore old clocks + if q.LTime <= s.lastQueryClock { + return + } + s.lastQueryClock = q.LTime + s.tryAppend(fmt.Sprintf("query-clock: %d\n", q.LTime)) +} + +// tryAppend will invoke append line but will not return an error +func (s *Snapshotter) tryAppend(l string) { + if err := s.appendLine(l); err != nil { + s.logger.Printf("[ERR] serf: Failed to update snapshot: %v", err) + } +} + +// appendLine is used to append a line to the existing log +func (s *Snapshotter) appendLine(l string) error { + defer metrics.MeasureSince([]string{"serf", "snapshot", "appendLine"}, time.Now()) + + n, err := s.buffered.WriteString(l) + if err != nil { + return err + } + + // Check if we should flush + now := time.Now() + if now.Sub(s.lastFlush) > flushInterval { + s.lastFlush = now + if err := s.buffered.Flush(); err != nil { + return err + } + } + + // Check if a compaction is necessary + s.offset += int64(n) + if s.offset > s.maxSize { + return s.compact() + } + return nil +} + +// Compact is used to compact the snapshot once it is too large +func (s *Snapshotter) compact() error { + defer metrics.MeasureSince([]string{"serf", "snapshot", "compact"}, time.Now()) + + // Try to open the file to new fiel + newPath := s.path + tmpExt + fh, err := os.OpenFile(newPath, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0755) + if err != nil { + return fmt.Errorf("failed to open new snapshot: %v", err) + } + + // Create a buffered writer + buf := bufio.NewWriter(fh) + + // Write out the live nodes + var offset int64 + for name, addr := range s.aliveNodes { + line := fmt.Sprintf("alive: %s %s\n", name, addr) + n, err := buf.WriteString(line) + if err != nil { + fh.Close() + return err + } + offset += int64(n) + } + + // Write out the clocks + line := fmt.Sprintf("clock: %d\n", s.lastClock) + n, err := buf.WriteString(line) + if err != nil { + fh.Close() + return err + } + offset += int64(n) + + line = fmt.Sprintf("event-clock: %d\n", s.lastEventClock) + n, err = buf.WriteString(line) + if err != nil { + fh.Close() + return err + } + offset += int64(n) + + line = fmt.Sprintf("query-clock: %d\n", s.lastQueryClock) + n, err = buf.WriteString(line) + if err != nil { + fh.Close() + return err + } + offset += int64(n) + + // Write out the coordinate. + if s.coordClient != nil { + encoded, err := json.Marshal(s.coordClient.GetCoordinate()) + if err != nil { + fh.Close() + return err + } + + line = fmt.Sprintf("coordinate: %s\n", encoded) + n, err = buf.WriteString(line) + if err != nil { + fh.Close() + return err + } + offset += int64(n) + } + + // Flush the new snapshot + err = buf.Flush() + fh.Close() + if err != nil { + return fmt.Errorf("failed to flush new snapshot: %v", err) + } + + // We now need to swap the old snapshot file with the new snapshot. + // Turns out, Windows won't let us rename the files if we have + // open handles to them or if the destination already exists. This + // means we are forced to close the existing handles, delete the + // old file, move the new one in place, and then re-open the file + // handles. + + // Flush the existing snapshot, ignoring errors since we will + // delete it momentarily. + s.buffered.Flush() + s.buffered = nil + + // Close the file handle to the old snapshot + s.fh.Close() + s.fh = nil + + // Delete the old file + if err := os.Remove(s.path); err != nil { + return fmt.Errorf("failed to remove old snapshot: %v", err) + } + + // Move the new file into place + if err := os.Rename(newPath, s.path); err != nil { + return fmt.Errorf("failed to install new snapshot: %v", err) + } + + // Open the new snapshot + fh, err = os.OpenFile(s.path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755) + if err != nil { + return fmt.Errorf("failed to open snapshot: %v", err) + } + buf = bufio.NewWriter(fh) + + // Rotate our handles + s.fh = fh + s.buffered = buf + s.offset = offset + s.lastFlush = time.Now() + return nil +} + +// replay is used to seek to reset our internal state by replaying +// the snapshot file. It is used at initialization time to read old +// state +func (s *Snapshotter) replay() error { + // Seek to the beginning + if _, err := s.fh.Seek(0, os.SEEK_SET); err != nil { + return err + } + + // Read each line + reader := bufio.NewReader(s.fh) + for { + line, err := reader.ReadString('\n') + if err != nil { + break + } + + // Skip the newline + line = line[:len(line)-1] + + // Switch on the prefix + if strings.HasPrefix(line, "alive: ") { + info := strings.TrimPrefix(line, "alive: ") + addrIdx := strings.LastIndex(info, " ") + if addrIdx == -1 { + s.logger.Printf("[WARN] serf: Failed to parse address: %v", line) + continue + } + addr := info[addrIdx+1:] + name := info[:addrIdx] + s.aliveNodes[name] = addr + + } else if strings.HasPrefix(line, "not-alive: ") { + name := strings.TrimPrefix(line, "not-alive: ") + delete(s.aliveNodes, name) + + } else if strings.HasPrefix(line, "clock: ") { + timeStr := strings.TrimPrefix(line, "clock: ") + timeInt, err := strconv.ParseUint(timeStr, 10, 64) + if err != nil { + s.logger.Printf("[WARN] serf: Failed to convert clock time: %v", err) + continue + } + s.lastClock = LamportTime(timeInt) + + } else if strings.HasPrefix(line, "event-clock: ") { + timeStr := strings.TrimPrefix(line, "event-clock: ") + timeInt, err := strconv.ParseUint(timeStr, 10, 64) + if err != nil { + s.logger.Printf("[WARN] serf: Failed to convert event clock time: %v", err) + continue + } + s.lastEventClock = LamportTime(timeInt) + + } else if strings.HasPrefix(line, "query-clock: ") { + timeStr := strings.TrimPrefix(line, "query-clock: ") + timeInt, err := strconv.ParseUint(timeStr, 10, 64) + if err != nil { + s.logger.Printf("[WARN] serf: Failed to convert query clock time: %v", err) + continue + } + s.lastQueryClock = LamportTime(timeInt) + + } else if strings.HasPrefix(line, "coordinate: ") { + if s.coordClient == nil { + s.logger.Printf("[WARN] serf: Ignoring snapshot coordinates since they are disabled") + continue + } + + coordStr := strings.TrimPrefix(line, "coordinate: ") + var coord coordinate.Coordinate + err := json.Unmarshal([]byte(coordStr), &coord) + if err != nil { + s.logger.Printf("[WARN] serf: Failed to decode coordinate: %v", err) + continue + } + s.coordClient.SetCoordinate(&coord) + } else if line == "leave" { + // Ignore a leave if we plan on re-joining + if s.rejoinAfterLeave { + s.logger.Printf("[INFO] serf: Ignoring previous leave in snapshot") + continue + } + s.aliveNodes = make(map[string]string) + s.lastClock = 0 + s.lastEventClock = 0 + s.lastQueryClock = 0 + + } else if strings.HasPrefix(line, "#") { + // Skip comment lines + + } else { + s.logger.Printf("[WARN] serf: Unrecognized snapshot line: %v", line) + } + } + + // Seek to the end + if _, err := s.fh.Seek(0, os.SEEK_END); err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/sean-/seed/LICENSE b/vendor/github.com/sean-/seed/LICENSE new file mode 100644 index 000000000..33d326a37 --- /dev/null +++ b/vendor/github.com/sean-/seed/LICENSE @@ -0,0 +1,54 @@ +MIT License + +Copyright (c) 2017 Sean Chittenden +Copyright (c) 2016 Alex Dadgar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +===== + +Bits of Go-lang's `once.Do()` were cribbed and reused here, too. + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/sean-/seed/README.md b/vendor/github.com/sean-/seed/README.md new file mode 100644 index 000000000..0137564f0 --- /dev/null +++ b/vendor/github.com/sean-/seed/README.md @@ -0,0 +1,44 @@ +# `seed` - Quickly Seed Go's Random Number Generator + +Boiler-plate to securely [seed](https://en.wikipedia.org/wiki/Random_seed) Go's +random number generator (if possible). This library isn't anything fancy, it's +just a canonical way of seeding Go's random number generator. Cribbed from +[`Nomad`](https://github.com/hashicorp/nomad/commit/f89a993ec6b91636a3384dd568898245fbc273a1) +before it was moved into +[`Consul`](https://github.com/hashicorp/consul/commit/d695bcaae6e31ee307c11fdf55bb0bf46ea9fcf4) +and made into a helper function, and now further modularized to be a super +lightweight and reusable library. + +Time is better than +[Go's default seed of `1`](https://golang.org/pkg/math/rand/#Seed), but friends +don't let friends use time as a seed to a random number generator. Use +`seed.MustInit()` instead. + +`seed.Init()` is an idempotent and reentrant call that will return an error if +it can't seed the value the first time it is called. `Init()` is reentrant. + +`seed.MustInit()` is idempotent and reentrant call that will `panic()` if it +can't seed the value the first time it is called. `MustInit()` is reentrant. + +## Usage + +``` +package mypackage + +import ( + "github.com/sean-/seed" +) + +// MustInit will panic() if it is unable to set a high-entropy random seed: +func init() { + seed.MustInit() +} + +// Or if you want to not panic() and can actually handle this error: +func init() { + if secure, err := !seed.Init(); !secure { + // Handle the error + //panic(fmt.Sprintf("Unable to securely seed Go's RNG: %v", err)) + } +} +``` diff --git a/vendor/github.com/sean-/seed/init.go b/vendor/github.com/sean-/seed/init.go new file mode 100644 index 000000000..248d6b636 --- /dev/null +++ b/vendor/github.com/sean-/seed/init.go @@ -0,0 +1,84 @@ +package seed + +import ( + crand "crypto/rand" + "fmt" + "math" + "math/big" + "math/rand" + "sync" + "sync/atomic" + "time" +) + +var ( + m sync.Mutex + secure int32 + seeded int32 +) + +func cryptoSeed() error { + defer atomic.StoreInt32(&seeded, 1) + + var err error + var n *big.Int + n, err = crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + rand.Seed(time.Now().UTC().UnixNano()) + return err + } + rand.Seed(n.Int64()) + atomic.StoreInt32(&secure, 1) + return nil +} + +// Init provides best-effort seeding (which is better than running with Go's +// default seed of 1). If `/dev/urandom` is available, Init() will seed Go's +// runtime with entropy from `/dev/urandom` and return true because the runtime +// was securely seeded. If Init() has already initialized the random number or +// it had failed to securely initialize the random number generation, Init() +// will return false. See MustInit(). +func Init() (seededSecurely bool, err error) { + if atomic.LoadInt32(&seeded) == 1 { + return false, nil + } + + // Slow-path + m.Lock() + defer m.Unlock() + + if err := cryptoSeed(); err != nil { + return false, err + } + + return true, nil +} + +// MustInit provides guaranteed secure seeding. If `/dev/urandom` is not +// available, MustInit will panic() with an error indicating why reading from +// `/dev/urandom` failed. MustInit() will upgrade the seed if for some reason a +// call to Init() failed in the past. +func MustInit() { + if atomic.LoadInt32(&secure) == 1 { + return + } + + // Slow-path + m.Lock() + defer m.Unlock() + + if err := cryptoSeed(); err != nil { + panic(fmt.Sprintf("Unable to seed the random number generator: %v", err)) + } +} + +// Secure returns true if a cryptographically secure seed was used to +// initialize rand. +func Secure() bool { + return atomic.LoadInt32(&secure) == 1 +} + +// Seeded returns true if Init has seeded the random number generator. +func Seeded() bool { + return atomic.LoadInt32(&seeded) == 1 +} diff --git a/vendor/vendor.json b/vendor/vendor.json index f6fec3d43..03ba85174 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -467,6 +467,12 @@ "revision": "bbbad097214e2918d8543d5201d12bfd7bca254d", "revisionTime": "2015-08-27T00:49:46Z" }, + { + "checksumSHA1": "YfhpW3cu1CHWX7lUCRparOJ6Vy4=", + "path": "github.com/armon/go-metrics", + "revision": "93f237eba9b0602f3e73710416558854a81d9337", + "revisionTime": "2017-01-14T13:47:37Z" + }, { "checksumSHA1": "gNO0JNpLzYOdInGeq7HqMZUzx9M=", "path": "github.com/armon/go-radix", @@ -1839,6 +1845,12 @@ "revision": "1792bd8de119ba49b17fd8d3c3c1f488ec613e62", "revisionTime": "2016-11-07T20:49:10Z" }, + { + "checksumSHA1": "jfELEMRhiTcppZmRH+ZwtkVS5Uw=", + "path": "github.com/hashicorp/consul/acl", + "revision": "144a5e5340893a5e726e831c648f26dc19fef1e7", + "revisionTime": "2017-03-10T23:35:18Z" + }, { "checksumSHA1": "ygEjA1d52B1RDmZu8+1WTwkrYDQ=", "comment": "v0.6.3-28-g3215b87", @@ -1846,6 +1858,24 @@ "revision": "48d7b069ad443a48ffa93364048ff8909b5d1fa2", "revisionTime": "2017-02-07T15:38:46Z" }, + { + "checksumSHA1": "nomqbPd9j3XelMMcv7+vTEPsdr4=", + "path": "github.com/hashicorp/consul/consul/structs", + "revision": "48d7b069ad443a48ffa93364048ff8909b5d1fa2", + "revisionTime": "2017-02-07T15:38:46Z" + }, + { + "checksumSHA1": "dgYoWTG7nIL9CUBuktDvMZqYDR8=", + "path": "github.com/hashicorp/consul/testutil", + "revision": "48d7b069ad443a48ffa93364048ff8909b5d1fa2", + "revisionTime": "2017-02-07T15:38:46Z" + }, + { + "checksumSHA1": "ZPDLNuKJGZJFV9HlJ/V0O4/c/Ko=", + "path": "github.com/hashicorp/consul/types", + "revision": "48d7b069ad443a48ffa93364048ff8909b5d1fa2", + "revisionTime": "2017-02-07T15:38:46Z" + }, { "checksumSHA1": "cdOCt0Yb+hdErz8NAQqayxPmRsY=", "path": "github.com/hashicorp/errwrap", @@ -1903,6 +1933,12 @@ "revision": "6bb64b370b90e7ef1fa532be9e591a81c3493e00", "revisionTime": "2016-05-03T14:34:40Z" }, + { + "checksumSHA1": "BGODc7juQbdG3vNXHZG07kt+lKI=", + "path": "github.com/hashicorp/go-sockaddr", + "revision": "f910dd83c2052566cad78352c33af714358d1372", + "revisionTime": "2017-02-08T07:30:35Z" + }, { "checksumSHA1": "85XUnluYJL7F55ptcwdmN8eSOsk=", "path": "github.com/hashicorp/go-uuid", @@ -1914,6 +1950,18 @@ "revision": "e96d3840402619007766590ecea8dd7af1292276", "revisionTime": "2016-10-31T18:26:05Z" }, + { + "checksumSHA1": "d9PxF1XQGLMJZRct2R8qVM/eYlE=", + "path": "github.com/hashicorp/golang-lru", + "revision": "0a025b7e63adc15a622f29b0b2c4c3848243bbf6", + "revisionTime": "2016-08-13T22:13:03Z" + }, + { + "checksumSHA1": "9hffs0bAIU6CquiRhKQdzjHnKt0=", + "path": "github.com/hashicorp/golang-lru/simplelru", + "revision": "0a025b7e63adc15a622f29b0b2c4c3848243bbf6", + "revisionTime": "2016-08-13T22:13:03Z" + }, { "checksumSHA1": "Ok3Csn6Voou7pQT6Dv2mkwpqFtw=", "path": "github.com/hashicorp/hcl", @@ -2010,12 +2058,30 @@ "revision": "0dc08b1671f34c4250ce212759ebd880f743d883", "revisionTime": "2015-06-09T07:04:31Z" }, + { + "checksumSHA1": "1zk7IeGClUqBo+Phsx89p7fQ/rQ=", + "path": "github.com/hashicorp/memberlist", + "revision": "23ad4b7d7b38496cd64c241dfd4c60b7794c254a", + "revisionTime": "2017-02-08T21:15:06Z" + }, + { + "checksumSHA1": "wpirHJV/6VEbbD+HyAP2/6Xc0ek=", + "path": "github.com/hashicorp/raft", + "revision": "aaad9f10266e089bd401e7a6487651a69275641b", + "revisionTime": "2016-11-10T00:52:40Z" + }, { "checksumSHA1": "o8In5byYGDCY/mnTuV4Tfmci+3w=", "comment": "v0.7.0-12-ge4ec8cc", "path": "github.com/hashicorp/serf/coordinate", "revision": "e4ec8cc423bbe20d26584b96efbeb9102e16d05f" }, + { + "checksumSHA1": "/0bDLkmtbWMm06hjM7HnHaw0QBo=", + "path": "github.com/hashicorp/serf/serf", + "revision": "19f2c401e122352c047a84d6584dd51e2fb8fcc4", + "revisionTime": "2017-03-08T19:39:51Z" + }, { "checksumSHA1": "2fkVZIzvxIGBLhSiVnkTgGiqpQ4=", "path": "github.com/hashicorp/vault/api", @@ -2724,6 +2790,12 @@ "revision": "d10489e5d217ebe9c23470c4d0ba7081a6d1e799", "revisionTime": "2016-12-25T12:04:19Z" }, + { + "checksumSHA1": "tnMZLo/kR9Kqx6GtmWwowtTLlA8=", + "path": "github.com/sean-/seed", + "revision": "e2103e2c35297fb7e17febb81e49b312087a2372", + "revisionTime": "2017-03-13T16:33:22Z" + }, { "checksumSHA1": "ySSmShoczI/i/5PzurH8Uhi/dbA=", "path": "github.com/sethvargo/go-fastly", From 86f711f6fc26659a47182d61806763d75aa39022 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Tue, 14 Mar 2017 14:41:16 -0400 Subject: [PATCH 7/7] Make consul backend tests opt-in This way we don't require contributers to have consul installed to run make test. --- .travis.yml | 2 +- backend/remote-state/consul/backend_test.go | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c8bcec473..9a88a87a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ go: - 1.8 env: - - CONSUL_VERSION=0.7.5 + - CONSUL_VERSION=0.7.5 TF_CONSUL_TEST=1 # Fetch consul for the backend and provider tests before_install: diff --git a/backend/remote-state/consul/backend_test.go b/backend/remote-state/consul/backend_test.go index 819e790d7..6a8566e0c 100644 --- a/backend/remote-state/consul/backend_test.go +++ b/backend/remote-state/consul/backend_test.go @@ -3,6 +3,7 @@ package consul import ( "fmt" "io/ioutil" + "os" "testing" "time" @@ -15,6 +16,12 @@ func TestBackend_impl(t *testing.T) { } func newConsulTestServer(t *testing.T) *testutil.TestServer { + skip := os.Getenv("TF_ACC") == "" && os.Getenv("TF_CONSUL_TEST") == "" + if skip { + t.Log("consul server tests require setting TF_ACC or TF_CONSUL_TEST") + t.Skip() + } + srv := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { c.LogLevel = "warn"