always wait for a RunningOperation to return

If the user wishes to interrupt the running operation, only the first
interrupt was communicated to the operation by canceling the provided
context. A second interrupt would start the shutdown process, but not
communicate this to the running operation. This order of event could
cause partial writes of state.

What would happen is that once the command returns, the plugin system
would stop the provider processes. Once the provider processes dies, all
pending Eval operations would return return with an error, and quickly
cause the operation to complete. Since the backend code didn't know that
the process was shutting down imminently, it would continue by
attempting to write out the last known state. Under the right
conditions, the process would exit part way through the writing of the
state file.

Add Stop and Cancel CancelFuncs to the RunningOperation, to allow it to
easily differentiate between the two signals. The backend will then be
able to detect a shutdown and abort more gracefully.

In order to ensure that the backend is not in the process of writing the
state out, the command will always attempt to wait for the process to
complete after cancellation.
This commit is contained in:
James Bardin 2018-02-09 18:10:52 -05:00
parent f0a009c573
commit 7cba68326a
8 changed files with 143 additions and 37 deletions

View File

@ -145,8 +145,6 @@ type Operation struct {
// RunningOperation is the result of starting an operation.
type RunningOperation struct {
// Context should be used to track Done and Err for errors.
//
// For implementers of a backend, this context should not wrap the
// passed in context. Otherwise, canceling the parent context will
// immediately mark this context as "done" but those aren't the semantics
@ -154,6 +152,16 @@ type RunningOperation struct {
// is fully done.
context.Context
// Stop requests the operation to complete early, by calling Stop on all
// the plugins. If the process needs to terminate immediately, call Cancel.
Stop context.CancelFunc
// Cancel is the context.CancelFunc associated with the embedded context,
// and can be called to terminate the operation early.
// Once Cancel is called, the operation should return as soon as possible
// to avoid running operations during process exit.
Cancel context.CancelFunc
// Err is the error of the operation. This is populated after
// the operation has completed.
Err error

View File

@ -224,7 +224,7 @@ func (b *Local) State(name string) (state.State, error) {
// name conflicts, assume that the field is overwritten if set.
func (b *Local) Operation(ctx context.Context, op *backend.Operation) (*backend.RunningOperation, error) {
// Determine the function to call for our operation
var f func(context.Context, *backend.Operation, *backend.RunningOperation)
var f func(context.Context, context.Context, *backend.Operation, *backend.RunningOperation)
switch op.Type {
case backend.OperationTypeRefresh:
f = b.opRefresh
@ -244,14 +244,29 @@ func (b *Local) Operation(ctx context.Context, op *backend.Operation) (*backend.
b.opLock.Lock()
// Build our running operation
runningCtx, runningCtxCancel := context.WithCancel(context.Background())
runningOp := &backend.RunningOperation{Context: runningCtx}
// the runninCtx is only used to block until the operation returns.
runningCtx, done := context.WithCancel(context.Background())
runningOp := &backend.RunningOperation{
Context: runningCtx,
}
// stopCtx wraps the context passed in, and is used to signal a graceful Stop.
stopCtx, stop := context.WithCancel(ctx)
runningOp.Stop = stop
// cancelCtx is used to cancel the operation immediately, usually
// indicating that the process is exiting.
cancelCtx, cancel := context.WithCancel(context.Background())
runningOp.Cancel = cancel
// Do it
go func() {
defer done()
defer stop()
defer cancel()
defer b.opLock.Unlock()
defer runningCtxCancel()
f(ctx, op, runningOp)
f(stopCtx, cancelCtx, op, runningOp)
}()
// Return

View File

@ -19,7 +19,8 @@ import (
)
func (b *Local) opApply(
ctx context.Context,
stopCtx context.Context,
cancelCtx context.Context,
op *backend.Operation,
runningOp *backend.RunningOperation) {
log.Printf("[INFO] backend/local: starting Apply operation")
@ -55,7 +56,7 @@ func (b *Local) opApply(
}
if op.LockState {
lockCtx, cancel := context.WithTimeout(ctx, op.StateLockTimeout)
lockCtx, cancel := context.WithTimeout(stopCtx, op.StateLockTimeout)
defer cancel()
lockInfo := state.NewLockInfo()
@ -157,7 +158,7 @@ func (b *Local) opApply(
// we can handle it properly.
err = nil
select {
case <-ctx.Done():
case <-stopCtx.Done():
if b.CLI != nil {
b.CLI.Output("stopping apply operation...")
}
@ -176,8 +177,18 @@ func (b *Local) opApply(
// Stop execution
go tfCtx.Stop()
// Wait for completion still
<-doneCh
select {
case <-cancelCtx.Done():
log.Println("[WARN] running operation canceled")
// if the operation was canceled, we need to return immediately
return
case <-doneCh:
}
case <-cancelCtx.Done():
// this should not be called without first attempting to stop the
// operation
log.Println("[ERROR] running operation canceled without Stop")
return
case <-doneCh:
}

View File

@ -19,7 +19,8 @@ import (
)
func (b *Local) opPlan(
ctx context.Context,
stopCtx context.Context,
cancelCtx context.Context,
op *backend.Operation,
runningOp *backend.RunningOperation) {
log.Printf("[INFO] backend/local: starting Plan operation")
@ -62,7 +63,7 @@ func (b *Local) opPlan(
}
if op.LockState {
lockCtx, cancel := context.WithTimeout(ctx, op.StateLockTimeout)
lockCtx, cancel := context.WithTimeout(stopCtx, op.StateLockTimeout)
defer cancel()
lockInfo := state.NewLockInfo()
@ -112,7 +113,7 @@ func (b *Local) opPlan(
}()
select {
case <-ctx.Done():
case <-stopCtx.Done():
if b.CLI != nil {
b.CLI.Output("stopping plan operation...")
}
@ -120,8 +121,18 @@ func (b *Local) opPlan(
// Stop execution
go tfCtx.Stop()
// Wait for completion still
<-doneCh
select {
case <-cancelCtx.Done():
log.Println("[WARN] running operation canceled")
// if the operation was canceled, we need to return immediately
return
case <-doneCh:
}
case <-cancelCtx.Done():
// this should not be called without first attempting to stop the
// operation
log.Println("[ERROR] running operation canceled without Stop")
return
case <-doneCh:
}

View File

@ -17,7 +17,8 @@ import (
)
func (b *Local) opRefresh(
ctx context.Context,
stopCtx context.Context,
cancelCtx context.Context,
op *backend.Operation,
runningOp *backend.RunningOperation) {
// Check if our state exists if we're performing a refresh operation. We
@ -53,7 +54,7 @@ func (b *Local) opRefresh(
}
if op.LockState {
lockCtx, cancel := context.WithTimeout(ctx, op.StateLockTimeout)
lockCtx, cancel := context.WithTimeout(stopCtx, op.StateLockTimeout)
defer cancel()
lockInfo := state.NewLockInfo()
@ -91,7 +92,7 @@ func (b *Local) opRefresh(
}()
select {
case <-ctx.Done():
case <-stopCtx.Done():
if b.CLI != nil {
b.CLI.Output("stopping refresh operation...")
}
@ -99,8 +100,18 @@ func (b *Local) opRefresh(
// Stop execution
go tfCtx.Stop()
// Wait for completion still
<-doneCh
select {
case <-cancelCtx.Done():
log.Println("[WARN] running operation canceled")
// if the operation was canceled, we need to return immediately
return
case <-doneCh:
}
case <-cancelCtx.Done():
// this should not be called without first attempting to stop the
// operation
log.Println("[ERROR] running operation canceled without Stop")
return
case <-doneCh:
}

View File

@ -7,6 +7,7 @@ import (
"os"
"sort"
"strings"
"time"
"github.com/hashicorp/terraform/tfdiags"
@ -159,10 +160,7 @@ func (c *ApplyCommand) Run(args []string) int {
opReq.DestroyForce = destroyForce
// Perform the operation
ctx, ctxCancel := context.WithCancel(context.Background())
defer ctxCancel()
op, err := b.Operation(ctx, opReq)
op, err := b.Operation(context.Background(), opReq)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error starting operation: %s", err))
return 1
@ -171,8 +169,8 @@ func (c *ApplyCommand) Run(args []string) int {
// Wait for the operation to complete or an interrupt to occur
select {
case <-c.ShutdownCh:
// Cancel our context so we can start gracefully exiting
ctxCancel()
// gracefully stop the operation
op.Stop()
// Notify the user
c.Ui.Output(outputInterrupt)
@ -183,7 +181,19 @@ func (c *ApplyCommand) Run(args []string) int {
c.Ui.Error(
"Two interrupts received. Exiting immediately. Note that data\n" +
"loss may have occurred.")
// cancel the operation completely
op.Cancel()
// the operation should return asap
// but timeout just in case
select {
case <-op.Done():
case <-time.After(5 * time.Second):
}
return 1
case <-op.Done():
}
case <-op.Done():

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"strings"
"time"
"github.com/hashicorp/terraform/backend"
"github.com/hashicorp/terraform/config"
@ -107,10 +108,7 @@ func (c *PlanCommand) Run(args []string) int {
opReq.Type = backend.OperationTypePlan
// Perform the operation
ctx, ctxCancel := context.WithCancel(context.Background())
defer ctxCancel()
op, err := b.Operation(ctx, opReq)
op, err := b.Operation(context.Background(), opReq)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error starting operation: %s", err))
return 1
@ -119,7 +117,7 @@ func (c *PlanCommand) Run(args []string) int {
select {
case <-c.ShutdownCh:
// Cancel our context so we can start gracefully exiting
ctxCancel()
op.Stop()
// Notify the user
c.Ui.Output(outputInterrupt)
@ -129,6 +127,17 @@ func (c *PlanCommand) Run(args []string) int {
case <-c.ShutdownCh:
c.Ui.Error(
"Two interrupts received. Exiting immediately")
// cancel the operation completely
op.Cancel()
// the operation should return asap
// but timeout just in case
select {
case <-op.Done():
case <-time.After(5 * time.Second):
}
return 1
case <-op.Done():
}

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"strings"
"time"
"github.com/hashicorp/terraform/backend"
"github.com/hashicorp/terraform/config"
@ -82,10 +83,40 @@ func (c *RefreshCommand) Run(args []string) int {
return 1
}
// Wait for the operation to complete
<-op.Done()
if err := op.Err; err != nil {
diags = diags.Append(err)
// Wait for the operation to complete or an interrupt to occur
select {
case <-c.ShutdownCh:
// gracefully stop the operation
op.Stop()
// Notify the user
c.Ui.Output(outputInterrupt)
// Still get the result, since there is still one
select {
case <-c.ShutdownCh:
c.Ui.Error(
"Two interrupts received. Exiting immediately. Note that data\n" +
"loss may have occurred.")
// cancel the operation completely
op.Cancel()
// the operation should return asap
// but timeout just in case
select {
case <-op.Done():
case <-time.After(5 * time.Second):
}
return 1
case <-op.Done():
}
case <-op.Done():
if err := op.Err; err != nil {
diags = diags.Append(err)
}
}
c.showDiagnostics(diags)