gracefully recover from unexpected reaper panic
This commit is contained in:
parent
5e31db25cb
commit
42f3fd621b
1 changed file with 34 additions and 3 deletions
|
@ -16,9 +16,12 @@ package reaper
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"runtime/debug"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/drone/drone/core"
|
"github.com/drone/drone/core"
|
||||||
|
|
||||||
|
"github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Reaper finds and kills zombie jobs that are permanently
|
// Reaper finds and kills zombie jobs that are permanently
|
||||||
|
@ -57,9 +60,6 @@ func New(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO use multierror to aggregate errors encountered
|
|
||||||
// TODO use trace logging
|
|
||||||
|
|
||||||
// Start starts the reaper.
|
// Start starts the reaper.
|
||||||
func (r *Reaper) Start(ctx context.Context, dur time.Duration) error {
|
func (r *Reaper) Start(ctx context.Context, dur time.Duration) error {
|
||||||
ticker := time.NewTicker(dur)
|
ticker := time.NewTicker(dur)
|
||||||
|
@ -76,33 +76,59 @@ func (r *Reaper) Start(ctx context.Context, dur time.Duration) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *Reaper) reap(ctx context.Context) error {
|
func (r *Reaper) reap(ctx context.Context) error {
|
||||||
|
defer func() {
|
||||||
|
// taking the paranoid approach to recover from
|
||||||
|
// a panic that should absolutely never happen.
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
logrus.Errorf("reaper: unexpected panic: %s", r)
|
||||||
|
debug.PrintStack()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// TODO debug log entry
|
||||||
|
// TODO use multierror
|
||||||
|
|
||||||
pending, err := r.Builds.Pending(ctx)
|
pending, err := r.Builds.Pending(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
logrus.WithError(err).
|
||||||
|
Errorf("reaper: cannot get pending builds")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, build := range pending {
|
for _, build := range pending {
|
||||||
// if a build is pending for longer than the maximum
|
// if a build is pending for longer than the maximum
|
||||||
// pending time limit, the build is maybe cancelled.
|
// pending time limit, the build is maybe cancelled.
|
||||||
if isExceeded(build.Created, r.Pending, buffer) {
|
if isExceeded(build.Created, r.Pending, buffer) {
|
||||||
|
// TODO debug log entry
|
||||||
err = r.reapMaybe(ctx, build)
|
err = r.reapMaybe(ctx, build)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// TODO error log entry
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
// TODO debug log entry
|
||||||
|
} else {
|
||||||
|
// TODO trace log entry
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
running, err := r.Builds.Running(ctx)
|
running, err := r.Builds.Running(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
logrus.WithError(err).
|
||||||
|
Errorf("reaper: cannot get running builds")
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, build := range running {
|
for _, build := range running {
|
||||||
// if a build is running for longer than the maximum
|
// if a build is running for longer than the maximum
|
||||||
// running time limit, the build is maybe cancelled.
|
// running time limit, the build is maybe cancelled.
|
||||||
if isExceeded(build.Started, r.Running, buffer) {
|
if isExceeded(build.Started, r.Running, buffer) {
|
||||||
|
// TODO debug log entry
|
||||||
err = r.reapMaybe(ctx, build)
|
err = r.reapMaybe(ctx, build)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// TODO error log entry
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
// TODO debug log entry
|
||||||
|
} else {
|
||||||
|
// TODO trace log entry
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -118,6 +144,7 @@ func (r *Reaper) reapMaybe(ctx context.Context, build *core.Build) error {
|
||||||
// if the build status is pending we can immediately
|
// if the build status is pending we can immediately
|
||||||
// cancel the build and all build stages.
|
// cancel the build and all build stages.
|
||||||
if build.Status == core.StatusPending {
|
if build.Status == core.StatusPending {
|
||||||
|
// TODO trace log entry
|
||||||
return r.Canceler.Cancel(ctx, repo, build)
|
return r.Canceler.Cancel(ctx, repo, build)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -139,13 +166,17 @@ func (r *Reaper) reapMaybe(ctx context.Context, build *core.Build) error {
|
||||||
// if the build stages are all pending we can immediately
|
// if the build stages are all pending we can immediately
|
||||||
// cancel the build.
|
// cancel the build.
|
||||||
if started == 0 {
|
if started == 0 {
|
||||||
|
// TODO trace log entry
|
||||||
return r.Canceler.Cancel(ctx, repo, build)
|
return r.Canceler.Cancel(ctx, repo, build)
|
||||||
}
|
}
|
||||||
|
|
||||||
// if the build stage has exceeded the timeout by a reasonable
|
// if the build stage has exceeded the timeout by a reasonable
|
||||||
// margin cancel the build and all build stages, else ignore.
|
// margin cancel the build and all build stages, else ignore.
|
||||||
if isExceeded(started, time.Duration(repo.Timeout)*time.Minute, buffer) {
|
if isExceeded(started, time.Duration(repo.Timeout)*time.Minute, buffer) {
|
||||||
|
// TODO trace log entry
|
||||||
return r.Canceler.Cancel(ctx, repo, build)
|
return r.Canceler.Cancel(ctx, repo, build)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO trace log entry
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue