16
"github.com/containers/podman/v5/libpod/define"
17
"github.com/sirupsen/logrus"
18
"golang.org/x/sys/unix"
22
const (
	// MaxHealthCheckNumberLogs is the maximum number of attempts we keep
	// in the healthcheck history file.
	MaxHealthCheckNumberLogs int = 5

	// MaxHealthCheckLogLength is the maximum length, in characters, of a
	// single healthcheck attempt's captured output.
	MaxHealthCheckLogLength = 500
)
29
// HealthCheck verifies the state and validity of the healthcheck configuration
30
// on the container and then executes the healthcheck
31
func (r *Runtime) HealthCheck(ctx context.Context, name string) (define.HealthCheckStatus, error) {
32
container, err := r.LookupContainer(name)
34
return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err)
37
hcStatus, err := checkHealthCheckCanBeRun(container)
43
if container.config.StartupHealthCheckConfig != nil {
44
passed, err := container.StartupHCPassed()
46
return define.HealthCheckInternalError, err
51
hcStatus, logStatus, err := container.runHealthCheck(ctx, isStartupHC)
53
if err := container.processHealthCheckStatus(logStatus); err != nil {
60
func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.HealthCheckStatus, string, error) {
67
hcCommand := c.HealthCheckConfig().Test
69
logrus.Debugf("Running startup healthcheck for container %s", c.ID())
70
hcCommand = c.config.StartupHealthCheckConfig.Test
72
if len(hcCommand) < 1 {
73
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
76
case "", define.HealthConfigTestNone:
77
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
78
case define.HealthConfigTestCmd:
79
newCommand = hcCommand[1:]
80
case define.HealthConfigTestCmdShell:
81
// TODO: SHELL command from image not available in Container - use Docker default
82
newCommand = []string{"/bin/sh", "-c", strings.Join(hcCommand[1:], " ")}
84
// command supplied on command line - pass as-is
85
newCommand = hcCommand
87
if len(newCommand) < 1 || newCommand[0] == "" {
88
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
90
rPipe, wPipe, err := os.Pipe()
92
return define.HealthCheckInternalError, "", fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
97
streams := new(define.AttachStreams)
99
streams.InputStream = bufio.NewReader(os.Stdin)
100
streams.OutputStream = wPipe
101
streams.ErrorStream = wPipe
102
streams.AttachOutput = true
103
streams.AttachError = true
104
streams.AttachInput = true
108
scanner := bufio.NewScanner(rPipe)
110
stdout = append(stdout, scanner.Text())
114
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
115
timeStart := time.Now()
116
hcResult := define.HealthCheckSuccess
117
config := new(ExecConfig)
118
config.Command = newCommand
119
exitCode, hcErr := c.exec(config, streams, nil, true)
121
hcResult = define.HealthCheckFailure
122
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
123
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
124
errors.Is(hcErr, define.ErrOCIRuntime) {
130
} else if exitCode != 0 {
131
hcResult = define.HealthCheckFailure
138
if hcErr != nil || exitCode != 0 {
139
hcResult = define.HealthCheckStartup
140
c.incrementStartupHCFailureCounter(ctx)
142
c.incrementStartupHCSuccessCounter(ctx)
146
timeEnd := time.Now()
147
if c.HealthCheckConfig().StartPeriod > 0 {
148
// there is a start-period we need to honor; we add startPeriod to container start time
149
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
150
if timeStart.Before(startPeriodTime) {
151
// we are still in the start period, flip the inStartPeriod bool
153
logrus.Debugf("healthcheck for %s being run in start-period", c.ID())
157
eventLog := strings.Join(stdout, "\n")
158
if len(eventLog) > MaxHealthCheckLogLength {
159
eventLog = eventLog[:MaxHealthCheckLogLength]
162
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
164
hcResult = define.HealthCheckFailure
165
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
168
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
169
logStatus, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)
171
return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
174
// Write HC event with appropriate status as the last thing before we
176
if hcResult == define.HealthCheckNotDefined || hcResult == define.HealthCheckInternalError {
177
return hcResult, logStatus, hcErr
179
if c.runtime.config.Engine.HealthcheckEvents {
180
c.newContainerHealthCheckEvent(logStatus)
183
return hcResult, logStatus, hcErr
186
func (c *Container) processHealthCheckStatus(status string) error {
187
if status != define.HealthCheckUnhealthy {
191
switch c.config.HealthCheckOnFailureAction {
192
case define.HealthCheckOnFailureActionNone: // Nothing to do
194
case define.HealthCheckOnFailureActionKill:
195
if err := c.Kill(uint(unix.SIGKILL)); err != nil {
196
return fmt.Errorf("killing container health-check turned unhealthy: %w", err)
199
case define.HealthCheckOnFailureActionRestart:
200
// We let the cleanup process handle the restart. Otherwise
201
// the container would be restarted in the context of a
202
// transient systemd unit which may cause undesired side
204
if err := c.Stop(); err != nil {
205
return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
208
case define.HealthCheckOnFailureActionStop:
209
if err := c.Stop(); err != nil {
210
return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err)
213
default: // Should not happen but better be safe than sorry
214
return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction)
220
func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
221
cstate, err := c.State()
223
return define.HealthCheckInternalError, err
225
if cstate != define.ContainerStateRunning {
226
return define.HealthCheckContainerStopped, fmt.Errorf("container %s is not running", c.ID())
228
if !c.HasHealthCheck() {
229
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
231
return define.HealthCheckDefined, nil
234
// Increment the current startup healthcheck success counter.
235
// Can stop the startup HC and start the regular HC if the startup HC has enough
236
// consecutive successes.
237
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
240
defer c.lock.Unlock()
242
if err := c.syncContainer(); err != nil {
243
logrus.Errorf("Error syncing container %s state: %v", c.ID(), err)
248
// We don't have a startup HC, can't do anything
249
if c.config.StartupHealthCheckConfig == nil {
253
// Race: someone else got here first
254
if c.state.StartupHCPassed {
258
// Increment the success counter
259
c.state.StartupHCSuccessCount++
261
logrus.Debugf("Startup healthcheck for container %s succeeded, success counter now %d", c.ID(), c.state.StartupHCSuccessCount)
263
// Did we exceed threshold?
264
recreateTimer := false
265
if c.config.StartupHealthCheckConfig.Successes == 0 || c.state.StartupHCSuccessCount >= c.config.StartupHealthCheckConfig.Successes {
266
c.state.StartupHCPassed = true
267
c.state.StartupHCSuccessCount = 0
268
c.state.StartupHCFailureCount = 0
273
if err := c.save(); err != nil {
274
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
279
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())
281
// Create the new, standard healthcheck timer first.
282
if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil {
283
logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err)
286
if err := c.startTimer(false); err != nil {
287
logrus.Errorf("Error restarting container %s healthcheck timer: %v", c.ID(), err)
290
// This kills the process the healthcheck is running.
291
// Which happens to be us.
292
// So this has to be last - after this, systemd serves us a
293
// SIGTERM and we exit.
294
if err := c.removeTransientFiles(ctx, true); err != nil {
295
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
301
// Increment the current startup healthcheck failure counter.
302
// Can restart the container if the HC fails enough times consecutively.
303
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
306
defer c.lock.Unlock()
308
if err := c.syncContainer(); err != nil {
309
logrus.Errorf("Error syncing container %s state: %v", c.ID(), err)
314
// We don't have a startup HC, can't do anything
315
if c.config.StartupHealthCheckConfig == nil {
319
// Race: someone else got here first
320
if c.state.StartupHCPassed {
324
c.state.StartupHCFailureCount++
326
logrus.Debugf("Startup healthcheck for container %s failed, failure counter now %d", c.ID(), c.state.StartupHCFailureCount)
328
if c.config.StartupHealthCheckConfig.Retries != 0 && c.state.StartupHCFailureCount >= c.config.StartupHealthCheckConfig.Retries {
329
logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
330
// Restart the container
331
if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
332
logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err)
337
if err := c.save(); err != nil {
338
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
342
func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {
343
return define.HealthCheckLog{
344
Start: start.Format(time.RFC3339Nano),
345
End: end.Format(time.RFC3339Nano),
351
// updateHealthStatus updates the health status of the container
352
// in the healthcheck log
353
func (c *Container) updateHealthStatus(status string) error {
354
healthCheck, err := c.getHealthCheckLog()
358
healthCheck.Status = status
359
newResults, err := json.Marshal(healthCheck)
361
return fmt.Errorf("unable to marshall healthchecks for writing status: %w", err)
363
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
366
// isUnhealthy returns true if the current health check status is unhealthy.
367
func (c *Container) isUnhealthy() (bool, error) {
368
if !c.HasHealthCheck() {
371
healthCheck, err := c.getHealthCheckLog()
375
return healthCheck.Status == define.HealthCheckUnhealthy, nil
378
// UpdateHealthCheckLog parses the health check results and writes the log
379
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod, isStartup bool) (string, error) {
381
defer c.lock.Unlock()
383
// If we are playing a kube yaml then let's honor the start period time for
384
// both failing and succeeding cases to match kube behavior.
385
// So don't update the health check log till the start period is over
386
if _, ok := c.config.Spec.Annotations[define.KubeHealthCheckAnnotation]; ok && inStartPeriod && !isStartup {
390
healthCheck, err := c.getHealthCheckLog()
394
if hcl.ExitCode == 0 {
395
// set status to healthy, reset failing state to 0
396
healthCheck.Status = define.HealthCheckHealthy
397
healthCheck.FailingStreak = 0
399
if len(healthCheck.Status) < 1 {
400
healthCheck.Status = define.HealthCheckHealthy
403
// increment failing streak
404
healthCheck.FailingStreak++
405
// if failing streak > retries, then status to unhealthy
406
if healthCheck.FailingStreak >= c.HealthCheckConfig().Retries {
407
healthCheck.Status = define.HealthCheckUnhealthy
411
healthCheck.Log = append(healthCheck.Log, hcl)
412
if len(healthCheck.Log) > MaxHealthCheckNumberLogs {
413
healthCheck.Log = healthCheck.Log[1:]
415
newResults, err := json.Marshal(healthCheck)
417
return "", fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
419
return healthCheck.Status, os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
422
// HealthCheckLogPath returns the path for where the health check log is
423
func (c *Container) healthCheckLogPath() string {
424
return filepath.Join(filepath.Dir(c.state.RunDir), "healthcheck.log")
427
// getHealthCheckLog returns HealthCheck results by reading the container's
428
// health check log file. If the health check log file does not exist, then
429
// an empty healthcheck struct is returned
430
// The caller should lock the container before this function is called.
431
func (c *Container) getHealthCheckLog() (define.HealthCheckResults, error) {
432
var healthCheck define.HealthCheckResults
433
b, err := os.ReadFile(c.healthCheckLogPath())
435
if errors.Is(err, fs.ErrNotExist) {
436
// If the file does not exists just return empty healthcheck and no error.
437
return healthCheck, nil
439
return healthCheck, fmt.Errorf("failed to read health check log file: %w", err)
441
if err := json.Unmarshal(b, &healthCheck); err != nil {
442
return healthCheck, fmt.Errorf("failed to unmarshal existing healthcheck results in %s: %w", c.healthCheckLogPath(), err)
444
return healthCheck, nil
447
// HealthCheckStatus returns the current state of a container with a healthcheck.
448
// Returns an empty string if no health check is defined for the container.
449
func (c *Container) HealthCheckStatus() (string, error) {
452
defer c.lock.Unlock()
454
return c.healthCheckStatus()
457
// Internal function to return the current state of a container with a healthcheck.
458
// This function does not lock the container.
459
func (c *Container) healthCheckStatus() (string, error) {
460
if !c.HasHealthCheck() {
464
if err := c.syncContainer(); err != nil {
468
results, err := c.getHealthCheckLog()
470
return "", fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err)
473
return results.Status, nil