16
"github.com/containers/podman/v5/libpod/define"
17
"github.com/sirupsen/logrus"
18
"golang.org/x/sys/unix"
22
const (
	// MaxHealthCheckNumberLogs is the maximum number of attempts we keep
	// in the healthcheck history file.
	MaxHealthCheckNumberLogs int = 5

	// MaxHealthCheckLogLength is the maximum length, in characters, of a
	// single healthcheck attempt's captured output.
	MaxHealthCheckLogLength = 500
)
29
// HealthCheck verifies the state and validity of the healthcheck configuration
30
// on the container and then executes the healthcheck
31
func (r *Runtime) HealthCheck(ctx context.Context, name string) (define.HealthCheckStatus, error) {
32
container, err := r.LookupContainer(name)
34
return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err)
37
hcStatus, err := checkHealthCheckCanBeRun(container)
43
if container.config.StartupHealthCheckConfig != nil {
44
passed, err := container.StartupHCPassed()
46
return define.HealthCheckInternalError, err
51
hcStatus, logStatus, err := container.runHealthCheck(ctx, isStartupHC)
53
if err := container.processHealthCheckStatus(logStatus); err != nil {
60
func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.HealthCheckStatus, string, error) {
67
hcCommand := c.HealthCheckConfig().Test
69
logrus.Debugf("Running startup healthcheck for container %s", c.ID())
70
hcCommand = c.config.StartupHealthCheckConfig.Test
72
if len(hcCommand) < 1 {
73
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
76
case "", define.HealthConfigTestNone:
77
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
78
case define.HealthConfigTestCmd:
79
newCommand = hcCommand[1:]
80
case define.HealthConfigTestCmdShell:
81
// TODO: SHELL command from image not available in Container - use Docker default
82
newCommand = []string{"/bin/sh", "-c", strings.Join(hcCommand[1:], " ")}
84
// command supplied on command line - pass as-is
85
newCommand = hcCommand
87
if len(newCommand) < 1 || newCommand[0] == "" {
88
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
90
rPipe, wPipe, err := os.Pipe()
92
return define.HealthCheckInternalError, "", fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
97
streams := new(define.AttachStreams)
99
streams.InputStream = bufio.NewReader(os.Stdin)
100
streams.OutputStream = wPipe
101
streams.ErrorStream = wPipe
102
streams.AttachOutput = true
103
streams.AttachError = true
104
streams.AttachInput = true
108
scanner := bufio.NewScanner(rPipe)
110
stdout = append(stdout, scanner.Text())
114
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
115
timeStart := time.Now()
116
hcResult := define.HealthCheckSuccess
117
config := new(ExecConfig)
118
config.Command = newCommand
119
exitCode, hcErr := c.exec(config, streams, nil, true)
121
hcResult = define.HealthCheckFailure
122
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
123
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
124
errors.Is(hcErr, define.ErrOCIRuntime) {
130
} else if exitCode != 0 {
131
hcResult = define.HealthCheckFailure
138
if hcErr != nil || exitCode != 0 {
139
hcResult = define.HealthCheckStartup
140
c.incrementStartupHCFailureCounter(ctx)
142
c.incrementStartupHCSuccessCounter(ctx)
146
timeEnd := time.Now()
147
if c.HealthCheckConfig().StartPeriod > 0 {
148
// there is a start-period we need to honor; we add startPeriod to container start time
149
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
150
if timeStart.Before(startPeriodTime) {
151
// we are still in the start period, flip the inStartPeriod bool
153
logrus.Debugf("healthcheck for %s being run in start-period", c.ID())
157
eventLog := strings.Join(stdout, "\n")
158
if len(eventLog) > MaxHealthCheckLogLength {
159
eventLog = eventLog[:MaxHealthCheckLogLength]
162
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
164
hcResult = define.HealthCheckFailure
165
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
168
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
169
logStatus, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)
171
return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
174
// Write HC event with appropriate status as the last thing before we
176
if hcResult == define.HealthCheckNotDefined || hcResult == define.HealthCheckInternalError {
177
return hcResult, logStatus, hcErr
179
if c.runtime.config.Engine.HealthcheckEvents {
180
c.newContainerHealthCheckEvent(logStatus)
183
return hcResult, logStatus, hcErr
186
func (c *Container) processHealthCheckStatus(status string) error {
187
if status != define.HealthCheckUnhealthy {
191
switch c.config.HealthCheckOnFailureAction {
192
case define.HealthCheckOnFailureActionNone: // Nothing to do
194
case define.HealthCheckOnFailureActionKill:
195
if err := c.Kill(uint(unix.SIGKILL)); err != nil {
196
return fmt.Errorf("killing container health-check turned unhealthy: %w", err)
199
case define.HealthCheckOnFailureActionRestart:
200
// We let the cleanup process handle the restart. Otherwise
201
// the container would be restarted in the context of a
202
// transient systemd unit which may cause undesired side
204
if err := c.Stop(); err != nil {
205
return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
208
case define.HealthCheckOnFailureActionStop:
209
if err := c.Stop(); err != nil {
210
return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err)
213
default: // Should not happen but better be safe than sorry
214
return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction)
220
func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
221
cstate, err := c.State()
223
return define.HealthCheckInternalError, err
225
if cstate != define.ContainerStateRunning {
226
return define.HealthCheckContainerStopped, fmt.Errorf("container %s is not running", c.ID())
228
if !c.HasHealthCheck() {
229
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
231
return define.HealthCheckDefined, nil
234
// Increment the current startup healthcheck success counter.
235
// Can stop the startup HC and start the regular HC if the startup HC has enough
236
// consecutive successes.
237
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
240
defer c.lock.Unlock()
242
if err := c.syncContainer(); err != nil {
243
logrus.Errorf("Error syncing container %s state: %v", c.ID(), err)
248
// We don't have a startup HC, can't do anything
249
if c.config.StartupHealthCheckConfig == nil {
253
// Race: someone else got here first
254
if c.state.StartupHCPassed {
258
// Increment the success counter
259
c.state.StartupHCSuccessCount++
261
logrus.Debugf("Startup healthcheck for container %s succeeded, success counter now %d", c.ID(), c.state.StartupHCSuccessCount)
263
// Did we exceed threshold?
264
recreateTimer := false
265
if c.config.StartupHealthCheckConfig.Successes == 0 || c.state.StartupHCSuccessCount >= c.config.StartupHealthCheckConfig.Successes {
266
c.state.StartupHCPassed = true
267
c.state.StartupHCSuccessCount = 0
268
c.state.StartupHCFailureCount = 0
273
if err := c.save(); err != nil {
274
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
279
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())
281
// Create the new, standard healthcheck timer first.
282
if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil {
283
logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err)
286
if err := c.startTimer(false); err != nil {
287
logrus.Errorf("Error restarting container %s healthcheck timer: %v", c.ID(), err)
290
// This kills the process the healthcheck is running.
291
// Which happens to be us.
292
// So this has to be last - after this, systemd serves us a
293
// SIGTERM and we exit.
294
if err := c.removeTransientFiles(ctx, true); err != nil {
295
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
301
// Increment the current startup healthcheck failure counter.
302
// Can restart the container if the HC fails enough times consecutively.
303
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
306
defer c.lock.Unlock()
308
if err := c.syncContainer(); err != nil {
309
logrus.Errorf("Error syncing container %s state: %v", c.ID(), err)
314
// We don't have a startup HC, can't do anything
315
if c.config.StartupHealthCheckConfig == nil {
319
// Race: someone else got here first
320
if c.state.StartupHCPassed {
324
c.state.StartupHCFailureCount++
326
logrus.Debugf("Startup healthcheck for container %s failed, failure counter now %d", c.ID(), c.state.StartupHCFailureCount)
328
if c.config.StartupHealthCheckConfig.Retries != 0 && c.state.StartupHCFailureCount >= c.config.StartupHealthCheckConfig.Retries {
329
logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
330
// Restart the container
331
if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
332
logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err)
337
if err := c.save(); err != nil {
338
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
342
func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {
343
return define.HealthCheckLog{
344
Start: start.Format(time.RFC3339Nano),
345
End: end.Format(time.RFC3339Nano),
351
// updateHealthStatus updates the health status of the container
352
// in the healthcheck log
353
func (c *Container) updateHealthStatus(status string) error {
354
healthCheck, err := c.getHealthCheckLog()
358
healthCheck.Status = status
359
newResults, err := json.Marshal(healthCheck)
361
return fmt.Errorf("unable to marshall healthchecks for writing status: %w", err)
363
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
366
// isUnhealthy returns true if the current health check status is unhealthy.
367
func (c *Container) isUnhealthy() (bool, error) {
368
if !c.HasHealthCheck() {
371
healthCheck, err := c.getHealthCheckLog()
375
return healthCheck.Status == define.HealthCheckUnhealthy, nil
378
// UpdateHealthCheckLog parses the health check results and writes the log
379
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod, isStartup bool) (string, error) {
381
defer c.lock.Unlock()
383
// If we are playing a kube yaml then let's honor the start period time for
384
// both failing and succeeding cases to match kube behavior.
385
// So don't update the health check log till the start period is over
386
if _, ok := c.config.Spec.Annotations[define.KubeHealthCheckAnnotation]; ok && inStartPeriod && !isStartup {
390
healthCheck, err := c.getHealthCheckLog()
394
if hcl.ExitCode == 0 {
395
// set status to healthy, reset failing state to 0
396
healthCheck.Status = define.HealthCheckHealthy
397
healthCheck.FailingStreak = 0
399
if len(healthCheck.Status) < 1 {
400
healthCheck.Status = define.HealthCheckHealthy
403
// increment failing streak
404
healthCheck.FailingStreak++
405
// if failing streak > retries, then status to unhealthy
406
if healthCheck.FailingStreak >= c.HealthCheckConfig().Retries {
407
healthCheck.Status = define.HealthCheckUnhealthy
411
healthCheck.Log = append(healthCheck.Log, hcl)
412
if len(healthCheck.Log) > MaxHealthCheckNumberLogs {
413
healthCheck.Log = healthCheck.Log[1:]
415
newResults, err := json.Marshal(healthCheck)
417
return "", fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
419
return healthCheck.Status, os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
422
// HealthCheckLogPath returns the path for where the health check log is
423
func (c *Container) healthCheckLogPath() string {
424
return filepath.Join(filepath.Dir(c.state.RunDir), "healthcheck.log")
427
// getHealthCheckLog returns HealthCheck results by reading the container's
428
// health check log file. If the health check log file does not exist, then
429
// an empty healthcheck struct is returned
430
// The caller should lock the container before this function is called.
431
func (c *Container) getHealthCheckLog() (define.HealthCheckResults, error) {
432
var healthCheck define.HealthCheckResults
433
b, err := os.ReadFile(c.healthCheckLogPath())
435
if errors.Is(err, fs.ErrNotExist) {
436
// If the file does not exists just return empty healthcheck and no error.
437
return healthCheck, nil
439
return healthCheck, fmt.Errorf("failed to read health check log file: %w", err)
441
if err := json.Unmarshal(b, &healthCheck); err != nil {
442
return healthCheck, fmt.Errorf("failed to unmarshal existing healthcheck results in %s: %w", c.healthCheckLogPath(), err)
444
return healthCheck, nil
447
// HealthCheckStatus returns the current state of a container with a healthcheck.
448
// Returns an empty string if no health check is defined for the container.
449
func (c *Container) HealthCheckStatus() (string, error) {
452
defer c.lock.Unlock()
454
return c.healthCheckStatus()
457
// Internal function to return the current state of a container with a healthcheck.
458
// This function does not lock the container.
459
func (c *Container) healthCheckStatus() (string, error) {
460
if !c.HasHealthCheck() {
464
if err := c.syncContainer(); err != nil {
468
results, err := c.getHealthCheckLog()
470
return "", fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err)
473
return results.Status, nil