podman

Форк
0
/
oci_conmon_common.go 
1730 строк · 52.7 Кб
1
//go:build !remote && (linux || freebsd)
2

3
package libpod
4

5
import (
6
	"bufio"
7
	"bytes"
8
	"context"
9
	"errors"
10
	"fmt"
11
	"io"
12
	"net"
13
	"net/http"
14
	"os"
15
	"os/exec"
16
	"path/filepath"
17
	"strconv"
18
	"strings"
19
	"sync"
20
	"syscall"
21
	"text/template"
22
	"time"
23

24
	"github.com/containers/common/pkg/config"
25
	"github.com/containers/common/pkg/detach"
26
	"github.com/containers/common/pkg/resize"
27
	"github.com/containers/common/pkg/version"
28
	conmonConfig "github.com/containers/conmon/runner/config"
29
	"github.com/containers/podman/v5/libpod/define"
30
	"github.com/containers/podman/v5/libpod/logs"
31
	"github.com/containers/podman/v5/pkg/checkpoint/crutils"
32
	"github.com/containers/podman/v5/pkg/errorhandling"
33
	"github.com/containers/podman/v5/pkg/rootless"
34
	"github.com/containers/podman/v5/pkg/specgenutil"
35
	"github.com/containers/podman/v5/pkg/util"
36
	"github.com/containers/podman/v5/utils"
37
	spec "github.com/opencontainers/runtime-spec/specs-go"
38
	"github.com/sirupsen/logrus"
39
	"golang.org/x/sys/unix"
40
)
41

42
const (
43
	// This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it
44
	// directly from the Go code, so const it here
45
	// Important: The conmon attach socket uses an extra byte at the beginning of each
46
	// message to specify the STREAM so we have to increase the buffer size by one
47
	bufferSize = conmonConfig.BufSize + 1
48
)
49

50
// ConmonOCIRuntime is an OCI runtime managed by Conmon.
// TODO: Make all calls to OCI runtime have a timeout.
type ConmonOCIRuntime struct {
	// name is the runtime name given to the constructor, path is the
	// runtime executable that was resolved for it, and conmonPath is the
	// conmon path given to the constructor.
	name, path, conmonPath string
	// conmonEnv comes from the engine's ConmonEnvVars setting.
	conmonEnv []string
	// tmpDir is the engine's TmpDir; exitsDir is its "exits" subdirectory.
	tmpDir, exitsDir string
	// logSizeMax comes from the containers' LogSizeMax setting.
	logSizeMax int64
	// noPivot and reservePorts come from the engine's NoPivotRoot and
	// EnablePortReservation settings.
	noPivot, reservePorts bool
	// runtimeFlags are prepended to the arguments of runtime invocations.
	runtimeFlags []string
	// supports* record whether the runtime's base name is listed in the
	// matching RuntimeSupports* engine setting; enableKeyring comes from the
	// containers' EnableKeyring setting.
	supportsJSON, supportsKVM, supportsNoCgroups, enableKeyring bool
	// persistDir is the "persist" subdirectory of tmpDir, where conmon
	// writes the exit file and oom file.
	persistDir string
}
69

70
// Make a new Conmon-based OCI runtime with the given options.
71
// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or
72
// any runtime with a runc-compatible CLI.
73
// The first path that points to a valid executable will be used.
74
// Deliberately private. Someone should not be able to construct this outside of
75
// libpod.
76
func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) {
77
	if name == "" {
78
		return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg)
79
	}
80

81
	// Make lookup tables for runtime support
82
	supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON.Get()))
83
	supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups.Get()))
84
	supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM.Get()))
85
	for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON.Get() {
86
		supportsJSON[r] = true
87
	}
88
	for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups.Get() {
89
		supportsNoCgroups[r] = true
90
	}
91
	for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM.Get() {
92
		supportsKVM[r] = true
93
	}
94

95
	runtime := new(ConmonOCIRuntime)
96
	runtime.name = name
97
	runtime.conmonPath = conmonPath
98
	runtime.runtimeFlags = runtimeFlags
99

100
	runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars.Get()
101
	runtime.tmpDir = runtimeCfg.Engine.TmpDir
102
	runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax
103
	runtime.noPivot = runtimeCfg.Engine.NoPivotRoot
104
	runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation
105
	runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring
106

107
	// TODO: probe OCI runtime for feature and enable automatically if
108
	// available.
109

110
	base := filepath.Base(name)
111
	runtime.supportsJSON = supportsJSON[base]
112
	runtime.supportsNoCgroups = supportsNoCgroups[base]
113
	runtime.supportsKVM = supportsKVM[base]
114

115
	foundPath := false
116
	for _, path := range paths {
117
		stat, err := os.Stat(path)
118
		if err != nil {
119
			if os.IsNotExist(err) {
120
				continue
121
			}
122
			return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err)
123
		}
124
		if !stat.Mode().IsRegular() {
125
			continue
126
		}
127
		foundPath = true
128
		logrus.Tracef("found runtime %q", path)
129
		runtime.path = path
130
		break
131
	}
132

133
	// Search the $PATH as last fallback
134
	if !foundPath {
135
		if foundRuntime, err := exec.LookPath(name); err == nil {
136
			foundPath = true
137
			runtime.path = foundRuntime
138
			logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime)
139
		}
140
	}
141

142
	if !foundPath {
143
		return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg)
144
	}
145

146
	runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits")
147
	// The persist-dir is where conmon writes the exit file and oom file (if oom killed), we join the container ID to this path later on
148
	runtime.persistDir = filepath.Join(runtime.tmpDir, "persist")
149

150
	// Create the exit files and attach sockets directories
151
	if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil {
152
		return nil, fmt.Errorf("creating OCI runtime exit files directory: %w", err)
153
	}
154
	if err := os.MkdirAll(runtime.persistDir, 0750); err != nil {
155
		return nil, fmt.Errorf("creating OCI runtime persist directory: %w", err)
156
	}
157
	return runtime, nil
158
}
159

160
// Name returns the name of the runtime being wrapped by Conmon.
161
func (r *ConmonOCIRuntime) Name() string {
162
	return r.name
163
}
164

165
// Path returns the path of the OCI runtime being wrapped by Conmon.
166
func (r *ConmonOCIRuntime) Path() string {
167
	return r.path
168
}
169

170
// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace
171
func hasCurrentUserMapped(ctr *Container) bool {
172
	if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 {
173
		return true
174
	}
175
	uid := os.Geteuid()
176
	for _, m := range ctr.config.IDMappings.UIDMap {
177
		if uid >= m.HostID && uid < m.HostID+m.Size {
178
			return true
179
		}
180
	}
181
	return false
182
}
183

184
// CreateContainer creates a container.
185
func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
186
	// always make the run dir accessible to the current user so that the PID files can be read without
187
	// being in the rootless user namespace.
188
	if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil {
189
		return 0, err
190
	}
191
	if !hasCurrentUserMapped(ctr) {
192
		for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} {
193
			if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil {
194
				return 0, err
195
			}
196
		}
197

198
		// if we are running a non privileged container, be sure to umount some kernel paths so they are not
199
		// bind mounted inside the container at all.
200
		if !ctr.config.Privileged && !rootless.IsRootless() {
201
			return r.createRootlessContainer(ctr, restoreOptions)
202
		}
203
	}
204
	return r.createOCIContainer(ctr, restoreOptions)
205
}
206

207
// UpdateContainerStatus retrieves the current status of the container from the
208
// runtime. It updates the container's state but does not save it.
209
// If useRuntime is false, we will not directly hit runc to see the container's
210
// status, but will instead only check for the existence of the conmon exit file
211
// and update state to stopped if it exists.
212
func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
213
	runtimeDir, err := util.GetRootlessRuntimeDir()
214
	if err != nil {
215
		return err
216
	}
217

218
	// Store old state so we know if we were already stopped
219
	oldState := ctr.state.State
220

221
	state := new(spec.State)
222

223
	cmd := exec.Command(r.path, "state", ctr.ID())
224
	cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
225

226
	outPipe, err := cmd.StdoutPipe()
227
	if err != nil {
228
		return fmt.Errorf("getting stdout pipe: %w", err)
229
	}
230
	errPipe, err := cmd.StderrPipe()
231
	if err != nil {
232
		return fmt.Errorf("getting stderr pipe: %w", err)
233
	}
234

235
	err = cmd.Start()
236
	if err != nil {
237
		return fmt.Errorf("error launching container runtime: %w", err)
238
	}
239
	defer func() {
240
		_ = cmd.Wait()
241
	}()
242

243
	stderr, err := io.ReadAll(errPipe)
244
	if err != nil {
245
		return fmt.Errorf("reading stderr: %s: %w", ctr.ID(), err)
246
	}
247
	if strings.Contains(string(stderr), "does not exist") || strings.Contains(string(stderr), "No such file") {
248
		if err := ctr.removeConmonFiles(); err != nil {
249
			logrus.Debugf("unable to remove conmon files for container %s", ctr.ID())
250
		}
251
		ctr.state.ExitCode = -1
252
		ctr.state.FinishedTime = time.Now()
253
		ctr.state.State = define.ContainerStateExited
254
		return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode)
255
	}
256
	if err := errPipe.Close(); err != nil {
257
		return err
258
	}
259

260
	out, err := io.ReadAll(outPipe)
261
	if err != nil {
262
		return fmt.Errorf("reading stdout: %s: %w", ctr.ID(), err)
263
	}
264
	if err := json.NewDecoder(bytes.NewReader(out)).Decode(state); err != nil {
265
		return fmt.Errorf("decoding container status for container %s: %w", ctr.ID(), err)
266
	}
267
	ctr.state.PID = state.Pid
268

269
	switch state.Status {
270
	case "created":
271
		ctr.state.State = define.ContainerStateCreated
272
	case "paused":
273
		ctr.state.State = define.ContainerStatePaused
274
	case "running":
275
		ctr.state.State = define.ContainerStateRunning
276
	case "stopped":
277
		ctr.state.State = define.ContainerStateStopped
278
	default:
279
		return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w",
280
			ctr.ID(), state.Status, define.ErrInternal)
281
	}
282

283
	// Handle ContainerStateStopping - keep it unless the container
284
	// transitioned to no longer running.
285
	if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) {
286
		ctr.state.State = define.ContainerStateStopping
287
	}
288

289
	return nil
290
}
291

292
// StartContainer starts the given container.
293
// Sets time the container was started, but does not save it.
294
func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error {
295
	// TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers?
296
	runtimeDir, err := util.GetRootlessRuntimeDir()
297
	if err != nil {
298
		return err
299
	}
300
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
301
	if path, ok := os.LookupEnv("PATH"); ok {
302
		env = append(env, fmt.Sprintf("PATH=%s", path))
303
	}
304
	if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil {
305
		return err
306
	}
307

308
	ctr.state.StartedTime = time.Now()
309

310
	return nil
311
}
312

313
// UpdateContainer updates the given container's cgroup configuration
314
func (r *ConmonOCIRuntime) UpdateContainer(ctr *Container, resources *spec.LinuxResources) error {
315
	runtimeDir, err := util.GetRootlessRuntimeDir()
316
	if err != nil {
317
		return err
318
	}
319
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
320
	if path, ok := os.LookupEnv("PATH"); ok {
321
		env = append(env, fmt.Sprintf("PATH=%s", path))
322
	}
323
	args := r.runtimeFlags
324
	args = append(args, "update")
325
	tempFile, additionalArgs, err := generateResourceFile(resources)
326
	if err != nil {
327
		return err
328
	}
329
	defer os.Remove(tempFile)
330

331
	args = append(args, additionalArgs...)
332
	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(args, ctr.ID())...)
333
}
334

335
// generateResourceFile writes res as JSON into a new temporary file and
// returns the file's path together with the `--resources=<path>` flag that
// points at it. The caller is responsible for removing the returned file.
// When res is nil no file is created and an empty path and an empty flag list
// are returned. On error no temporary file is left behind.
func generateResourceFile(res *spec.LinuxResources) (string, []string, error) {
	flags := []string{}
	if res == nil {
		return "", flags, nil
	}

	// Encode first so that a marshalling failure does not leave a file behind.
	j, err := json.Marshal(res)
	if err != nil {
		return "", nil, err
	}

	f, err := os.CreateTemp("", "podman")
	if err != nil {
		return "", nil, err
	}
	path := f.Name()

	if _, err := f.Write(j); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		_ = f.Close()
		_ = os.Remove(path)
		return "", nil, err
	}
	// Close the handle here: callers only ever use the path.
	if err := f.Close(); err != nil {
		_ = os.Remove(path)
		return "", nil, err
	}

	flags = append(flags, "--resources="+path)
	return path, flags, nil
}
358

359
// KillContainer sends the given signal to the given container.
360
// If all is set, send to all PIDs in the container.
361
// All is only supported if the container created cgroups.
362
func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error {
363
	if _, err := r.killContainer(ctr, signal, all, false); err != nil {
364
		return err
365
	}
366

367
	return nil
368
}
369

370
// If captureStderr is requested, OCI runtime STDERR will be captured as a
371
// *bytes.buffer and returned; otherwise, it is set to os.Stderr.
372
func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captureStderr bool) (*bytes.Buffer, error) {
373
	logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
374
	runtimeDir, err := util.GetRootlessRuntimeDir()
375
	if err != nil {
376
		return nil, err
377
	}
378
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
379
	var args []string
380
	args = append(args, r.runtimeFlags...)
381
	if all {
382
		args = append(args, "kill", "--all", ctr.ID(), strconv.FormatUint(uint64(signal), 10))
383
	} else {
384
		args = append(args, "kill", ctr.ID(), strconv.FormatUint(uint64(signal), 10))
385
	}
386
	var (
387
		stderr       io.Writer = os.Stderr
388
		stderrBuffer *bytes.Buffer
389
	)
390
	if captureStderr {
391
		stderrBuffer = new(bytes.Buffer)
392
		stderr = stderrBuffer
393
	}
394
	if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, stderr, env, r.path, args...); err != nil {
395
		// Update container state - there's a chance we failed because
396
		// the container exited in the meantime.
397
		if err2 := r.UpdateContainerStatus(ctr); err2 != nil {
398
			logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2)
399
		}
400
		if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
401
			return stderrBuffer, fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State)
402
		}
403
		return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err)
404
	}
405

406
	return stderrBuffer, nil
407
}
408

409
// StopContainer stops a container, first using its given stop signal (or
410
// SIGTERM if no signal was specified), then using SIGKILL.
411
// Timeout is given in seconds. If timeout is 0, the container will be
412
// immediately kill with SIGKILL.
413
// Does not set finished time for container, assumes you will run updateStatus
414
// after to pull the exit code.
415
func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error {
416
	logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID)
417

418
	// Ping the container to see if it's alive
419
	// If it's not, it's already stopped, return
420
	err := unix.Kill(ctr.state.PID, 0)
421
	if err == unix.ESRCH {
422
		return nil
423
	}
424

425
	killCtr := func(signal uint) (bool, error) {
426
		stderr, err := r.killContainer(ctr, signal, all, true)
427
		if err != nil {
428
			// There's an inherent race with the cleanup process (see
429
			// #16142, #17142). If the container has already been marked as
430
			// stopped or exited by the cleanup process, we can return
431
			// immediately.
432
			if errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
433
				return true, nil
434
			}
435

436
			// If the PID is 0, then the container is already stopped.
437
			if ctr.state.PID == 0 {
438
				return true, nil
439
			}
440

441
			// Is the container gone?
442
			// If so, it probably died between the first check and
443
			// our sending the signal
444
			// The container is stopped, so exit cleanly
445
			err := unix.Kill(ctr.state.PID, 0)
446
			if err == unix.ESRCH {
447
				return true, nil
448
			}
449

450
			return false, err
451
		}
452

453
		// Before handling error from KillContainer, convert STDERR to a []string
454
		// (one string per line of output) and print it.
455
		stderrLines := strings.Split(stderr.String(), "\n")
456
		for _, line := range stderrLines {
457
			if line != "" {
458
				fmt.Fprintf(os.Stderr, "%s\n", line)
459
			}
460
		}
461

462
		return false, nil
463
	}
464

465
	if timeout > 0 {
466
		stopSignal := ctr.config.StopSignal
467
		if stopSignal == 0 {
468
			stopSignal = uint(syscall.SIGTERM)
469
		}
470

471
		stopped, err := killCtr(stopSignal)
472
		if err != nil {
473
			return err
474
		}
475
		if stopped {
476
			return nil
477
		}
478

479
		if err := waitContainerStop(ctr, time.Duration(util.ConvertTimeout(int(timeout)))*time.Second); err != nil {
480
			sigName := unix.SignalName(syscall.Signal(stopSignal))
481
			if sigName == "" {
482
				sigName = fmt.Sprintf("(%d)", stopSignal)
483
			}
484
			logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), sigName, err)
485
			logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", sigName, ctr.Name(), timeout)
486
		} else {
487
			// No error, the container is dead
488
			return nil
489
		}
490
	}
491

492
	stopped, err := killCtr(uint(unix.SIGKILL))
493
	if err != nil {
494
		return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err)
495
	}
496
	if stopped {
497
		return nil
498
	}
499

500
	// Give runtime a few seconds to make it happen
501
	if err := waitContainerStop(ctr, killContainerTimeout); err != nil {
502
		return err
503
	}
504

505
	return nil
506
}
507

508
// DeleteContainer deletes a container from the OCI runtime.
509
func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error {
510
	runtimeDir, err := util.GetRootlessRuntimeDir()
511
	if err != nil {
512
		return err
513
	}
514
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
515
	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...)
516
}
517

518
// PauseContainer pauses the given container.
519
func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error {
520
	runtimeDir, err := util.GetRootlessRuntimeDir()
521
	if err != nil {
522
		return err
523
	}
524
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
525
	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...)
526
}
527

528
// UnpauseContainer unpauses the given container.
529
func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error {
530
	runtimeDir, err := util.GetRootlessRuntimeDir()
531
	if err != nil {
532
		return err
533
	}
534
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
535
	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...)
536
}
537

538
// socketCloseWrite shuts down the write side of conn. ENOTCONN errors are
// filtered out and reported as success; they can happen on FreeBSD if the
// other side of the connection is already closed. Any other error is
// returned unchanged.
func socketCloseWrite(conn *net.UnixConn) error {
	if err := conn.CloseWrite(); err != nil && !errors.Is(err, syscall.ENOTCONN) {
		return err
	}
	return nil
}
547

548
// HTTPAttach performs an attach for the HTTP API.
549
// The caller must handle closing the HTTP connection after this returns.
550
// The cancel channel is not closed; it is up to the caller to do so after
551
// this function returns.
552
// If this is a container with a terminal, we will stream raw. If it is not, we
553
// will stream with an 8-byte header to multiplex STDOUT and STDERR.
554
// Returns any errors that occurred, and whether the connection was successfully
555
// hijacked before that error occurred.
556
func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) {
557
	isTerminal := ctr.Terminal()
558

559
	if streams != nil {
560
		if !streams.Stdin && !streams.Stdout && !streams.Stderr {
561
			return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg)
562
		}
563
	}
564

565
	attachSock, err := r.AttachSocketPath(ctr)
566
	if err != nil {
567
		return err
568
	}
569

570
	var conn *net.UnixConn
571
	if streamAttach {
572
		newConn, err := openUnixSocket(attachSock)
573
		if err != nil {
574
			return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err)
575
		}
576
		conn = newConn
577
		defer func() {
578
			if err := conn.Close(); err != nil {
579
				logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err)
580
			}
581
		}()
582

583
		logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock)
584
	}
585

586
	detachString := ctr.runtime.config.Engine.DetachKeys
587
	if detachKeys != nil {
588
		detachString = *detachKeys
589
	}
590
	isDetach, err := processDetachKeys(detachString)
591
	if err != nil {
592
		return err
593
	}
594

595
	attachStdout := true
596
	attachStderr := true
597
	attachStdin := true
598
	if streams != nil {
599
		attachStdout = streams.Stdout
600
		attachStderr = streams.Stderr
601
		attachStdin = streams.Stdin
602
	}
603

604
	logrus.Debugf("Going to hijack container %s attach connection", ctr.ID())
605

606
	// Alright, let's hijack.
607
	hijacker, ok := w.(http.Hijacker)
608
	if !ok {
609
		return fmt.Errorf("unable to hijack connection")
610
	}
611

612
	httpCon, httpBuf, err := hijacker.Hijack()
613
	if err != nil {
614
		return fmt.Errorf("hijacking connection: %w", err)
615
	}
616

617
	hijackDone <- true
618

619
	writeHijackHeader(req, httpBuf, isTerminal)
620

621
	// Force a flush after the header is written.
622
	if err := httpBuf.Flush(); err != nil {
623
		return fmt.Errorf("flushing HTTP hijack header: %w", err)
624
	}
625

626
	defer func() {
627
		hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf)
628
	}()
629

630
	logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID())
631

632
	// TODO: This is gross. Really, really gross.
633
	// I want to say we should read all the logs into an array before
634
	// calling this, in container_api.go, but that could take a lot of
635
	// memory...
636
	// On the whole, we need to figure out a better way of doing this,
637
	// though.
638
	logSize := 0
639
	if streamLogs {
640
		logrus.Debugf("Will stream logs for container %s attach session", ctr.ID())
641

642
		// Get all logs for the container
643
		logChan := make(chan *logs.LogLine)
644
		logOpts := new(logs.LogOptions)
645
		logOpts.Tail = -1
646
		logOpts.WaitGroup = new(sync.WaitGroup)
647
		errChan := make(chan error)
648
		go func() {
649
			var err error
650
			// In non-terminal mode we need to prepend with the
651
			// stream header.
652
			logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID())
653
			for logLine := range logChan {
654
				if !isTerminal {
655
					device := logLine.Device
656
					var header []byte
657
					headerLen := uint32(len(logLine.Msg))
658
					if !logLine.Partial() {
659
						// we append an extra newline in this case so we need to increment the len as well
660
						headerLen++
661
					}
662
					logSize += len(logLine.Msg)
663
					switch strings.ToLower(device) {
664
					case "stdin":
665
						header = makeHTTPAttachHeader(0, headerLen)
666
					case "stdout":
667
						header = makeHTTPAttachHeader(1, headerLen)
668
					case "stderr":
669
						header = makeHTTPAttachHeader(2, headerLen)
670
					default:
671
						logrus.Errorf("Unknown device for log line: %s", device)
672
						header = makeHTTPAttachHeader(1, headerLen)
673
					}
674
					_, err = httpBuf.Write(header)
675
					if err != nil {
676
						break
677
					}
678
				}
679
				_, err = httpBuf.Write([]byte(logLine.Msg))
680
				if err != nil {
681
					break
682
				}
683
				if !logLine.Partial() {
684
					_, err = httpBuf.Write([]byte("\n"))
685
					if err != nil {
686
						break
687
					}
688
				}
689
				err = httpBuf.Flush()
690
				if err != nil {
691
					break
692
				}
693
			}
694
			errChan <- err
695
		}()
696
		if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil {
697
			return err
698
		}
699
		go func() {
700
			logOpts.WaitGroup.Wait()
701
			close(logChan)
702
		}()
703
		logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize)
704
		if err := <-errChan; err != nil {
705
			return err
706
		}
707
	}
708
	if !streamAttach {
709
		logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID())
710
		return nil
711
	}
712

713
	logrus.Debugf("Forwarding attach output for container %s", ctr.ID())
714

715
	stdoutChan := make(chan error)
716
	stdinChan := make(chan error)
717

718
	// Handle STDOUT/STDERR
719
	go func() {
720
		var err error
721
		if isTerminal {
722
			// Hack: return immediately if attachStdout not set to
723
			// emulate Docker.
724
			// Basically, when terminal is set, STDERR goes nowhere.
725
			// Everything does over STDOUT.
726
			// Therefore, if not attaching STDOUT - we'll never copy
727
			// anything from here.
728
			logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID())
729
			if attachStdout {
730
				err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID())
731
			}
732
		} else {
733
			logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID())
734
			err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr)
735
		}
736
		stdoutChan <- err
737
		logrus.Debugf("STDOUT/ERR copy completed")
738
	}()
739
	// Next, STDIN. Avoid entirely if attachStdin unset.
740
	if attachStdin {
741
		go func() {
742
			_, err := detach.Copy(conn, httpBuf, isDetach)
743
			logrus.Debugf("STDIN copy completed")
744
			stdinChan <- err
745
		}()
746
	}
747

748
	for {
749
		select {
750
		case err := <-stdoutChan:
751
			if err != nil {
752
				return err
753
			}
754

755
			return nil
756
		case err := <-stdinChan:
757
			if err != nil {
758
				return err
759
			}
760
			// copy stdin is done, close it
761
			if connErr := socketCloseWrite(conn); connErr != nil {
762
				logrus.Errorf("Unable to close conn: %v", connErr)
763
			}
764
		case <-cancel:
765
			return nil
766
		}
767
	}
768
}
769

770
// isRetryable reports whether err carries a syscall.Errno of EINTR or EAGAIN,
// i.e. the error was caused by a blocked syscall or the operation on a
// non-blocking file descriptor was not ready for completion. Errors without
// an Errno in their chain are never retryable.
func isRetryable(err error) bool {
	var errno syscall.Errno
	if !errors.As(err, &errno) {
		return false
	}
	switch errno {
	case syscall.EINTR, syscall.EAGAIN:
		return true
	}
	return false
}
779

780
// openControlFile opens the terminal control file.
781
func openControlFile(ctr *Container, parentDir string) (*os.File, error) {
782
	controlPath := filepath.Join(parentDir, "ctl")
783
	for i := 0; i < 600; i++ {
784
		controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0)
785
		if err == nil {
786
			return controlFile, nil
787
		}
788
		if !isRetryable(err) {
789
			return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err)
790
		}
791
		time.Sleep(time.Second / 10)
792
	}
793
	return nil, fmt.Errorf("timeout waiting for %q", controlPath)
794
}
795

796
// AttachResize resizes the terminal used by the given container.
797
func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error {
798
	controlFile, err := openControlFile(ctr, ctr.bundlePath())
799
	if err != nil {
800
		return err
801
	}
802
	defer controlFile.Close()
803

804
	logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize)
805
	if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil {
806
		return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err)
807
	}
808

809
	return nil
810
}
811

812
// CheckpointContainer checkpoints the given container.
813
func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) {
814
	// imagePath is used by CRIU to store the actual checkpoint files
815
	imagePath := ctr.CheckpointPath()
816
	if options.PreCheckPoint {
817
		imagePath = ctr.PreCheckPointPath()
818
	}
819
	// workPath will be used to store dump.log and stats-dump
820
	workPath := ctr.bundlePath()
821
	logrus.Debugf("Writing checkpoint to %s", imagePath)
822
	logrus.Debugf("Writing checkpoint logs to %s", workPath)
823
	logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint)
824
	args := []string{}
825
	args = append(args, r.runtimeFlags...)
826
	args = append(args, "checkpoint")
827
	args = append(args, "--image-path")
828
	args = append(args, imagePath)
829
	args = append(args, "--work-path")
830
	args = append(args, workPath)
831
	if options.KeepRunning {
832
		args = append(args, "--leave-running")
833
	}
834
	if options.TCPEstablished {
835
		args = append(args, "--tcp-established")
836
	}
837
	if options.FileLocks {
838
		args = append(args, "--file-locks")
839
	}
840
	if !options.PreCheckPoint && options.KeepRunning {
841
		args = append(args, "--leave-running")
842
	}
843
	if options.PreCheckPoint {
844
		args = append(args, "--pre-dump")
845
	}
846
	if !options.PreCheckPoint && options.WithPrevious {
847
		args = append(
848
			args,
849
			"--parent-path",
850
			filepath.Join("..", preCheckpointDir),
851
		)
852
	}
853

854
	args = append(args, ctr.ID())
855
	logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " "))
856

857
	runtimeDir, err := util.GetRootlessRuntimeDir()
858
	if err != nil {
859
		return 0, err
860
	}
861
	env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
862
	if path, ok := os.LookupEnv("PATH"); ok {
863
		env = append(env, fmt.Sprintf("PATH=%s", path))
864
	}
865

866
	var runtimeCheckpointStarted time.Time
867
	err = r.withContainerSocketLabel(ctr, func() error {
868
		runtimeCheckpointStarted = time.Now()
869
		return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...)
870
	})
871

872
	runtimeCheckpointDuration := func() int64 {
873
		if options.PrintStats {
874
			return time.Since(runtimeCheckpointStarted).Microseconds()
875
		}
876
		return 0
877
	}()
878

879
	return runtimeCheckpointDuration, err
880
}
881

882
// CheckConmonRunning reports whether the Conmon process monitoring the given
// container appears to be alive.
//
// If a Conmon PID is recorded in the container state it is probed with signal
// 0. If no PID is recorded, the answer is derived from the container state:
// a running or paused container is assumed to still have a Conmon.
func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
	conmonPID := ctr.state.ConmonPID
	if conmonPID != 0 {
		// We have a conmon PID. Ping it with signal 0.
		err := unix.Kill(conmonPID, 0)
		switch {
		case err == nil:
			return true, nil
		case err == unix.ESRCH:
			// No such process: Conmon is gone.
			return false, nil
		default:
			return false, fmt.Errorf("pinging container %s conmon with signal 0: %w", ctr.ID(), err)
		}
	}

	// No Conmon PID recorded. If the container is not running or paused that
	// is expected, and Conmon is not running.
	if !ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
		return false, nil
	}

	// The container is running or paused, so assume Conmon is running too.
	// Some old versions did not record the Conmon PID, which is the likely
	// explanation. It is unusual enough to deserve a warning.
	logrus.Warnf("Conmon PID is not set, but container is running!")
	return true, nil
}
906

907
// SupportsCheckpoint checks if the OCI runtime supports checkpointing
908
// containers.
909
func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {
910
	return crutils.CRRuntimeSupportsCheckpointRestore(r.path)
911
}
912

913
// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error
914
// messages.
915
func (r *ConmonOCIRuntime) SupportsJSONErrors() bool {
916
	return r.supportsJSON
917
}
918

919
// SupportsNoCgroups checks if the OCI runtime supports running containers
920
// without cgroups (the --cgroup-manager=disabled flag).
921
func (r *ConmonOCIRuntime) SupportsNoCgroups() bool {
922
	return r.supportsNoCgroups
923
}
924

925
// SupportsKVM checks if the OCI runtime supports running containers
926
// without KVM separation
927
func (r *ConmonOCIRuntime) SupportsKVM() bool {
928
	return r.supportsKVM
929
}
930

931
// AttachSocketPath is the path to a single container's attach socket.
932
func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) {
933
	if ctr == nil {
934
		return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg)
935
	}
936

937
	return filepath.Join(ctr.bundlePath(), "attach"), nil
938
}
939

940
// ExitFilePath is the path to a container's exit file.
941
func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) {
942
	if ctr == nil {
943
		return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg)
944
	}
945
	return filepath.Join(r.exitsDir, ctr.ID()), nil
946
}
947

948
// OOMFilePath is the path to a container's oom file.
949
// The oom file will only exist if the container was oom killed.
950
func (r *ConmonOCIRuntime) OOMFilePath(ctr *Container) (string, error) {
951
	return filepath.Join(r.persistDir, ctr.ID(), "oom"), nil
952
}
953

954
// RuntimeInfo provides information on the runtime.
955
func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) {
956
	runtimePackage := version.Package(r.path)
957
	conmonPackage := version.Package(r.conmonPath)
958
	runtimeVersion, err := r.getOCIRuntimeVersion()
959
	if err != nil {
960
		return nil, nil, fmt.Errorf("getting version of OCI runtime %s: %w", r.name, err)
961
	}
962
	conmonVersion, err := r.getConmonVersion()
963
	if err != nil {
964
		return nil, nil, fmt.Errorf("getting conmon version: %w", err)
965
	}
966

967
	conmon := define.ConmonInfo{
968
		Package: conmonPackage,
969
		Path:    r.conmonPath,
970
		Version: conmonVersion,
971
	}
972
	ocirt := define.OCIRuntimeInfo{
973
		Name:    r.name,
974
		Path:    r.path,
975
		Package: runtimePackage,
976
		Version: runtimeVersion,
977
	}
978
	return &conmon, &ocirt, nil
979
}
980

981
// makeAccessible adds the execute/search bits for user, group and other
// (--x--x--x) to path and to each of its parent directories, where they are
// missing.
//
// Entries owned by exactly uid:gid are left untouched. The walk goes upwards
// from path and stops, without error, at the first entry that does not exist.
// The filesystem root "/" itself is never modified. For a relative path the
// walk ends after "." has been processed.
//
// Any other stat or chmod failure is returned as is.
func makeAccessible(path string, uid, gid int) error {
	for path != "/" {
		st, err := os.Stat(path)
		if err != nil {
			if os.IsNotExist(err) {
				return nil
			}
			return err
		}

		// Ownership is only known when the platform stat data is available;
		// without it the entry is treated as not owned by uid:gid.
		owned := false
		if sys, ok := st.Sys().(*syscall.Stat_t); ok {
			owned = int(sys.Uid) == uid && int(sys.Gid) == gid
		}

		if !owned && st.Mode()&0111 != 0111 {
			if err := os.Chmod(path, st.Mode()|0111); err != nil {
				return err
			}
		}

		parent := filepath.Dir(path)
		if parent == path {
			// filepath.Dir is a fixed point here (e.g. "." for relative
			// paths), so there is nothing further up to visit. Without
			// this check a relative path would loop forever.
			break
		}
		path = parent
	}
	return nil
}
1002

1003
// Wait for a container which has been sent a signal to stop
1004
func waitContainerStop(ctr *Container, timeout time.Duration) error {
1005
	return waitPidStop(ctr.state.PID, timeout)
1006
}
1007

1008
// Wait for a given PID to stop
1009
func waitPidStop(pid int, timeout time.Duration) error {
1010
	timer := time.NewTimer(timeout)
1011
	defer timer.Stop()
1012
	for {
1013
		select {
1014
		case <-timer.C:
1015
			return fmt.Errorf("given PID did not die within timeout")
1016
		default:
1017
			if err := unix.Kill(pid, 0); err != nil {
1018
				if err == unix.ESRCH {
1019
					return nil
1020
				}
1021
				logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err)
1022
			}
1023
			time.Sleep(10 * time.Millisecond)
1024
		}
1025
	}
1026
}
1027

1028
// getLogTag renders the container's log tag. The tag is treated as a
// text/template and executed against the container's inspect data.
//
// An empty tag yields an empty result. If the inspect data cannot be obtained
// the tag is silently dropped (empty result, nil error). Template parse and
// execution failures are returned as errors.
func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) {
	tagTemplate := ctr.LogTag()
	if tagTemplate == "" {
		return "", nil
	}

	inspectData, err := ctr.inspectLocked(false)
	if err != nil {
		// FIXME: this error should probably be returned
		return "", nil //nolint: nilerr
	}

	parsed, err := template.New("container").Parse(tagTemplate)
	if err != nil {
		return "", fmt.Errorf("template parsing error %s: %w", tagTemplate, err)
	}

	var rendered bytes.Buffer
	if err := parsed.Execute(&rendered, inspectData); err != nil {
		return "", err
	}
	return rendered.String(), nil
}
1049

1050
func getPreserveFdExtraFiles(preserveFD []uint, preserveFDs uint) (uint, []*os.File, []*os.File, error) {
1051
	var filesToClose []*os.File
1052
	var extraFiles []*os.File
1053

1054
	preserveFDsMap := make(map[uint]struct{})
1055
	for _, i := range preserveFD {
1056
		if i < 3 {
1057
			return 0, nil, nil, fmt.Errorf("cannot preserve FD %d, consider using the passthrough log-driver to pass STDIO streams into the container: %w", i, define.ErrInvalidArg)
1058
		}
1059
		if i-2 > preserveFDs {
1060
			// preserveFDs is the number of FDs above 2 to keep around.
1061
			// e.g. if the user specified FD=3, then preserveFDs must be 1.
1062
			preserveFDs = i - 2
1063
		}
1064
		preserveFDsMap[i] = struct{}{}
1065
	}
1066

1067
	if preserveFDs > 0 {
1068
		for fd := 3; fd < int(3+preserveFDs); fd++ {
1069
			if len(preserveFDsMap) > 0 {
1070
				if _, ok := preserveFDsMap[uint(fd)]; !ok {
1071
					extraFiles = append(extraFiles, nil)
1072
					continue
1073
				}
1074
			}
1075
			f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd))
1076
			filesToClose = append(filesToClose, f)
1077
			extraFiles = append(extraFiles, f)
1078
		}
1079
	}
1080
	return preserveFDs, filesToClose, extraFiles, nil
1081
}
1082

1083
// createOCIContainer generates this container's main conmon instance and prepares it for starting
1084
func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
1085
	var stderrBuf bytes.Buffer
1086

1087
	parentSyncPipe, childSyncPipe, err := newPipe()
1088
	if err != nil {
1089
		return 0, fmt.Errorf("creating socket pair: %w", err)
1090
	}
1091
	defer errorhandling.CloseQuiet(parentSyncPipe)
1092

1093
	childStartPipe, parentStartPipe, err := newPipe()
1094
	if err != nil {
1095
		return 0, fmt.Errorf("creating socket pair for start pipe: %w", err)
1096
	}
1097

1098
	defer errorhandling.CloseQuiet(parentStartPipe)
1099

1100
	var ociLog string
1101
	if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON {
1102
		ociLog = filepath.Join(ctr.state.RunDir, "oci-log")
1103
	}
1104

1105
	logTag, err := r.getLogTag(ctr)
1106
	if err != nil {
1107
		return 0, err
1108
	}
1109

1110
	if ctr.config.CgroupsMode == cgroupSplit {
1111
		if err := moveToRuntimeCgroup(); err != nil {
1112
			return 0, err
1113
		}
1114
	}
1115

1116
	pidfile := ctr.config.PidFile
1117
	if pidfile == "" {
1118
		pidfile = filepath.Join(ctr.state.RunDir, "pidfile")
1119
	}
1120

1121
	persistDir := filepath.Join(r.persistDir, ctr.ID())
1122
	args, err := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, persistDir, ociLog, ctr.LogDriver(), logTag)
1123
	if err != nil {
1124
		return 0, err
1125
	}
1126

1127
	if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" {
1128
		args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket))
1129
	}
1130

1131
	if ctr.Terminal() {
1132
		args = append(args, "-t")
1133
	} else if ctr.config.Stdin {
1134
		args = append(args, "-i")
1135
	}
1136

1137
	if ctr.config.Timeout > 0 {
1138
		args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout))
1139
	}
1140

1141
	if !r.enableKeyring {
1142
		args = append(args, "--no-new-keyring")
1143
	}
1144
	if ctr.config.ConmonPidFile != "" {
1145
		args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
1146
	}
1147

1148
	if r.noPivot {
1149
		args = append(args, "--no-pivot")
1150
	}
1151

1152
	exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, ctr.runtime.syslog || logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false)
1153
	if err != nil {
1154
		return 0, err
1155
	}
1156
	exitCommand = append(exitCommand, ctr.config.ID)
1157

1158
	args = append(args, "--exit-command", exitCommand[0])
1159
	for _, arg := range exitCommand[1:] {
1160
		args = append(args, []string{"--exit-command-arg", arg}...)
1161
	}
1162

1163
	preserveFDs := ctr.config.PreserveFDs
1164

1165
	// Pass down the LISTEN_* environment (see #10443).
1166
	if val := os.Getenv("LISTEN_FDS"); val != "" {
1167
		if preserveFDs > 0 || len(ctr.config.PreserveFD) > 0 {
1168
			logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs")
1169
		} else {
1170
			fds, err := strconv.Atoi(val)
1171
			if err != nil {
1172
				return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err)
1173
			}
1174
			preserveFDs = uint(fds)
1175
		}
1176
	}
1177

1178
	preserveFDs, filesToClose, extraFiles, err := getPreserveFdExtraFiles(ctr.config.PreserveFD, preserveFDs)
1179
	if err != nil {
1180
		return 0, err
1181
	}
1182
	if preserveFDs > 0 {
1183
		args = append(args, formatRuntimeOpts("--preserve-fds", strconv.FormatUint(uint64(preserveFDs), 10))...)
1184
	}
1185

1186
	if restoreOptions != nil {
1187
		args = append(args, "--restore", ctr.CheckpointPath())
1188
		if restoreOptions.TCPEstablished {
1189
			args = append(args, "--runtime-opt", "--tcp-established")
1190
		}
1191
		if restoreOptions.FileLocks {
1192
			args = append(args, "--runtime-opt", "--file-locks")
1193
		}
1194
		if restoreOptions.Pod != "" {
1195
			mountLabel := ctr.config.MountLabel
1196
			processLabel := ctr.config.ProcessLabel
1197
			if mountLabel != "" {
1198
				args = append(
1199
					args,
1200
					"--runtime-opt",
1201
					fmt.Sprintf(
1202
						"--lsm-mount-context=%s",
1203
						mountLabel,
1204
					),
1205
				)
1206
			}
1207
			if processLabel != "" {
1208
				args = append(
1209
					args,
1210
					"--runtime-opt",
1211
					fmt.Sprintf(
1212
						"--lsm-profile=selinux:%s",
1213
						processLabel,
1214
					),
1215
				)
1216
			}
1217
		}
1218
	}
1219

1220
	logrus.WithFields(logrus.Fields{
1221
		"args": args,
1222
	}).Debugf("running conmon: %s", r.conmonPath)
1223

1224
	cmd := exec.Command(r.conmonPath, args...)
1225
	cmd.SysProcAttr = &syscall.SysProcAttr{
1226
		Setpgid: true,
1227
	}
1228
	// TODO this is probably a really bad idea for some uses
1229
	// Make this configurable
1230
	cmd.Stdin = os.Stdin
1231
	cmd.Stdout = os.Stdout
1232
	cmd.Stderr = os.Stderr
1233
	if ctr.Terminal() {
1234
		cmd.Stderr = &stderrBuf
1235
	}
1236

1237
	// 0, 1 and 2 are stdin, stdout and stderr
1238
	conmonEnv, err := r.configureConmonEnv()
1239
	if err != nil {
1240
		return 0, fmt.Errorf("configuring conmon env: %w", err)
1241
	}
1242

1243
	cmd.ExtraFiles = extraFiles
1244

1245
	cmd.Env = r.conmonEnv
1246
	// we don't want to step on users fds they asked to preserve
1247
	// Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3
1248
	cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4))
1249
	cmd.Env = append(cmd.Env, conmonEnv...)
1250
	cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe)
1251

1252
	if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() {
1253
		ports, err := bindPorts(ctr.convertPortMappings())
1254
		if err != nil {
1255
			return 0, err
1256
		}
1257
		filesToClose = append(filesToClose, ports...)
1258

1259
		// Leak the port we bound in the conmon process.  These fd's won't be used
1260
		// by the container and conmon will keep the ports busy so that another
1261
		// process cannot use them.
1262
		cmd.ExtraFiles = append(cmd.ExtraFiles, ports...)
1263
	}
1264

1265
	if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() {
1266
		if ctr.config.PostConfigureNetNS {
1267
			havePortMapping := len(ctr.config.PortMappings) > 0
1268
			if havePortMapping {
1269
				ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe()
1270
				if err != nil {
1271
					return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err)
1272
				}
1273
			}
1274
			ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe()
1275
			if err != nil {
1276
				return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err)
1277
			}
1278
		}
1279

1280
		if ctr.rootlessSlirpSyncW != nil {
1281
			defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW)
1282
			// Leak one end in conmon, the other one will be leaked into slirp4netns
1283
			cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW)
1284
		}
1285

1286
		if ctr.rootlessPortSyncW != nil {
1287
			defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW)
1288
			// Leak one end in conmon, the other one will be leaked into rootlessport
1289
			cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW)
1290
		}
1291
	}
1292
	var runtimeRestoreStarted time.Time
1293
	if restoreOptions != nil {
1294
		runtimeRestoreStarted = time.Now()
1295
	}
1296
	err = cmd.Start()
1297

1298
	// regardless of whether we errored or not, we no longer need the children pipes
1299
	childSyncPipe.Close()
1300
	childStartPipe.Close()
1301
	if err != nil {
1302
		return 0, err
1303
	}
1304
	if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil {
1305
		// The child likely already exited in which case the cmd.Wait() below should return the proper error.
1306
		// EPIPE is expected if the child already exited so not worth to log and kill the process.
1307
		if !errors.Is(err, syscall.EPIPE) {
1308
			logrus.Errorf("Failed to signal conmon to start: %v", err)
1309
			if err := cmd.Process.Kill(); err != nil && !errors.Is(err, syscall.ESRCH) {
1310
				logrus.Errorf("Failed to kill conmon after error: %v", err)
1311
			}
1312
		}
1313
	}
1314

1315
	/* Wait for initial setup and fork, and reap child */
1316
	err = cmd.Wait()
1317
	if err != nil {
1318
		return 0, fmt.Errorf("conmon failed: %w", err)
1319
	}
1320

1321
	pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog)
1322
	if err != nil {
1323
		if err2 := r.DeleteContainer(ctr); err2 != nil {
1324
			logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID())
1325
		}
1326
		return 0, err
1327
	}
1328
	ctr.state.PID = pid
1329

1330
	conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile)
1331
	if err != nil {
1332
		logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err)
1333
	} else if conmonPID > 0 {
1334
		// conmon not having a pid file is a valid state, so don't set it if we don't have it
1335
		logrus.Infof("Got Conmon PID as %d", conmonPID)
1336
		ctr.state.ConmonPID = conmonPID
1337
	}
1338

1339
	runtimeRestoreDuration := func() int64 {
1340
		if restoreOptions != nil && restoreOptions.PrintStats {
1341
			return time.Since(runtimeRestoreStarted).Microseconds()
1342
		}
1343
		return 0
1344
	}()
1345

1346
	// These fds were passed down to the runtime.  Close them
1347
	// and not interfere
1348
	for _, f := range filesToClose {
1349
		errorhandling.CloseQuiet(f)
1350
	}
1351

1352
	return runtimeRestoreDuration, nil
1353
}
1354

1355
// configureConmonEnv gets the environment values to add to conmon's exec struct
1356
func (r *ConmonOCIRuntime) configureConmonEnv() ([]string, error) {
1357
	env := os.Environ()
1358
	res := make([]string, 0, len(env))
1359
	for _, v := range env {
1360
		if strings.HasPrefix(v, "NOTIFY_SOCKET=") {
1361
			// The NOTIFY_SOCKET must not leak into the environment.
1362
			continue
1363
		}
1364
		if strings.HasPrefix(v, "DBUS_SESSION_BUS_ADDRESS=") && !rootless.IsRootless() {
1365
			// The DBUS_SESSION_BUS_ADDRESS must not leak into the environment when running as root.
1366
			// This is because we want to use the system session for root containers, not the user session.
1367
			continue
1368
		}
1369
		res = append(res, v)
1370
	}
1371
	runtimeDir, err := util.GetRootlessRuntimeDir()
1372
	if err != nil {
1373
		return nil, err
1374
	}
1375

1376
	res = append(res, "XDG_RUNTIME_DIR="+runtimeDir)
1377
	return res, nil
1378
}
1379

1380
// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI
1381
// func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, persistDir, ociLogPath, logDriver, logTag string) ([]string, error) {
1382
func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, persistDir, ociLogPath, logDriver, logTag string) ([]string, error) {
1383
	// Make the persists directory for the container after the ctr ID is appended to it in the caller
1384
	// This is needed as conmon writes the exit and oom file in the given persist directory path as just "exit" and "oom"
1385
	// So creating a directory with the container ID under the persist dir will help keep track of which container the
1386
	// exit and oom files belong to.
1387
	if err := os.MkdirAll(persistDir, 0750); err != nil {
1388
		return nil, fmt.Errorf("creating OCI runtime oom files directory for ctr %q: %w", ctr.ID(), err)
1389
	}
1390

1391
	// set the conmon API version to be able to use the correct sync struct keys
1392
	args := []string{
1393
		"--api-version", "1",
1394
		"-c", ctr.ID(),
1395
		"-u", cuuid,
1396
		"-r", r.path,
1397
		"-b", bundlePath,
1398
		"-p", pidPath,
1399
		"-n", ctr.Name(),
1400
		"--exit-dir", exitDir,
1401
		"--persist-dir", persistDir,
1402
		"--full-attach",
1403
	}
1404
	if len(r.runtimeFlags) > 0 {
1405
		rFlags := []string{}
1406
		for _, arg := range r.runtimeFlags {
1407
			rFlags = append(rFlags, "--runtime-arg", arg)
1408
		}
1409
		args = append(args, rFlags...)
1410
	}
1411

1412
	if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit {
1413
		args = append(args, "-s")
1414
	}
1415

1416
	var logDriverArg string
1417
	switch logDriver {
1418
	case define.JournaldLogging:
1419
		logDriverArg = define.JournaldLogging
1420
	case define.NoLogging:
1421
		logDriverArg = define.NoLogging
1422
	case define.PassthroughLogging, define.PassthroughTTYLogging:
1423
		logDriverArg = define.PassthroughLogging
1424
	//lint:ignore ST1015 the default case has to be here
1425
	default: //nolint:gocritic
1426
		// No case here should happen except JSONLogging, but keep this here in case the options are extended
1427
		logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
1428
		fallthrough
1429
	case "":
1430
		// to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod
1431
		// since the former case is obscure, and the latter case isn't an error, let's silently fallthrough
1432
		fallthrough
1433
	case define.JSONLogging:
1434
		fallthrough
1435
	case define.KubernetesLogging:
1436
		logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath)
1437
	}
1438

1439
	args = append(args, "-l", logDriverArg)
1440
	logLevel := logrus.GetLevel()
1441
	args = append(args, "--log-level", logLevel.String())
1442

1443
	logrus.Debugf("%s messages will be logged to syslog", r.conmonPath)
1444
	args = append(args, "--syslog")
1445

1446
	size := r.logSizeMax
1447
	if ctr.config.LogSize > 0 {
1448
		size = ctr.config.LogSize
1449
	}
1450
	if size > 0 {
1451
		args = append(args, "--log-size-max", strconv.FormatInt(size, 10))
1452
	}
1453

1454
	if ociLogPath != "" {
1455
		args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath))
1456
	}
1457
	if logTag != "" {
1458
		args = append(args, "--log-tag", logTag)
1459
	}
1460
	if ctr.config.NoCgroups {
1461
		logrus.Debugf("Running with no Cgroups")
1462
		args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled")
1463
	}
1464
	return args, nil
1465
}
1466

1467
// newPipe creates a unix socket pair for communication.
1468
// Returns two files - first is parent, second is child.
1469
func newPipe() (*os.File, *os.File, error) {
1470
	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
1471
	if err != nil {
1472
		return nil, nil, err
1473
	}
1474
	return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
1475
}
1476

1477
// readConmonPidFile attempts to read conmon's pid from its pid file.
//
// An empty pidFile means that no pid file is in use and yields (0, nil).
// Otherwise the file content is parsed as a decimal integer, ignoring
// surrounding whitespace such as a trailing newline. If the file cannot be
// read or parsed, -1 and the error are returned.
func readConmonPidFile(pidFile string) (int, error) {
	if pidFile == "" {
		return 0, nil
	}

	contents, err := os.ReadFile(pidFile)
	if err != nil {
		return -1, err
	}

	// Trim first: strconv.Atoi rejects any surrounding whitespace.
	conmonPID, err := strconv.Atoi(strings.TrimSpace(string(contents)))
	if err != nil {
		return -1, fmt.Errorf("parsing conmon PID from %s: %w", pidFile, err)
	}
	return conmonPID, nil
}
1494

1495
// readConmonPipeData attempts to read a syncInfo struct from the pipe
1496
func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) {
1497
	// syncInfo is used to return data from monitor process to daemon
1498
	type syncInfo struct {
1499
		Data    int    `json:"data"`
1500
		Message string `json:"message,omitempty"`
1501
	}
1502

1503
	// Wait to get container pid from conmon
1504
	type syncStruct struct {
1505
		si  *syncInfo
1506
		err error
1507
	}
1508
	ch := make(chan syncStruct)
1509
	go func() {
1510
		var si *syncInfo
1511
		rdr := bufio.NewReader(pipe)
1512
		b, err := rdr.ReadBytes('\n')
1513
		// ignore EOF here, error is returned even when data was read
1514
		// if it is no valid json unmarshal will fail below
1515
		if err != nil && !errors.Is(err, io.EOF) {
1516
			ch <- syncStruct{err: err}
1517
		}
1518
		if err := json.Unmarshal(b, &si); err != nil {
1519
			ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)}
1520
			return
1521
		}
1522
		ch <- syncStruct{si: si}
1523
	}()
1524

1525
	var data int
1526
	select {
1527
	case ss := <-ch:
1528
		if ss.err != nil {
1529
			if ociLog != "" {
1530
				ociLogData, err := os.ReadFile(ociLog)
1531
				if err == nil {
1532
					var ociErr ociError
1533
					if err := json.Unmarshal(ociLogData, &ociErr); err == nil {
1534
						return -1, getOCIRuntimeError(runtimeName, ociErr.Msg)
1535
					}
1536
				}
1537
			}
1538
			return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err)
1539
		}
1540
		logrus.Debugf("Received: %d", ss.si.Data)
1541
		if ss.si.Data < 0 {
1542
			if ociLog != "" {
1543
				ociLogData, err := os.ReadFile(ociLog)
1544
				if err == nil {
1545
					var ociErr ociError
1546
					if err := json.Unmarshal(ociLogData, &ociErr); err == nil {
1547
						return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg)
1548
					}
1549
				}
1550
			}
1551
			// If we failed to parse the JSON errors, then print the output as it is
1552
			if ss.si.Message != "" {
1553
				return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message)
1554
			}
1555
			return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal)
1556
		}
1557
		data = ss.si.Data
1558
	case <-time.After(define.ContainerCreateTimeout):
1559
		return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal)
1560
	}
1561
	return data, nil
1562
}
1563

1564
// writeConmonPipeData writes a single zero byte to the pipe as nonce data and
// returns the error of that write, if any.
func writeConmonPipeData(pipe *os.File) error {
	if _, err := pipe.Write([]byte{0}); err != nil {
		return err
	}
	return nil
}
1570

1571
// formatRuntimeOpts prefixes every given option with --runtime-opt so that it
// can be passed to conmon. The result has twice as many elements as opts and
// is empty, but non-nil, when no options are given.
func formatRuntimeOpts(opts ...string) []string {
	formatted := make([]string, 2*len(opts))
	for i, opt := range opts {
		formatted[2*i] = "--runtime-opt"
		formatted[2*i+1] = opt
	}
	return formatted
}
1579

1580
// getConmonVersion returns a string representation of the conmon version.
1581
func (r *ConmonOCIRuntime) getConmonVersion() (string, error) {
1582
	output, err := utils.ExecCmd(r.conmonPath, "--version")
1583
	if err != nil {
1584
		return "", err
1585
	}
1586
	return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil
1587
}
1588

1589
// getOCIRuntimeVersion returns a string representation of the OCI runtime's
1590
// version.
1591
func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) {
1592
	output, err := utils.ExecCmd(r.path, "--version")
1593
	if err != nil {
1594
		return "", err
1595
	}
1596
	return strings.TrimSuffix(output, "\n"), nil
1597
}
1598

1599
// Copy data from container to HTTP connection, for terminal attach.
1600
// Container is the container's attach socket connection, http is a buffer for
1601
// the HTTP connection. cid is the ID of the container the attach session is
1602
// running for (used solely for error messages).
1603
func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error {
1604
	buf := make([]byte, bufferSize)
1605
	for {
1606
		numR, err := container.Read(buf)
1607
		logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid)
1608

1609
		if numR > 0 {
1610
			switch buf[0] {
1611
			case AttachPipeStdout:
1612
				// Do nothing
1613
			default:
1614
				logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR)
1615
				continue
1616
			}
1617

1618
			numW, err2 := http.Write(buf[1:numR])
1619
			if err2 != nil {
1620
				if err != nil {
1621
					logrus.Errorf("Reading container %s STDOUT: %v", cid, err)
1622
				}
1623
				return err2
1624
			} else if numW+1 != numR {
1625
				return io.ErrShortWrite
1626
			}
1627
			// We need to force the buffer to write immediately, so
1628
			// there isn't a delay on the terminal side.
1629
			if err2 := http.Flush(); err2 != nil {
1630
				if err != nil {
1631
					logrus.Errorf("Reading container %s STDOUT: %v", cid, err)
1632
				}
1633
				return err2
1634
			}
1635
		}
1636
		if err != nil {
1637
			if err == io.EOF {
1638
				return nil
1639
			}
1640
			return err
1641
		}
1642
	}
1643
}
1644

1645
// Copy data from a container to an HTTP connection, for non-terminal attach.
1646
// Appends a header to multiplex input.
1647
func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error {
1648
	buf := make([]byte, bufferSize)
1649
	for {
1650
		numR, err := container.Read(buf)
1651
		if numR > 0 {
1652
			var headerBuf []byte
1653

1654
			// Subtract 1 because we strip the first byte (used for
1655
			// multiplexing by Conmon).
1656
			headerLen := uint32(numR - 1)
1657
			// Practically speaking, we could make this buf[0] - 1,
1658
			// but we need to validate it anyway.
1659
			switch buf[0] {
1660
			case AttachPipeStdin:
1661
				headerBuf = makeHTTPAttachHeader(0, headerLen)
1662
				if !stdin {
1663
					continue
1664
				}
1665
			case AttachPipeStdout:
1666
				if !stdout {
1667
					continue
1668
				}
1669
				headerBuf = makeHTTPAttachHeader(1, headerLen)
1670
			case AttachPipeStderr:
1671
				if !stderr {
1672
					continue
1673
				}
1674
				headerBuf = makeHTTPAttachHeader(2, headerLen)
1675
			default:
1676
				logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR)
1677
				continue
1678
			}
1679

1680
			numH, err2 := http.Write(headerBuf)
1681
			if err2 != nil {
1682
				if err != nil {
1683
					logrus.Errorf("Reading container %s standard streams: %v", cid, err)
1684
				}
1685

1686
				return err2
1687
			}
1688
			// Hardcoding header length is pretty gross, but
1689
			// fast. Should be safe, as this is a fixed part
1690
			// of the protocol.
1691
			if numH != 8 {
1692
				if err != nil {
1693
					logrus.Errorf("Reading container %s standard streams: %v", cid, err)
1694
				}
1695

1696
				return io.ErrShortWrite
1697
			}
1698

1699
			numW, err2 := http.Write(buf[1:numR])
1700
			if err2 != nil {
1701
				if err != nil {
1702
					logrus.Errorf("Reading container %s standard streams: %v", cid, err)
1703
				}
1704

1705
				return err2
1706
			} else if numW+1 != numR {
1707
				if err != nil {
1708
					logrus.Errorf("Reading container %s standard streams: %v", cid, err)
1709
				}
1710

1711
				return io.ErrShortWrite
1712
			}
1713
			// We need to force the buffer to write immediately, so
1714
			// there isn't a delay on the terminal side.
1715
			if err2 := http.Flush(); err2 != nil {
1716
				if err != nil {
1717
					logrus.Errorf("Reading container %s STDOUT: %v", cid, err)
1718
				}
1719
				return err2
1720
			}
1721
		}
1722
		if err != nil {
1723
			if err == io.EOF {
1724
				return nil
1725
			}
1726

1727
			return err
1728
		}
1729
	}
1730
}
1731

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.