// podman
// 839 lines · 26.5 KB
1//go:build linux || freebsd
2// +build linux freebsd
3
4package chroot
5
6import (
7"bytes"
8"encoding/json"
9"fmt"
10"io"
11"os"
12"os/exec"
13"os/signal"
14"path/filepath"
15"runtime"
16"strconv"
17"strings"
18"sync"
19"syscall"
20
21"github.com/containers/buildah/bind"
22"github.com/containers/buildah/util"
23"github.com/containers/storage/pkg/ioutils"
24"github.com/containers/storage/pkg/reexec"
25"github.com/containers/storage/pkg/unshare"
26"github.com/opencontainers/runtime-spec/specs-go"
27"github.com/sirupsen/logrus"
28"golang.org/x/sys/unix"
29"golang.org/x/term"
30)
31
const (
	// runUsingChrootCommand is the reexec key under which runUsingChrootMain
	// (the grandparent subprocess entry point) is registered.
	runUsingChrootCommand = "buildah-chroot-runtime"
	// runUsingChrootExecCommand is the reexec key under which
	// runUsingChrootExecMain (the parent subprocess entry point) is
	// registered.
	runUsingChrootExecCommand = "buildah-chroot-exec"
	// containersConfEnv names the environment variable that is forwarded to
	// the re-executed helper processes when it is set; the command that is
	// finally run gets its environment from the runtime spec instead.
	containersConfEnv = "CONTAINERS_CONF"
)
40
41func init() {
42reexec.Register(runUsingChrootCommand, runUsingChrootMain)
43reexec.Register(runUsingChrootExecCommand, runUsingChrootExecMain)
44for limitName, limitNumber := range rlimitsMap {
45rlimitsReverseMap[limitNumber] = limitName
46}
47}
48
49type runUsingChrootExecSubprocOptions struct {
50Spec *specs.Spec
51BundlePath string
52}
53
54// RunUsingChroot runs a chrooted process, using some of the settings from the
55// passed-in spec, and using the specified bundlePath to hold temporary files,
56// directories, and mountpoints.
57func RunUsingChroot(spec *specs.Spec, bundlePath, homeDir string, stdin io.Reader, stdout, stderr io.Writer) (err error) {
58var confwg sync.WaitGroup
59var homeFound bool
60for _, env := range spec.Process.Env {
61if strings.HasPrefix(env, "HOME=") {
62homeFound = true
63break
64}
65}
66if !homeFound {
67spec.Process.Env = append(spec.Process.Env, fmt.Sprintf("HOME=%s", homeDir))
68}
69runtime.LockOSThread()
70defer runtime.UnlockOSThread()
71
72// Write the runtime configuration, mainly for debugging.
73specbytes, err := json.Marshal(spec)
74if err != nil {
75return err
76}
77if err = ioutils.AtomicWriteFile(filepath.Join(bundlePath, "config.json"), specbytes, 0600); err != nil {
78return fmt.Errorf("storing runtime configuration: %w", err)
79}
80logrus.Debugf("config = %v", string(specbytes))
81
82// Default to using stdin/stdout/stderr if we weren't passed objects to use.
83if stdin == nil {
84stdin = os.Stdin
85}
86if stdout == nil {
87stdout = os.Stdout
88}
89if stderr == nil {
90stderr = os.Stderr
91}
92
93// Create a pipe for passing configuration down to the next process.
94preader, pwriter, err := os.Pipe()
95if err != nil {
96return fmt.Errorf("creating configuration pipe: %w", err)
97}
98config, conferr := json.Marshal(runUsingChrootSubprocOptions{
99Spec: spec,
100BundlePath: bundlePath,
101})
102if conferr != nil {
103return fmt.Errorf("encoding configuration for %q: %w", runUsingChrootCommand, conferr)
104}
105
106// Set our terminal's mode to raw, to pass handling of special
107// terminal input to the terminal in the container.
108if spec.Process.Terminal && term.IsTerminal(unix.Stdin) {
109state, err := term.MakeRaw(unix.Stdin)
110if err != nil {
111logrus.Warnf("error setting terminal state: %v", err)
112} else {
113defer func() {
114if err = term.Restore(unix.Stdin, state); err != nil {
115logrus.Errorf("unable to restore terminal state: %v", err)
116}
117}()
118}
119}
120
121// Raise any resource limits that are higher than they are now, before
122// we drop any more privileges.
123if err = setRlimits(spec, false, true); err != nil {
124return err
125}
126
127// Start the grandparent subprocess.
128cmd := unshare.Command(runUsingChrootCommand)
129setPdeathsig(cmd.Cmd)
130cmd.Stdin, cmd.Stdout, cmd.Stderr = stdin, stdout, stderr
131cmd.Dir = "/"
132cmd.Env = []string{fmt.Sprintf("LOGLEVEL=%d", logrus.GetLevel())}
133if _, ok := os.LookupEnv(containersConfEnv); ok {
134cmd.Env = append(cmd.Env, containersConfEnv+"="+os.Getenv(containersConfEnv))
135}
136
137interrupted := make(chan os.Signal, 100)
138cmd.Hook = func(int) error {
139signal.Notify(interrupted, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
140go func() {
141for receivedSignal := range interrupted {
142if err := cmd.Process.Signal(receivedSignal); err != nil {
143logrus.Infof("%v while attempting to forward %v to child process", err, receivedSignal)
144}
145}
146}()
147return nil
148}
149
150logrus.Debugf("Running %#v in %#v", cmd.Cmd, cmd)
151confwg.Add(1)
152go func() {
153_, conferr = io.Copy(pwriter, bytes.NewReader(config))
154pwriter.Close()
155confwg.Done()
156}()
157cmd.ExtraFiles = append([]*os.File{preader}, cmd.ExtraFiles...)
158err = cmd.Run()
159confwg.Wait()
160signal.Stop(interrupted)
161close(interrupted)
162if err == nil {
163return conferr
164}
165return err
166}
167
168// main() for grandparent subprocess. Its main job is to shuttle stdio back
169// and forth, managing a pseudo-terminal if we want one, for our child, the
170// parent subprocess.
171func runUsingChrootMain() {
172var options runUsingChrootSubprocOptions
173
174runtime.LockOSThread()
175
176// Set logging.
177if level := os.Getenv("LOGLEVEL"); level != "" {
178if ll, err := strconv.Atoi(level); err == nil {
179logrus.SetLevel(logrus.Level(ll))
180}
181os.Unsetenv("LOGLEVEL")
182}
183
184// Unpack our configuration.
185confPipe := os.NewFile(3, "confpipe")
186if confPipe == nil {
187fmt.Fprintf(os.Stderr, "error reading options pipe\n")
188os.Exit(1)
189}
190defer confPipe.Close()
191if err := json.NewDecoder(confPipe).Decode(&options); err != nil {
192fmt.Fprintf(os.Stderr, "error decoding options: %v\n", err)
193os.Exit(1)
194}
195
196if options.Spec == nil || options.Spec.Process == nil {
197fmt.Fprintf(os.Stderr, "invalid options spec in runUsingChrootMain\n")
198os.Exit(1)
199}
200
201// Prepare to shuttle stdio back and forth.
202rootUID32, rootGID32, err := util.GetHostRootIDs(options.Spec)
203if err != nil {
204logrus.Errorf("error determining ownership for container stdio")
205os.Exit(1)
206}
207rootUID := int(rootUID32)
208rootGID := int(rootGID32)
209relays := make(map[int]int)
210closeOnceRunning := []*os.File{}
211var ctty *os.File
212var stdin io.Reader
213var stdinCopy io.WriteCloser
214var stdout io.Writer
215var stderr io.Writer
216fdDesc := make(map[int]string)
217if options.Spec.Process.Terminal {
218ptyMasterFd, ptyFd, err := getPtyDescriptors()
219if err != nil {
220logrus.Errorf("error opening PTY descriptors: %v", err)
221os.Exit(1)
222}
223// Make notes about what's going where.
224relays[ptyMasterFd] = unix.Stdout
225relays[unix.Stdin] = ptyMasterFd
226fdDesc[ptyMasterFd] = "container terminal"
227fdDesc[unix.Stdin] = "stdin"
228fdDesc[unix.Stdout] = "stdout"
229winsize := &unix.Winsize{}
230// Set the pseudoterminal's size to the configured size, or our own.
231if options.Spec.Process.ConsoleSize != nil {
232// Use configured sizes.
233winsize.Row = uint16(options.Spec.Process.ConsoleSize.Height)
234winsize.Col = uint16(options.Spec.Process.ConsoleSize.Width)
235} else {
236if term.IsTerminal(unix.Stdin) {
237// Use the size of our terminal.
238winsize, err = unix.IoctlGetWinsize(unix.Stdin, unix.TIOCGWINSZ)
239if err != nil {
240logrus.Debugf("error reading current terminal's size")
241winsize.Row = 0
242winsize.Col = 0
243}
244}
245}
246if winsize.Row != 0 && winsize.Col != 0 {
247if err = unix.IoctlSetWinsize(ptyFd, unix.TIOCSWINSZ, winsize); err != nil {
248logrus.Warnf("error setting terminal size for pty")
249}
250// FIXME - if we're connected to a terminal, we should
251// be passing the updated terminal size down when we
252// receive a SIGWINCH.
253}
254// Open an *os.File object that we can pass to our child.
255ctty = os.NewFile(uintptr(ptyFd), "/dev/tty")
256// Set ownership for the PTY.
257if err = ctty.Chown(rootUID, rootGID); err != nil {
258var cttyInfo unix.Stat_t
259err2 := unix.Fstat(ptyFd, &cttyInfo)
260from := ""
261op := "setting"
262if err2 == nil {
263op = "changing"
264from = fmt.Sprintf("from %d/%d ", cttyInfo.Uid, cttyInfo.Gid)
265}
266logrus.Warnf("error %s ownership of container PTY %sto %d/%d: %v", op, from, rootUID, rootGID, err)
267}
268// Set permissions on the PTY.
269if err = ctty.Chmod(0620); err != nil {
270logrus.Errorf("error setting permissions of container PTY: %v", err)
271os.Exit(1)
272}
273// Make a note that our child (the parent subprocess) should
274// have the PTY connected to its stdio, and that we should
275// close it once it's running.
276stdin = ctty
277stdout = ctty
278stderr = ctty
279closeOnceRunning = append(closeOnceRunning, ctty)
280} else {
281// Create pipes for stdio.
282stdinRead, stdinWrite, err := os.Pipe()
283if err != nil {
284logrus.Errorf("error opening pipe for stdin: %v", err)
285}
286stdoutRead, stdoutWrite, err := os.Pipe()
287if err != nil {
288logrus.Errorf("error opening pipe for stdout: %v", err)
289}
290stderrRead, stderrWrite, err := os.Pipe()
291if err != nil {
292logrus.Errorf("error opening pipe for stderr: %v", err)
293}
294// Make notes about what's going where.
295relays[unix.Stdin] = int(stdinWrite.Fd())
296relays[int(stdoutRead.Fd())] = unix.Stdout
297relays[int(stderrRead.Fd())] = unix.Stderr
298fdDesc[int(stdinWrite.Fd())] = "container stdin pipe"
299fdDesc[int(stdoutRead.Fd())] = "container stdout pipe"
300fdDesc[int(stderrRead.Fd())] = "container stderr pipe"
301fdDesc[unix.Stdin] = "stdin"
302fdDesc[unix.Stdout] = "stdout"
303fdDesc[unix.Stderr] = "stderr"
304// Set ownership for the pipes.
305if err = stdinRead.Chown(rootUID, rootGID); err != nil {
306logrus.Errorf("error setting ownership of container stdin pipe: %v", err)
307os.Exit(1)
308}
309if err = stdoutWrite.Chown(rootUID, rootGID); err != nil {
310logrus.Errorf("error setting ownership of container stdout pipe: %v", err)
311os.Exit(1)
312}
313if err = stderrWrite.Chown(rootUID, rootGID); err != nil {
314logrus.Errorf("error setting ownership of container stderr pipe: %v", err)
315os.Exit(1)
316}
317// Make a note that our child (the parent subprocess) should
318// have the pipes connected to its stdio, and that we should
319// close its ends of them once it's running.
320stdin = stdinRead
321stdout = stdoutWrite
322stderr = stderrWrite
323closeOnceRunning = append(closeOnceRunning, stdinRead, stdoutWrite, stderrWrite)
324stdinCopy = stdinWrite
325defer stdoutRead.Close()
326defer stderrRead.Close()
327}
328for readFd, writeFd := range relays {
329if err := unix.SetNonblock(readFd, true); err != nil {
330logrus.Errorf("error setting descriptor %d (%s) non-blocking: %v", readFd, fdDesc[readFd], err)
331return
332}
333if err := unix.SetNonblock(writeFd, false); err != nil {
334logrus.Errorf("error setting descriptor %d (%s) blocking: %v", relays[writeFd], fdDesc[writeFd], err)
335return
336}
337}
338if err := unix.SetNonblock(relays[unix.Stdin], true); err != nil {
339logrus.Errorf("error setting %d to nonblocking: %v", relays[unix.Stdin], err)
340}
341go func() {
342buffers := make(map[int]*bytes.Buffer)
343for _, writeFd := range relays {
344buffers[writeFd] = new(bytes.Buffer)
345}
346pollTimeout := -1
347stdinClose := false
348for len(relays) > 0 {
349fds := make([]unix.PollFd, 0, len(relays))
350for fd := range relays {
351fds = append(fds, unix.PollFd{Fd: int32(fd), Events: unix.POLLIN | unix.POLLHUP})
352}
353_, err := unix.Poll(fds, pollTimeout)
354if !util.LogIfNotRetryable(err, fmt.Sprintf("poll: %v", err)) {
355return
356}
357removeFds := make(map[int]struct{})
358for _, rfd := range fds {
359if rfd.Revents&unix.POLLHUP == unix.POLLHUP {
360removeFds[int(rfd.Fd)] = struct{}{}
361}
362if rfd.Revents&unix.POLLNVAL == unix.POLLNVAL {
363logrus.Debugf("error polling descriptor %s: closed?", fdDesc[int(rfd.Fd)])
364removeFds[int(rfd.Fd)] = struct{}{}
365}
366if rfd.Revents&unix.POLLIN == 0 {
367if stdinClose && stdinCopy == nil {
368continue
369}
370continue
371}
372b := make([]byte, 8192)
373nread, err := unix.Read(int(rfd.Fd), b)
374util.LogIfNotRetryable(err, fmt.Sprintf("read %s: %v", fdDesc[int(rfd.Fd)], err))
375if nread > 0 {
376if wfd, ok := relays[int(rfd.Fd)]; ok {
377nwritten, err := buffers[wfd].Write(b[:nread])
378if err != nil {
379logrus.Debugf("buffer: %v", err)
380continue
381}
382if nwritten != nread {
383logrus.Debugf("buffer: expected to buffer %d bytes, wrote %d", nread, nwritten)
384continue
385}
386}
387// If this is the last of the data we'll be able to read
388// from this descriptor, read as much as there is to read.
389for rfd.Revents&unix.POLLHUP == unix.POLLHUP {
390nr, err := unix.Read(int(rfd.Fd), b)
391util.LogIfUnexpectedWhileDraining(err, fmt.Sprintf("read %s: %v", fdDesc[int(rfd.Fd)], err))
392if nr <= 0 {
393break
394}
395if wfd, ok := relays[int(rfd.Fd)]; ok {
396nwritten, err := buffers[wfd].Write(b[:nr])
397if err != nil {
398logrus.Debugf("buffer: %v", err)
399break
400}
401if nwritten != nr {
402logrus.Debugf("buffer: expected to buffer %d bytes, wrote %d", nr, nwritten)
403break
404}
405}
406}
407}
408if nread == 0 {
409removeFds[int(rfd.Fd)] = struct{}{}
410}
411}
412pollTimeout = -1
413for wfd, buffer := range buffers {
414if buffer.Len() > 0 {
415nwritten, err := unix.Write(wfd, buffer.Bytes())
416util.LogIfNotRetryable(err, fmt.Sprintf("write %s: %v", fdDesc[wfd], err))
417if nwritten >= 0 {
418_ = buffer.Next(nwritten)
419}
420}
421if buffer.Len() > 0 {
422pollTimeout = 100
423}
424if wfd == relays[unix.Stdin] && stdinClose && buffer.Len() == 0 {
425stdinCopy.Close()
426delete(relays, unix.Stdin)
427}
428}
429for rfd := range removeFds {
430if rfd == unix.Stdin {
431buffer, found := buffers[relays[unix.Stdin]]
432if found && buffer.Len() > 0 {
433stdinClose = true
434continue
435}
436}
437if !options.Spec.Process.Terminal && rfd == unix.Stdin {
438stdinCopy.Close()
439}
440delete(relays, rfd)
441}
442}
443}()
444
445// Set up mounts and namespaces, and run the parent subprocess.
446status, err := runUsingChroot(options.Spec, options.BundlePath, ctty, stdin, stdout, stderr, closeOnceRunning)
447if err != nil {
448fmt.Fprintf(os.Stderr, "error running subprocess: %v\n", err)
449os.Exit(1)
450}
451
452// Pass the process's exit status back to the caller by exiting with the same status.
453if status.Exited() {
454if status.ExitStatus() != 0 {
455fmt.Fprintf(os.Stderr, "subprocess exited with status %d\n", status.ExitStatus())
456}
457os.Exit(status.ExitStatus())
458} else if status.Signaled() {
459fmt.Fprintf(os.Stderr, "subprocess exited on %s\n", status.Signal())
460os.Exit(1)
461}
462}
463
464// runUsingChroot, still in the grandparent process, sets up various bind
465// mounts and then runs the parent process in its own user namespace with the
466// necessary ID mappings.
467func runUsingChroot(spec *specs.Spec, bundlePath string, ctty *os.File, stdin io.Reader, stdout, stderr io.Writer, closeOnceRunning []*os.File) (wstatus unix.WaitStatus, err error) {
468var confwg sync.WaitGroup
469
470// Create a new mount namespace for ourselves and bind mount everything to a new location.
471undoIntermediates, err := bind.SetupIntermediateMountNamespace(spec, bundlePath)
472if err != nil {
473return 1, err
474}
475defer func() {
476if undoErr := undoIntermediates(); undoErr != nil {
477logrus.Debugf("error cleaning up intermediate mount NS: %v", err)
478}
479}()
480
481// Bind mount in our filesystems.
482undoChroots, err := setupChrootBindMounts(spec, bundlePath)
483if err != nil {
484return 1, err
485}
486defer func() {
487if undoErr := undoChroots(); undoErr != nil {
488logrus.Debugf("error cleaning up intermediate chroot bind mounts: %v", err)
489}
490}()
491
492// Create a pipe for passing configuration down to the next process.
493preader, pwriter, err := os.Pipe()
494if err != nil {
495return 1, fmt.Errorf("creating configuration pipe: %w", err)
496}
497config, conferr := json.Marshal(runUsingChrootExecSubprocOptions{
498Spec: spec,
499BundlePath: bundlePath,
500})
501if conferr != nil {
502fmt.Fprintf(os.Stderr, "error re-encoding configuration for %q", runUsingChrootExecCommand)
503os.Exit(1)
504}
505
506// Apologize for the namespace configuration that we're about to ignore.
507logNamespaceDiagnostics(spec)
508
509// We need to lock the thread so that PR_SET_PDEATHSIG won't trigger if the current thread exits.
510runtime.LockOSThread()
511defer runtime.UnlockOSThread()
512
513// Start the parent subprocess.
514cmd := unshare.Command(append([]string{runUsingChrootExecCommand}, spec.Process.Args...)...)
515setPdeathsig(cmd.Cmd)
516cmd.Stdin, cmd.Stdout, cmd.Stderr = stdin, stdout, stderr
517cmd.Dir = "/"
518cmd.Env = []string{fmt.Sprintf("LOGLEVEL=%d", logrus.GetLevel())}
519if _, ok := os.LookupEnv(containersConfEnv); ok {
520cmd.Env = append(cmd.Env, containersConfEnv+"="+os.Getenv(containersConfEnv))
521}
522if ctty != nil {
523cmd.Setsid = true
524cmd.Ctty = ctty
525}
526cmd.ExtraFiles = append([]*os.File{preader}, cmd.ExtraFiles...)
527if err := setPlatformUnshareOptions(spec, cmd); err != nil {
528return 1, fmt.Errorf("setting platform unshare options: %w", err)
529
530}
531interrupted := make(chan os.Signal, 100)
532cmd.Hook = func(int) error {
533for _, f := range closeOnceRunning {
534f.Close()
535}
536signal.Notify(interrupted, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
537go func() {
538for receivedSignal := range interrupted {
539if err := cmd.Process.Signal(receivedSignal); err != nil {
540logrus.Infof("%v while attempting to forward %v to child process", err, receivedSignal)
541}
542}
543}()
544return nil
545}
546
547logrus.Debugf("Running %#v in %#v", cmd.Cmd, cmd)
548confwg.Add(1)
549go func() {
550_, conferr = io.Copy(pwriter, bytes.NewReader(config))
551pwriter.Close()
552confwg.Done()
553}()
554err = cmd.Run()
555confwg.Wait()
556signal.Stop(interrupted)
557close(interrupted)
558if err != nil {
559if exitError, ok := err.(*exec.ExitError); ok {
560if waitStatus, ok := exitError.ProcessState.Sys().(syscall.WaitStatus); ok {
561if waitStatus.Exited() {
562if waitStatus.ExitStatus() != 0 {
563fmt.Fprintf(os.Stderr, "subprocess exited with status %d\n", waitStatus.ExitStatus())
564}
565os.Exit(waitStatus.ExitStatus())
566} else if waitStatus.Signaled() {
567fmt.Fprintf(os.Stderr, "subprocess exited on %s\n", waitStatus.Signal())
568os.Exit(1)
569}
570}
571}
572fmt.Fprintf(os.Stderr, "process exited with error: %v", err)
573os.Exit(1)
574}
575
576return 0, nil
577}
578
579// main() for parent subprocess. Its main job is to try to make our
580// environment look like the one described by the runtime configuration blob,
581// and then launch the intended command as a child.
582func runUsingChrootExecMain() {
583args := os.Args[1:]
584var options runUsingChrootExecSubprocOptions
585var err error
586
587runtime.LockOSThread()
588
589// Set logging.
590if level := os.Getenv("LOGLEVEL"); level != "" {
591if ll, err := strconv.Atoi(level); err == nil {
592logrus.SetLevel(logrus.Level(ll))
593}
594os.Unsetenv("LOGLEVEL")
595}
596
597// Unpack our configuration.
598confPipe := os.NewFile(3, "confpipe")
599if confPipe == nil {
600fmt.Fprintf(os.Stderr, "error reading options pipe\n")
601os.Exit(1)
602}
603defer confPipe.Close()
604if err := json.NewDecoder(confPipe).Decode(&options); err != nil {
605fmt.Fprintf(os.Stderr, "error decoding options: %v\n", err)
606os.Exit(1)
607}
608
609// Set the hostname. We're already in a distinct UTS namespace and are admins in the user
610// namespace which created it, so we shouldn't get a permissions error, but seccomp policy
611// might deny our attempt to call sethostname() anyway, so log a debug message for that.
612if options.Spec == nil || options.Spec.Process == nil {
613fmt.Fprintf(os.Stderr, "invalid options spec passed in\n")
614os.Exit(1)
615}
616
617if options.Spec.Hostname != "" {
618setContainerHostname(options.Spec.Hostname)
619}
620
621// Try to chroot into the root. Do this before we potentially
622// block the syscall via the seccomp profile. Allow the
623// platform to override this - on FreeBSD, we use a simple
624// jail to set the hostname in the container
625if err := createPlatformContainer(options); err != nil {
626var oldst, newst unix.Stat_t
627if err := unix.Stat(options.Spec.Root.Path, &oldst); err != nil {
628fmt.Fprintf(os.Stderr, "error stat()ing intended root directory %q: %v\n", options.Spec.Root.Path, err)
629os.Exit(1)
630}
631if err := unix.Chdir(options.Spec.Root.Path); err != nil {
632fmt.Fprintf(os.Stderr, "error chdir()ing to intended root directory %q: %v\n", options.Spec.Root.Path, err)
633os.Exit(1)
634}
635if err := unix.Chroot(options.Spec.Root.Path); err != nil {
636fmt.Fprintf(os.Stderr, "error chroot()ing into directory %q: %v\n", options.Spec.Root.Path, err)
637os.Exit(1)
638}
639if err := unix.Stat("/", &newst); err != nil {
640fmt.Fprintf(os.Stderr, "error stat()ing current root directory: %v\n", err)
641os.Exit(1)
642}
643if oldst.Dev != newst.Dev || oldst.Ino != newst.Ino {
644fmt.Fprintf(os.Stderr, "unknown error chroot()ing into directory %q: %v\n", options.Spec.Root.Path, err)
645os.Exit(1)
646}
647logrus.Debugf("chrooted into %q", options.Spec.Root.Path)
648}
649
650// not doing because it's still shared: creating devices
651// not doing because it's not applicable: setting annotations
652// not doing because it's still shared: setting sysctl settings
653// not doing because cgroupfs is read only: configuring control groups
654// -> this means we can use the freezer to make sure there aren't any lingering processes
655// -> this means we ignore cgroups-based controls
656// not doing because we don't set any in the config: running hooks
657// not doing because we don't set it in the config: setting rootfs read-only
658// not doing because we don't set it in the config: setting rootfs propagation
659logrus.Debugf("setting apparmor profile")
660if err = setApparmorProfile(options.Spec); err != nil {
661fmt.Fprintf(os.Stderr, "error setting apparmor profile for process: %v\n", err)
662os.Exit(1)
663}
664if err = setSelinuxLabel(options.Spec); err != nil {
665fmt.Fprintf(os.Stderr, "error setting SELinux label for process: %v\n", err)
666os.Exit(1)
667}
668
669logrus.Debugf("setting resource limits")
670if err = setRlimits(options.Spec, false, false); err != nil {
671fmt.Fprintf(os.Stderr, "error setting process resource limits for process: %v\n", err)
672os.Exit(1)
673}
674
675// Try to change to the directory.
676cwd := options.Spec.Process.Cwd
677if !filepath.IsAbs(cwd) {
678cwd = "/" + cwd
679}
680cwd = filepath.Clean(cwd)
681if err := unix.Chdir("/"); err != nil {
682fmt.Fprintf(os.Stderr, "error chdir()ing into new root directory %q: %v\n", options.Spec.Root.Path, err)
683os.Exit(1)
684}
685if err := unix.Chdir(cwd); err != nil {
686fmt.Fprintf(os.Stderr, "error chdir()ing into directory %q under root %q: %v\n", cwd, options.Spec.Root.Path, err)
687os.Exit(1)
688}
689logrus.Debugf("changed working directory to %q", cwd)
690
691// Drop privileges.
692user := options.Spec.Process.User
693if len(user.AdditionalGids) > 0 {
694gids := make([]int, len(user.AdditionalGids))
695for i := range user.AdditionalGids {
696gids[i] = int(user.AdditionalGids[i])
697}
698logrus.Debugf("setting supplemental groups")
699if err = syscall.Setgroups(gids); err != nil {
700fmt.Fprintf(os.Stderr, "error setting supplemental groups list: %v", err)
701os.Exit(1)
702}
703} else {
704setgroups, _ := os.ReadFile("/proc/self/setgroups")
705if strings.Trim(string(setgroups), "\n") != "deny" {
706logrus.Debugf("clearing supplemental groups")
707if err = syscall.Setgroups([]int{}); err != nil {
708fmt.Fprintf(os.Stderr, "error clearing supplemental groups list: %v", err)
709os.Exit(1)
710}
711}
712}
713
714logrus.Debugf("setting gid")
715if err = unix.Setresgid(int(user.GID), int(user.GID), int(user.GID)); err != nil {
716fmt.Fprintf(os.Stderr, "error setting GID: %v", err)
717os.Exit(1)
718}
719
720if err = setSeccomp(options.Spec); err != nil {
721fmt.Fprintf(os.Stderr, "error setting seccomp filter for process: %v\n", err)
722os.Exit(1)
723}
724
725logrus.Debugf("setting capabilities")
726var keepCaps []string
727if user.UID != 0 {
728keepCaps = []string{"CAP_SETUID"}
729}
730if err := setCapabilities(options.Spec, keepCaps...); err != nil {
731fmt.Fprintf(os.Stderr, "error setting capabilities for process: %v\n", err)
732os.Exit(1)
733}
734
735logrus.Debugf("setting uid")
736if err = unix.Setresuid(int(user.UID), int(user.UID), int(user.UID)); err != nil {
737fmt.Fprintf(os.Stderr, "error setting UID: %v", err)
738os.Exit(1)
739}
740
741// Actually run the specified command.
742cmd := exec.Command(args[0], args[1:]...)
743setPdeathsig(cmd)
744cmd.Env = options.Spec.Process.Env
745cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr
746cmd.Dir = cwd
747logrus.Debugf("Running %#v (PATH = %q)", cmd, os.Getenv("PATH"))
748interrupted := make(chan os.Signal, 100)
749if err = cmd.Start(); err != nil {
750fmt.Fprintf(os.Stderr, "process failed to start with error: %v", err)
751}
752go func() {
753for range interrupted {
754if err := cmd.Process.Signal(syscall.SIGKILL); err != nil {
755logrus.Infof("%v while attempting to send SIGKILL to child process", err)
756}
757}
758}()
759signal.Notify(interrupted, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
760err = cmd.Wait()
761signal.Stop(interrupted)
762close(interrupted)
763if err != nil {
764if exitError, ok := err.(*exec.ExitError); ok {
765if waitStatus, ok := exitError.ProcessState.Sys().(syscall.WaitStatus); ok {
766if waitStatus.Exited() {
767if waitStatus.ExitStatus() != 0 {
768fmt.Fprintf(os.Stderr, "subprocess exited with status %d\n", waitStatus.ExitStatus())
769}
770os.Exit(waitStatus.ExitStatus())
771} else if waitStatus.Signaled() {
772fmt.Fprintf(os.Stderr, "subprocess exited on %s\n", waitStatus.Signal())
773os.Exit(1)
774}
775}
776}
777fmt.Fprintf(os.Stderr, "process exited with error: %v", err)
778os.Exit(1)
779}
780}
781
782// parses the resource limits for ourselves and any processes that
783// we'll start into a format that's more in line with the kernel APIs
784func parseRlimits(spec *specs.Spec) (map[int]unix.Rlimit, error) {
785if spec.Process == nil {
786return nil, nil
787}
788parsed := make(map[int]unix.Rlimit)
789for _, limit := range spec.Process.Rlimits {
790resource, recognized := rlimitsMap[strings.ToUpper(limit.Type)]
791if !recognized {
792return nil, fmt.Errorf("parsing limit type %q", limit.Type)
793}
794parsed[resource] = makeRlimit(limit)
795}
796return parsed, nil
797}
798
799// setRlimits sets any resource limits that we want to apply to processes that
800// we'll start.
801func setRlimits(spec *specs.Spec, onlyLower, onlyRaise bool) error {
802limits, err := parseRlimits(spec)
803if err != nil {
804return err
805}
806for resource, desired := range limits {
807var current unix.Rlimit
808if err := unix.Getrlimit(resource, ¤t); err != nil {
809return fmt.Errorf("reading %q limit: %w", rlimitsReverseMap[resource], err)
810}
811if desired.Max > current.Max && onlyLower {
812// this would raise a hard limit, and we're only here to lower them
813continue
814}
815if desired.Max < current.Max && onlyRaise {
816// this would lower a hard limit, and we're only here to raise them
817continue
818}
819if err := unix.Setrlimit(resource, &desired); err != nil {
820return fmt.Errorf("setting %q limit to soft=%d,hard=%d (was soft=%d,hard=%d): %w", rlimitsReverseMap[resource], desired.Cur, desired.Max, current.Cur, current.Max, err)
821}
822}
823return nil
824}
825
826func isDevNull(dev os.FileInfo) bool {
827if dev.Mode()&os.ModeCharDevice != 0 {
828stat, _ := dev.Sys().(*syscall.Stat_t)
829nullStat := syscall.Stat_t{}
830if err := syscall.Stat(os.DevNull, &nullStat); err != nil {
831logrus.Warnf("unable to stat /dev/null: %v", err)
832return false
833}
834if stat.Rdev == nullStat.Rdev {
835return true
836}
837}
838return false
839}
840