// Source: podman (711 lines, 25.4 KB)
1//go:build linux
2// +build linux
3
4package chroot
5
6import (
7"errors"
8"fmt"
9"os"
10"os/exec"
11"path/filepath"
12"strings"
13"syscall"
14"time"
15
16"github.com/containers/buildah/copier"
17"github.com/containers/storage/pkg/mount"
18"github.com/containers/storage/pkg/unshare"
19"github.com/opencontainers/runc/libcontainer/apparmor"
20"github.com/opencontainers/runtime-spec/specs-go"
21"github.com/sirupsen/logrus"
22"github.com/syndtr/gocapability/capability"
23"golang.org/x/sys/unix"
24)
25
26var (
27rlimitsMap = map[string]int{
28"RLIMIT_AS": unix.RLIMIT_AS,
29"RLIMIT_CORE": unix.RLIMIT_CORE,
30"RLIMIT_CPU": unix.RLIMIT_CPU,
31"RLIMIT_DATA": unix.RLIMIT_DATA,
32"RLIMIT_FSIZE": unix.RLIMIT_FSIZE,
33"RLIMIT_LOCKS": unix.RLIMIT_LOCKS,
34"RLIMIT_MEMLOCK": unix.RLIMIT_MEMLOCK,
35"RLIMIT_MSGQUEUE": unix.RLIMIT_MSGQUEUE,
36"RLIMIT_NICE": unix.RLIMIT_NICE,
37"RLIMIT_NOFILE": unix.RLIMIT_NOFILE,
38"RLIMIT_NPROC": unix.RLIMIT_NPROC,
39"RLIMIT_RSS": unix.RLIMIT_RSS,
40"RLIMIT_RTPRIO": unix.RLIMIT_RTPRIO,
41"RLIMIT_RTTIME": unix.RLIMIT_RTTIME,
42"RLIMIT_SIGPENDING": unix.RLIMIT_SIGPENDING,
43"RLIMIT_STACK": unix.RLIMIT_STACK,
44}
45rlimitsReverseMap = map[int]string{}
46)
47
48type runUsingChrootSubprocOptions struct {
49Spec *specs.Spec
50BundlePath string
51UIDMappings []syscall.SysProcIDMap
52GIDMappings []syscall.SysProcIDMap
53}
54
55func setPlatformUnshareOptions(spec *specs.Spec, cmd *unshare.Cmd) error {
56// If we have configured ID mappings, set them here so that they can apply to the child.
57hostUidmap, hostGidmap, err := unshare.GetHostIDMappings("")
58if err != nil {
59return err
60}
61uidmap, gidmap := spec.Linux.UIDMappings, spec.Linux.GIDMappings
62if len(uidmap) == 0 {
63// No UID mappings are configured for the container. Borrow our parent's mappings.
64uidmap = append([]specs.LinuxIDMapping{}, hostUidmap...)
65for i := range uidmap {
66uidmap[i].HostID = uidmap[i].ContainerID
67}
68}
69if len(gidmap) == 0 {
70// No GID mappings are configured for the container. Borrow our parent's mappings.
71gidmap = append([]specs.LinuxIDMapping{}, hostGidmap...)
72for i := range gidmap {
73gidmap[i].HostID = gidmap[i].ContainerID
74}
75}
76
77cmd.UnshareFlags = syscall.CLONE_NEWUTS | syscall.CLONE_NEWNS
78requestedUserNS := false
79for _, ns := range spec.Linux.Namespaces {
80if ns.Type == specs.UserNamespace {
81requestedUserNS = true
82}
83}
84if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 || requestedUserNS {
85cmd.UnshareFlags = cmd.UnshareFlags | syscall.CLONE_NEWUSER
86cmd.UidMappings = uidmap
87cmd.GidMappings = gidmap
88cmd.GidMappingsEnableSetgroups = true
89}
90cmd.OOMScoreAdj = spec.Process.OOMScoreAdj
91return nil
92}
93
94func setContainerHostname(name string) {
95if err := unix.Sethostname([]byte(name)); err != nil {
96logrus.Debugf("failed to set hostname %q for process: %v", name, err)
97}
98}
99
100// logNamespaceDiagnostics knows which namespaces we want to create.
101// Output debug messages when that differs from what we're being asked to do.
102func logNamespaceDiagnostics(spec *specs.Spec) {
103sawMountNS := false
104sawUTSNS := false
105for _, ns := range spec.Linux.Namespaces {
106switch ns.Type {
107case specs.CgroupNamespace:
108if ns.Path != "" {
109logrus.Debugf("unable to join cgroup namespace, sorry about that")
110} else {
111logrus.Debugf("unable to create cgroup namespace, sorry about that")
112}
113case specs.IPCNamespace:
114if ns.Path != "" {
115logrus.Debugf("unable to join IPC namespace, sorry about that")
116} else {
117logrus.Debugf("unable to create IPC namespace, sorry about that")
118}
119case specs.MountNamespace:
120if ns.Path != "" {
121logrus.Debugf("unable to join mount namespace %q, creating a new one", ns.Path)
122}
123sawMountNS = true
124case specs.NetworkNamespace:
125if ns.Path != "" {
126logrus.Debugf("unable to join network namespace, sorry about that")
127} else {
128logrus.Debugf("unable to create network namespace, sorry about that")
129}
130case specs.PIDNamespace:
131if ns.Path != "" {
132logrus.Debugf("unable to join PID namespace, sorry about that")
133} else {
134logrus.Debugf("unable to create PID namespace, sorry about that")
135}
136case specs.UserNamespace:
137if ns.Path != "" {
138logrus.Debugf("unable to join user namespace, sorry about that")
139}
140case specs.UTSNamespace:
141if ns.Path != "" {
142logrus.Debugf("unable to join UTS namespace %q, creating a new one", ns.Path)
143}
144sawUTSNS = true
145}
146}
147if !sawMountNS {
148logrus.Debugf("mount namespace not requested, but creating a new one anyway")
149}
150if !sawUTSNS {
151logrus.Debugf("UTS namespace not requested, but creating a new one anyway")
152}
153}
154
155// setApparmorProfile sets the apparmor profile for ourselves, and hopefully any child processes that we'll start.
156func setApparmorProfile(spec *specs.Spec) error {
157if !apparmor.IsEnabled() || spec.Process.ApparmorProfile == "" {
158return nil
159}
160if err := apparmor.ApplyProfile(spec.Process.ApparmorProfile); err != nil {
161return fmt.Errorf("setting apparmor profile to %q: %w", spec.Process.ApparmorProfile, err)
162}
163return nil
164}
165
166// setCapabilities sets capabilities for ourselves, to be more or less inherited by any processes that we'll start.
167func setCapabilities(spec *specs.Spec, keepCaps ...string) error {
168currentCaps, err := capability.NewPid2(0)
169if err != nil {
170return fmt.Errorf("reading capabilities of current process: %w", err)
171}
172if err := currentCaps.Load(); err != nil {
173return fmt.Errorf("loading capabilities: %w", err)
174}
175caps, err := capability.NewPid2(0)
176if err != nil {
177return fmt.Errorf("reading capabilities of current process: %w", err)
178}
179capMap := map[capability.CapType][]string{
180capability.BOUNDING: spec.Process.Capabilities.Bounding,
181capability.EFFECTIVE: spec.Process.Capabilities.Effective,
182capability.INHERITABLE: []string{},
183capability.PERMITTED: spec.Process.Capabilities.Permitted,
184capability.AMBIENT: spec.Process.Capabilities.Ambient,
185}
186knownCaps := capability.List()
187noCap := capability.Cap(-1)
188for capType, capList := range capMap {
189for _, capToSet := range capList {
190cap := noCap
191for _, c := range knownCaps {
192if strings.EqualFold("CAP_"+c.String(), capToSet) {
193cap = c
194break
195}
196}
197if cap == noCap {
198return fmt.Errorf("mapping capability %q to a number", capToSet)
199}
200caps.Set(capType, cap)
201}
202for _, capToSet := range keepCaps {
203cap := noCap
204for _, c := range knownCaps {
205if strings.EqualFold("CAP_"+c.String(), capToSet) {
206cap = c
207break
208}
209}
210if cap == noCap {
211return fmt.Errorf("mapping capability %q to a number", capToSet)
212}
213if currentCaps.Get(capType, cap) {
214caps.Set(capType, cap)
215}
216}
217}
218if err = caps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS); err != nil {
219return fmt.Errorf("setting capabilities: %w", err)
220}
221return nil
222}
223
224func makeRlimit(limit specs.POSIXRlimit) unix.Rlimit {
225return unix.Rlimit{Cur: limit.Soft, Max: limit.Hard}
226}
227
228func createPlatformContainer(options runUsingChrootExecSubprocOptions) error {
229return errors.New("unsupported createPlatformContainer")
230}
231
232func mountFlagsForFSFlags(fsFlags uintptr) uintptr {
233var mountFlags uintptr
234for _, mapping := range []struct {
235fsFlag uintptr
236mountFlag uintptr
237}{
238{unix.ST_MANDLOCK, unix.MS_MANDLOCK},
239{unix.ST_NOATIME, unix.MS_NOATIME},
240{unix.ST_NODEV, unix.MS_NODEV},
241{unix.ST_NODIRATIME, unix.MS_NODIRATIME},
242{unix.ST_NOEXEC, unix.MS_NOEXEC},
243{unix.ST_NOSUID, unix.MS_NOSUID},
244{unix.ST_RDONLY, unix.MS_RDONLY},
245{unix.ST_RELATIME, unix.MS_RELATIME},
246{unix.ST_SYNCHRONOUS, unix.MS_SYNCHRONOUS},
247} {
248if fsFlags&mapping.fsFlag == mapping.fsFlag {
249mountFlags |= mapping.mountFlag
250}
251}
252return mountFlags
253}
254
255func makeReadOnly(mntpoint string, flags uintptr) error {
256var fs unix.Statfs_t
257// Make sure it's read-only.
258if err := unix.Statfs(mntpoint, &fs); err != nil {
259return fmt.Errorf("checking if directory %q was bound read-only: %w", mntpoint, err)
260}
261if fs.Flags&unix.ST_RDONLY == 0 {
262// All callers currently pass MS_RDONLY in "flags", but in case they stop doing
263// that at some point in the future...
264if err := unix.Mount(mntpoint, mntpoint, "bind", flags|unix.MS_RDONLY|unix.MS_REMOUNT|unix.MS_BIND, ""); err != nil {
265return fmt.Errorf("remounting %s in mount namespace read-only: %w", mntpoint, err)
266}
267}
268return nil
269}
270
271// setupChrootBindMounts actually bind mounts things under the rootfs, and returns a
272// callback that will clean up its work.
273func setupChrootBindMounts(spec *specs.Spec, bundlePath string) (undoBinds func() error, err error) {
274var fs unix.Statfs_t
275undoBinds = func() error {
276if err2 := unix.Unmount(spec.Root.Path, unix.MNT_DETACH); err2 != nil {
277retries := 0
278for (err2 == unix.EBUSY || err2 == unix.EAGAIN) && retries < 50 {
279time.Sleep(50 * time.Millisecond)
280err2 = unix.Unmount(spec.Root.Path, unix.MNT_DETACH)
281retries++
282}
283if err2 != nil {
284logrus.Warnf("pkg/chroot: error unmounting %q (retried %d times): %v", spec.Root.Path, retries, err2)
285if err == nil {
286err = err2
287}
288}
289}
290return err
291}
292
293// Now bind mount all of those things to be under the rootfs's location in this
294// mount namespace.
295commonFlags := uintptr(unix.MS_BIND | unix.MS_REC | unix.MS_PRIVATE)
296bindFlags := commonFlags
297devFlags := commonFlags | unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_RDONLY
298procFlags := devFlags | unix.MS_NODEV
299sysFlags := devFlags | unix.MS_NODEV
300
301// Bind /dev read-only.
302subDev := filepath.Join(spec.Root.Path, "/dev")
303if err := unix.Mount("/dev", subDev, "bind", devFlags, ""); err != nil {
304if errors.Is(err, os.ErrNotExist) {
305err = os.Mkdir(subDev, 0755)
306if err == nil {
307err = unix.Mount("/dev", subDev, "bind", devFlags, "")
308}
309}
310if err != nil {
311return undoBinds, fmt.Errorf("bind mounting /dev from host into mount namespace: %w", err)
312}
313}
314// Make sure it's read-only.
315if err = unix.Statfs(subDev, &fs); err != nil {
316return undoBinds, fmt.Errorf("checking if directory %q was bound read-only: %w", subDev, err)
317}
318if fs.Flags&unix.ST_RDONLY == 0 {
319if err := unix.Mount(subDev, subDev, "bind", devFlags|unix.MS_REMOUNT|unix.MS_BIND, ""); err != nil {
320return undoBinds, fmt.Errorf("remounting /dev in mount namespace read-only: %w", err)
321}
322}
323logrus.Debugf("bind mounted %q to %q", "/dev", filepath.Join(spec.Root.Path, "/dev"))
324
325// Bind /proc read-only.
326subProc := filepath.Join(spec.Root.Path, "/proc")
327if err := unix.Mount("/proc", subProc, "bind", procFlags, ""); err != nil {
328if errors.Is(err, os.ErrNotExist) {
329err = os.Mkdir(subProc, 0755)
330if err == nil {
331err = unix.Mount("/proc", subProc, "bind", procFlags, "")
332}
333}
334if err != nil {
335return undoBinds, fmt.Errorf("bind mounting /proc from host into mount namespace: %w", err)
336}
337}
338logrus.Debugf("bind mounted %q to %q", "/proc", filepath.Join(spec.Root.Path, "/proc"))
339
340// Bind /sys read-only.
341subSys := filepath.Join(spec.Root.Path, "/sys")
342if err := unix.Mount("/sys", subSys, "bind", sysFlags, ""); err != nil {
343if errors.Is(err, os.ErrNotExist) {
344err = os.Mkdir(subSys, 0755)
345if err == nil {
346err = unix.Mount("/sys", subSys, "bind", sysFlags, "")
347}
348}
349if err != nil {
350return undoBinds, fmt.Errorf("bind mounting /sys from host into mount namespace: %w", err)
351}
352}
353if err := makeReadOnly(subSys, sysFlags); err != nil {
354return undoBinds, err
355}
356
357mnts, _ := mount.GetMounts()
358for _, m := range mnts {
359if !strings.HasPrefix(m.Mountpoint, "/sys/") &&
360m.Mountpoint != "/sys" {
361continue
362}
363subSys := filepath.Join(spec.Root.Path, m.Mountpoint)
364if err := unix.Mount(m.Mountpoint, subSys, "bind", sysFlags, ""); err != nil {
365msg := fmt.Sprintf("could not bind mount %q, skipping: %v", m.Mountpoint, err)
366if strings.HasPrefix(m.Mountpoint, "/sys") {
367logrus.Infof(msg)
368} else {
369logrus.Warningf(msg)
370}
371continue
372}
373if err := makeReadOnly(subSys, sysFlags); err != nil {
374return undoBinds, err
375}
376}
377logrus.Debugf("bind mounted %q to %q", "/sys", filepath.Join(spec.Root.Path, "/sys"))
378
379// Bind, overlay, or tmpfs mount everything we've been asked to mount.
380for _, m := range spec.Mounts {
381// Skip anything that we just mounted.
382switch m.Destination {
383case "/dev", "/proc", "/sys":
384logrus.Debugf("already bind mounted %q on %q", m.Destination, filepath.Join(spec.Root.Path, m.Destination))
385continue
386default:
387if strings.HasPrefix(m.Destination, "/dev/") {
388continue
389}
390if strings.HasPrefix(m.Destination, "/proc/") {
391continue
392}
393if strings.HasPrefix(m.Destination, "/sys/") {
394continue
395}
396}
397// Skip anything that isn't a bind or overlay or tmpfs mount.
398if m.Type != "bind" && m.Type != "tmpfs" && m.Type != "overlay" {
399logrus.Debugf("skipping mount of type %q on %q", m.Type, m.Destination)
400continue
401}
402// If the target is already there, we can just mount over it.
403var srcinfo os.FileInfo
404switch m.Type {
405case "bind":
406srcinfo, err = os.Stat(m.Source)
407if err != nil {
408return undoBinds, fmt.Errorf("examining %q for mounting in mount namespace: %w", m.Source, err)
409}
410case "overlay", "tmpfs":
411srcinfo, err = os.Stat("/")
412if err != nil {
413return undoBinds, fmt.Errorf("examining / to use as a template for a %s mount: %w", m.Type, err)
414}
415}
416target := filepath.Join(spec.Root.Path, m.Destination)
417// Check if target is a symlink.
418stat, err := os.Lstat(target)
419// If target is a symlink, follow the link and ensure the destination exists.
420if err == nil && stat != nil && (stat.Mode()&os.ModeSymlink != 0) {
421target, err = copier.Eval(spec.Root.Path, m.Destination, copier.EvalOptions{})
422if err != nil {
423return nil, fmt.Errorf("evaluating symlink %q: %w", target, err)
424}
425// Stat the destination of the evaluated symlink.
426_, err = os.Stat(target)
427}
428if err != nil {
429// If the target can't be stat()ted, check the error.
430if !errors.Is(err, os.ErrNotExist) {
431return undoBinds, fmt.Errorf("examining %q for mounting in mount namespace: %w", target, err)
432}
433// The target isn't there yet, so create it. If the source is a directory,
434// we need a directory, otherwise we need a non-directory (i.e., a file).
435if srcinfo.IsDir() {
436if err = os.MkdirAll(target, 0755); err != nil {
437return undoBinds, fmt.Errorf("creating mountpoint %q in mount namespace: %w", target, err)
438}
439} else {
440if err = os.MkdirAll(filepath.Dir(target), 0755); err != nil {
441return undoBinds, fmt.Errorf("ensuring parent of mountpoint %q (%q) is present in mount namespace: %w", target, filepath.Dir(target), err)
442}
443var file *os.File
444if file, err = os.OpenFile(target, os.O_WRONLY|os.O_CREATE, 0755); err != nil {
445return undoBinds, fmt.Errorf("creating mountpoint %q in mount namespace: %w", target, err)
446}
447file.Close()
448}
449}
450// Sort out which flags we're asking for, and what statfs() should be telling us
451// if we successfully mounted with them.
452requestFlags := uintptr(0)
453expectedImportantFlags := uintptr(0)
454importantFlags := uintptr(0)
455possibleImportantFlags := uintptr(unix.ST_NODEV | unix.ST_NOEXEC | unix.ST_NOSUID | unix.ST_RDONLY)
456for _, option := range m.Options {
457switch option {
458case "nodev":
459requestFlags |= unix.MS_NODEV
460importantFlags |= unix.ST_NODEV
461expectedImportantFlags |= unix.ST_NODEV
462case "dev":
463requestFlags &= ^uintptr(unix.MS_NODEV)
464importantFlags |= unix.ST_NODEV
465expectedImportantFlags &= ^uintptr(unix.ST_NODEV)
466case "noexec":
467requestFlags |= unix.MS_NOEXEC
468importantFlags |= unix.ST_NOEXEC
469expectedImportantFlags |= unix.ST_NOEXEC
470case "exec":
471requestFlags &= ^uintptr(unix.MS_NOEXEC)
472importantFlags |= unix.ST_NOEXEC
473expectedImportantFlags &= ^uintptr(unix.ST_NOEXEC)
474case "nosuid":
475requestFlags |= unix.MS_NOSUID
476importantFlags |= unix.ST_NOSUID
477expectedImportantFlags |= unix.ST_NOSUID
478case "suid":
479requestFlags &= ^uintptr(unix.MS_NOSUID)
480importantFlags |= unix.ST_NOSUID
481expectedImportantFlags &= ^uintptr(unix.ST_NOSUID)
482case "ro":
483requestFlags |= unix.MS_RDONLY
484importantFlags |= unix.ST_RDONLY
485expectedImportantFlags |= unix.ST_RDONLY
486case "rw":
487requestFlags &= ^uintptr(unix.MS_RDONLY)
488importantFlags |= unix.ST_RDONLY
489expectedImportantFlags &= ^uintptr(unix.ST_RDONLY)
490}
491}
492switch m.Type {
493case "bind":
494// Do the initial bind mount. We'll worry about the flags in a bit.
495logrus.Debugf("bind mounting %q on %q %v", m.Destination, filepath.Join(spec.Root.Path, m.Destination), m.Options)
496if err = unix.Mount(m.Source, target, "", bindFlags|requestFlags, ""); err != nil {
497return undoBinds, fmt.Errorf("bind mounting %q from host to %q in mount namespace (%q): %w", m.Source, m.Destination, target, err)
498}
499logrus.Debugf("bind mounted %q to %q", m.Source, target)
500case "tmpfs":
501// Mount a tmpfs. We'll worry about the flags in a bit.
502if err = mount.Mount(m.Source, target, m.Type, strings.Join(append(m.Options, "private"), ",")); err != nil {
503return undoBinds, fmt.Errorf("mounting tmpfs to %q in mount namespace (%q, %q): %w", m.Destination, target, strings.Join(append(m.Options, "private"), ","), err)
504}
505logrus.Debugf("mounted a tmpfs to %q", target)
506case "overlay":
507// Mount an overlay. We'll worry about the flags in a bit.
508if err = mount.Mount(m.Source, target, m.Type, strings.Join(append(m.Options, "private"), ",")); err != nil {
509return undoBinds, fmt.Errorf("mounting overlay to %q in mount namespace (%q, %q): %w", m.Destination, target, strings.Join(append(m.Options, "private"), ","), err)
510}
511logrus.Debugf("mounted a overlay to %q", target)
512}
513// Time to worry about the flags.
514if err = unix.Statfs(target, &fs); err != nil {
515return undoBinds, fmt.Errorf("checking if volume %q was mounted with requested flags: %w", target, err)
516}
517effectiveImportantFlags := uintptr(fs.Flags) & importantFlags
518if effectiveImportantFlags != expectedImportantFlags {
519// Do a remount to try to get the desired flags to stick.
520effectiveUnimportantFlags := uintptr(fs.Flags) & ^possibleImportantFlags
521if err = unix.Mount(target, target, m.Type, unix.MS_REMOUNT|bindFlags|requestFlags|mountFlagsForFSFlags(effectiveUnimportantFlags), ""); err != nil {
522return undoBinds, fmt.Errorf("remounting %q in mount namespace with flags %#x instead of %#x: %w", target, requestFlags, effectiveImportantFlags, err)
523}
524// Check if the desired flags stuck.
525if err = unix.Statfs(target, &fs); err != nil {
526return undoBinds, fmt.Errorf("checking if directory %q was remounted with requested flags %#x instead of %#x: %w", target, requestFlags, effectiveImportantFlags, err)
527}
528newEffectiveImportantFlags := uintptr(fs.Flags) & importantFlags
529if newEffectiveImportantFlags != expectedImportantFlags {
530return undoBinds, fmt.Errorf("unable to remount %q with requested flags %#x instead of %#x, just got %#x back", target, requestFlags, effectiveImportantFlags, newEffectiveImportantFlags)
531}
532}
533}
534
535// Set up any read-only paths that we need to. If we're running inside
536// of a container, some of these locations will already be read-only, in
537// which case can declare victory and move on.
538for _, roPath := range spec.Linux.ReadonlyPaths {
539r := filepath.Join(spec.Root.Path, roPath)
540target, err := filepath.EvalSymlinks(r)
541if err != nil {
542if errors.Is(err, os.ErrNotExist) {
543// No target, no problem.
544continue
545}
546return undoBinds, fmt.Errorf("checking %q for symlinks before marking it read-only: %w", r, err)
547}
548// Check if the location is already read-only.
549var fs unix.Statfs_t
550if err = unix.Statfs(target, &fs); err != nil {
551if errors.Is(err, os.ErrNotExist) {
552// No target, no problem.
553continue
554}
555return undoBinds, fmt.Errorf("checking if directory %q is already read-only: %w", target, err)
556}
557if fs.Flags&unix.ST_RDONLY == unix.ST_RDONLY {
558continue
559}
560// Mount the location over itself, so that we can remount it as read-only, making
561// sure to preserve any combination of nodev/noexec/nosuid that's already in play.
562roFlags := mountFlagsForFSFlags(uintptr(fs.Flags)) | unix.MS_RDONLY
563if err := unix.Mount(target, target, "", bindFlags|roFlags, ""); err != nil {
564if errors.Is(err, os.ErrNotExist) {
565// No target, no problem.
566continue
567}
568return undoBinds, fmt.Errorf("bind mounting %q onto itself in preparation for making it read-only: %w", target, err)
569}
570// Remount the location read-only.
571if err = unix.Statfs(target, &fs); err != nil {
572return undoBinds, fmt.Errorf("checking if directory %q was bound read-only: %w", target, err)
573}
574if fs.Flags&unix.ST_RDONLY == 0 {
575if err := unix.Mount(target, target, "", unix.MS_REMOUNT|unix.MS_RDONLY|bindFlags|mountFlagsForFSFlags(uintptr(fs.Flags)), ""); err != nil {
576return undoBinds, fmt.Errorf("remounting %q in mount namespace read-only: %w", target, err)
577}
578}
579// Check again.
580if err = unix.Statfs(target, &fs); err != nil {
581return undoBinds, fmt.Errorf("checking if directory %q was remounted read-only: %w", target, err)
582}
583if fs.Flags&unix.ST_RDONLY == 0 {
584// Still not read only.
585return undoBinds, fmt.Errorf("verifying that %q in mount namespace was remounted read-only: %w", target, err)
586}
587}
588
589// Create an empty directory for to use for masking directories.
590roEmptyDir := filepath.Join(bundlePath, "empty")
591if len(spec.Linux.MaskedPaths) > 0 {
592if err := os.Mkdir(roEmptyDir, 0700); err != nil {
593return undoBinds, fmt.Errorf("creating empty directory %q: %w", roEmptyDir, err)
594}
595}
596
597// Set up any masked paths that we need to. If we're running inside of
598// a container, some of these locations will already be read-only tmpfs
599// filesystems or bind mounted to os.DevNull. If we're not running
600// inside of a container, and nobody else has done that, we'll do it.
601for _, masked := range spec.Linux.MaskedPaths {
602t := filepath.Join(spec.Root.Path, masked)
603target, err := filepath.EvalSymlinks(t)
604if err != nil {
605target = t
606}
607// Get some info about the target.
608targetinfo, err := os.Stat(target)
609if err != nil {
610if errors.Is(err, os.ErrNotExist) {
611// No target, no problem.
612continue
613}
614return undoBinds, fmt.Errorf("examining %q for masking in mount namespace: %w", target, err)
615}
616if targetinfo.IsDir() {
617// The target's a directory. Check if it's a read-only filesystem.
618var statfs unix.Statfs_t
619if err = unix.Statfs(target, &statfs); err != nil {
620return undoBinds, fmt.Errorf("checking if directory %q is a mountpoint: %w", target, err)
621}
622isReadOnly := statfs.Flags&unix.ST_RDONLY == unix.ST_RDONLY
623// Check if any of the IDs we're mapping could read it.
624var stat unix.Stat_t
625if err = unix.Stat(target, &stat); err != nil {
626return undoBinds, fmt.Errorf("checking permissions on directory %q: %w", target, err)
627}
628isAccessible := false
629if stat.Mode&unix.S_IROTH|unix.S_IXOTH != 0 {
630isAccessible = true
631}
632if !isAccessible && stat.Mode&unix.S_IROTH|unix.S_IXOTH != 0 {
633if len(spec.Linux.GIDMappings) > 0 {
634for _, mapping := range spec.Linux.GIDMappings {
635if stat.Gid >= mapping.ContainerID && stat.Gid < mapping.ContainerID+mapping.Size {
636isAccessible = true
637break
638}
639}
640}
641}
642if !isAccessible && stat.Mode&unix.S_IRUSR|unix.S_IXUSR != 0 {
643if len(spec.Linux.UIDMappings) > 0 {
644for _, mapping := range spec.Linux.UIDMappings {
645if stat.Uid >= mapping.ContainerID && stat.Uid < mapping.ContainerID+mapping.Size {
646isAccessible = true
647break
648}
649}
650}
651}
652// Check if it's empty.
653hasContent := false
654directory, err := os.Open(target)
655if err != nil {
656if !os.IsPermission(err) {
657return undoBinds, fmt.Errorf("opening directory %q: %w", target, err)
658}
659} else {
660names, err := directory.Readdirnames(0)
661directory.Close()
662if err != nil {
663return undoBinds, fmt.Errorf("reading contents of directory %q: %w", target, err)
664}
665hasContent = false
666for _, name := range names {
667switch name {
668case ".", "..":
669continue
670default:
671hasContent = true
672}
673if hasContent {
674break
675}
676}
677}
678// The target's a directory, so read-only bind mount an empty directory on it.
679roFlags := uintptr(syscall.MS_BIND | syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC | syscall.MS_RDONLY)
680if !isReadOnly || (hasContent && isAccessible) {
681if err = unix.Mount(roEmptyDir, target, "bind", roFlags, ""); err != nil {
682return undoBinds, fmt.Errorf("masking directory %q in mount namespace: %w", target, err)
683}
684if err = unix.Statfs(target, &fs); err != nil {
685return undoBinds, fmt.Errorf("checking if masked directory %q was mounted read-only in mount namespace: %w", target, err)
686}
687if fs.Flags&unix.ST_RDONLY == 0 {
688if err = unix.Mount(target, target, "", syscall.MS_REMOUNT|roFlags|mountFlagsForFSFlags(uintptr(fs.Flags)), ""); err != nil {
689return undoBinds, fmt.Errorf("making sure masked directory %q in mount namespace is read only: %w", target, err)
690}
691}
692}
693} else {
694// If the target's is not a directory or os.DevNull, bind mount os.DevNull over it.
695if !isDevNull(targetinfo) {
696if err = unix.Mount(os.DevNull, target, "", uintptr(syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_PRIVATE), ""); err != nil {
697return undoBinds, fmt.Errorf("masking non-directory %q in mount namespace: %w", target, err)
698}
699}
700}
701}
702return undoBinds, nil
703}
704
// setPdeathsig sets a parent-death signal for the process: SIGKILL is stored
// in cmd's SysProcAttr.Pdeathsig, allocating SysProcAttr first if cmd has
// none. Other fields of an existing SysProcAttr are left alone.
func setPdeathsig(cmd *exec.Cmd) {
	attr := cmd.SysProcAttr
	if attr == nil {
		attr = new(syscall.SysProcAttr)
		cmd.SysProcAttr = attr
	}
	attr.Pdeathsig = syscall.SIGKILL
}
712