17
"github.com/containers/common/libnetwork/slirp4netns"
18
"github.com/containers/common/libnetwork/types"
19
"github.com/containers/common/pkg/cgroups"
20
"github.com/containers/common/pkg/config"
21
"github.com/containers/podman/v5/libpod/define"
22
"github.com/containers/podman/v5/pkg/rootless"
23
spec "github.com/opencontainers/runtime-spec/specs-go"
24
"github.com/opencontainers/runtime-tools/generate"
25
"github.com/opencontainers/selinux/go-selinux/label"
26
"github.com/sirupsen/logrus"
27
"golang.org/x/sys/unix"
31
bindOptions = []string{define.TypeBind, "rprivate"}
34
func (c *Container) mountSHM(shmOptions string) error {
35
contextType := "context"
36
if c.config.LabelNested {
37
contextType = "rootcontext"
40
if err := unix.Mount("shm", c.config.ShmDir, define.TypeTmpfs, unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV,
41
label.FormatMountLabelByType(shmOptions, c.config.MountLabel, contextType)); err != nil {
42
return fmt.Errorf("failed to mount shm tmpfs %q: %w", c.config.ShmDir, err)
47
func (c *Container) unmountSHM(mount string) error {
48
if err := unix.Unmount(mount, 0); err != nil {
49
if err != syscall.EINVAL && err != syscall.ENOENT {
50
return fmt.Errorf("unmounting container %s SHM mount %s: %w", c.ID(), mount, err)
52
// If it's just an EINVAL or ENOENT, debug logs only
53
logrus.Debugf("Container %s failed to unmount %s : %v", c.ID(), mount, err)
58
// prepare mounts the container's storage and, when configured, creates its
// network namespace, recording the results in c.state. The errors of the two
// steps are merged into a single createErr; when only one of the steps
// succeeded its resources are cleaned up again. The container state is
// written with c.save().
60
func (c *Container) prepare() error {
64
networkStatus map[string]types.StatusBlock
65
// Outcome of the network namespace creation and of the storage mount.
createNetNSErr, mountStorageErr error
67
// Held (and released via defer) around the c.state updates below.
tmpStateLock sync.Mutex
74
// Set up network namespace if not already set up
75
noNetNS := c.state.NetNS == ""
76
// Skipped when the namespace is configured after creation (PostConfigureNetNS).
if c.config.CreateNetNS && noNetNS && !c.config.PostConfigureNetNS {
77
netNS, networkStatus, createNetNSErr = c.runtime.createNetNS(c)
78
if createNetNSErr != nil {
83
defer tmpStateLock.Unlock()
85
// Assign NetNS attributes to container
87
c.state.NetworkStatus = networkStatus
90
// Mount storage if not mounted
93
mountPoint, mountStorageErr = c.mountStorage()
95
if mountStorageErr != nil {
100
defer tmpStateLock.Unlock()
102
// Finish up mountStorage
103
c.state.Mounted = true
104
c.state.Mountpoint = mountPoint
106
logrus.Debugf("Created root filesystem for container %s at %s", c.ID(), c.state.Mountpoint)
112
// Merge both results into createErr.
if createNetNSErr != nil {
113
createErr = createNetNSErr
115
if mountStorageErr != nil {
116
// Only one error can be returned: log the one about to be replaced.
if createErr != nil {
117
logrus.Errorf("Preparing container %s: %v", c.ID(), createErr)
119
createErr = mountStorageErr
122
// Only trigger storage cleanup if mountStorage was successful.
123
// Otherwise, we may mess up mount counters.
124
if createNetNSErr != nil && mountStorageErr == nil {
125
if err := c.cleanupStorage(); err != nil {
126
// createErr is guaranteed non-nil, so print
128
logrus.Errorf("Preparing container %s: %v", c.ID(), createErr)
129
createErr = fmt.Errorf("unmounting storage for container %s after network create failure: %w", c.ID(), err)
133
// It's OK to unconditionally trigger network cleanup. If the network
134
// isn't ready it will do nothing.
135
if createErr != nil {
136
if err := c.cleanupNetwork(); err != nil {
137
// The previous error is logged because the cleanup error replaces it.
logrus.Errorf("Preparing container %s: %v", c.ID(), createErr)
138
createErr = fmt.Errorf("cleaning up container %s network after setup failure: %w", c.ID(), err)
142
if createErr != nil {
146
// Save changes to container state
147
if err := c.save(); err != nil {
154
// cleanupNetwork unmounts and cleans up the container's network.
// It checks NetNsCtr, NetworkDisabled() and an empty c.state.NetNS before
// tearing anything down. A teardown failure is logged with logrus.Errorf and
// the cached network status in c.state is cleared.
155
func (c *Container) cleanupNetwork() error {
156
// NetNsCtr names another container whose network namespace is joined.
if c.config.NetNsCtr != "" {
159
netDisabled, err := c.NetworkDisabled()
166
// An empty NetNS in the state means there is nothing left to tear down.
if c.state.NetNS == "" {
167
logrus.Debugf("Network is already cleaned up, skipping...")
171
// Stop the container's network namespace (if it has one)
172
if err := c.runtime.teardownNetNS(c); err != nil {
173
logrus.Errorf("Unable to clean up network for container %s: %q", c.ID(), err)
177
c.state.NetworkStatus = nil
186
// reloadNetwork reloads the network for the given container through
// c.runtime.reloadContainerNetwork and stores the returned status in
// c.state.NetworkStatus.
188
func (c *Container) reloadNetwork() error {
189
result, err := c.runtime.reloadContainerNetwork(c)
194
c.state.NetworkStatus = result
199
// setupSystemd adds the environment and mounts needed for running systemd
// inside the container to the spec generator g.
//
// systemd expects to have /run, /run/lock and /tmp on tmpfs
200
// It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal
201
func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error {
202
// container_uuid is only injected when the spec does not already define it.
var containerUUIDSet bool
203
for _, s := range c.config.Spec.Process.Env {
204
if strings.HasPrefix(s, "container_uuid=") {
205
containerUUIDSet = true
209
if !containerUUIDSet {
210
// The value is the first 32 characters of the container ID.
g.AddProcessEnv("container_uuid", c.ID()[:32])
212
// limit systemd-specific tmpfs mounts if specified
213
// while creating a pod or ctr, if not, default back to 50%
214
var shmSizeSystemdMntOpt string
215
if c.config.ShmSizeSystemd != 0 {
216
shmSizeSystemdMntOpt = fmt.Sprintf("size=%d", c.config.ShmSizeSystemd)
218
options := []string{"rw", "rprivate", "nosuid", "nodev"}
219
for _, dest := range []string{"/run", "/run/lock"} {
220
// Destinations that already have a mount in the spec are checked first.
if MountExists(mounts, dest) {
223
tmpfsMnt := spec.Mount{
225
Type: define.TypeTmpfs,
226
Source: define.TypeTmpfs,
227
// NOTE(review): when ShmSizeSystemd is 0, shmSizeSystemdMntOpt is ""
// and an empty option string is appended here - confirm this is intended.
Options: append(options, "tmpcopyup", shmSizeSystemdMntOpt),
231
for _, dest := range []string{"/tmp", "/var/log/journal"} {
232
if MountExists(mounts, dest) {
235
tmpfsMnt := spec.Mount{
237
Type: define.TypeTmpfs,
238
Source: define.TypeTmpfs,
239
// NOTE(review): same empty-option caveat as for the /run mounts.
Options: append(options, "tmpcopyup", shmSizeSystemdMntOpt),
244
unified, err := cgroups.IsCgroup2UnifiedMode()
250
// Look for a cgroup namespace in the container spec.
for _, ns := range c.config.Spec.Linux.Namespaces {
251
if ns.Type == spec.CgroupNamespace {
258
// Any existing /sys/fs/cgroup mount is replaced by systemdMnt below.
g.RemoveMount("/sys/fs/cgroup")
260
var systemdMnt spec.Mount
262
systemdMnt = spec.Mount{
263
Destination: "/sys/fs/cgroup",
266
Options: []string{"private", "rw"},
269
// Alternative: bind mount /sys/fs/cgroup from the host.
systemdMnt = spec.Mount{
270
Destination: "/sys/fs/cgroup",
271
Type: define.TypeBind,
272
Source: "/sys/fs/cgroup",
273
Options: []string{define.TypeBind, "private", "rw"},
276
g.AddMount(systemdMnt)
278
hasSystemdMount := MountExists(mounts, "/sys/fs/cgroup/systemd")
279
if hasCgroupNs && !hasSystemdMount {
280
return errors.New("cgroup namespace is not supported with cgroup v1 and systemd mode")
282
mountOptions := []string{define.TypeBind, "rprivate"}
284
if !hasSystemdMount {
285
// NOTE(review): hasSystemdMount is false inside this branch, so
// skipMount always starts out false.
skipMount := hasSystemdMount
286
var statfs unix.Statfs_t
287
if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil {
288
if errors.Is(err, os.ErrNotExist) {
289
// If the mount is missing on the host, we cannot bind mount it so
293
mountOptions = append(mountOptions, "nodev", "noexec", "nosuid")
295
// Copy the nodev/noexec/nosuid/ro flags reported by statfs for the host
// mount into the bind mount options.
if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV {
296
mountOptions = append(mountOptions, "nodev")
298
if statfs.Flags&unix.MS_NOEXEC == unix.MS_NOEXEC {
299
mountOptions = append(mountOptions, "noexec")
301
if statfs.Flags&unix.MS_NOSUID == unix.MS_NOSUID {
302
mountOptions = append(mountOptions, "nosuid")
304
if statfs.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
305
mountOptions = append(mountOptions, "ro")
309
systemdMnt := spec.Mount{
310
Destination: "/sys/fs/cgroup/systemd",
311
Type: define.TypeBind,
312
Source: "/sys/fs/cgroup/systemd",
313
Options: mountOptions,
315
g.AddMount(systemdMnt)
316
// Mask release_agent below the bind-mounted systemd cgroup hierarchy.
g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent")
324
// addNamespaceContainer adds an existing container's namespace to the spec.
// The dependency container ctr is looked up in the runtime state, its path
// for namespace ns is resolved, and that path is set for specNS in g.
325
func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr string, specNS spec.LinuxNamespaceType) error {
326
nsCtr, err := c.runtime.state.Container(ctr)
328
return fmt.Errorf("retrieving dependency %s of container %s from state: %w", ctr, c.ID(), err)
331
if specNS == spec.UTSNamespace {
332
hostname := nsCtr.Hostname()
333
// Joining an existing namespace, cannot set the hostname
335
// HOSTNAME in the environment takes the dependency container's hostname.
g.AddProcessEnv("HOSTNAME", hostname)
338
nsPath, err := nsCtr.NamespacePath(ns)
343
if err := g.AddOrReplaceLinuxNamespace(string(specNS), nsPath); err != nil {
350
func isRootlessCgroupSet(cgroup string) bool {
351
// old versions of podman were setting the CgroupParent to CgroupfsDefaultCgroupParent
352
// by default. Avoid breaking these versions and check whether the cgroup parent is
353
// set to the default and in this case enable the old behavior. It should not be a real
354
// problem because the default CgroupParent is usually owned by root so rootless users
356
// This check might be lifted in a future version of Podman.
357
// Check both that the cgroup or its parent is set to the default value (used by pods).
358
return cgroup != CgroupfsDefaultCgroupParent && filepath.Dir(cgroup) != CgroupfsDefaultCgroupParent
361
// expectPodCgroup decides, from the container's cgroup manager, whether podman
// runs rootless, and whether cgroup v2 unified mode is active, if a pod cgroup
// is expected. An unrecognized cgroup manager yields an error wrapping
// define.ErrInvalidArg.
func (c *Container) expectPodCgroup() (bool, error) {
362
unified, err := cgroups.IsCgroup2UnifiedMode()
366
cgroupManager := c.CgroupManager()
368
case c.config.NoCgroups:
370
case cgroupManager == config.SystemdCgroupsManager:
371
// systemd manager: expected for root, or for rootless on cgroup v2.
return !rootless.IsRootless() || unified, nil
372
case cgroupManager == config.CgroupfsCgroupsManager:
373
// cgroupfs manager: expected only when not rootless.
return !rootless.IsRootless(), nil
375
return false, fmt.Errorf("invalid cgroup mode %s requested for pods: %w", cgroupManager, define.ErrInvalidArg)
379
// getOCICgroupPath returns the cgroup path in a format suitable for the OCI
// spec. The format depends on the cgroups mode and cgroup manager:
// "<self cgroup>/libpod-payload-<id>" for split mode,
// "<base of CgroupParent>:libpod:<id>" for the systemd manager and
// "<CgroupParent>/libpod-<id>" for the cgroupfs manager. An unrecognized
// manager yields an error wrapping define.ErrInvalidArg.
380
func (c *Container) getOCICgroupPath() (string, error) {
381
unified, err := cgroups.IsCgroup2UnifiedMode()
385
cgroupManager := c.CgroupManager()
387
case c.config.NoCgroups:
389
case c.config.CgroupsMode == cgroupSplit:
390
// Split mode places the payload below podman's own cgroup.
selfCgroup, err := cgroups.GetOwnCgroupDisallowRoot()
394
return filepath.Join(selfCgroup, fmt.Sprintf("libpod-payload-%s", c.ID())), nil
395
case cgroupManager == config.SystemdCgroupsManager:
396
// When the OCI runtime is set to use Systemd as a cgroup manager, it
397
// expects cgroups to be passed as follows:
399
systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
400
logrus.Debugf("Setting Cgroups for container %s to %s", c.ID(), systemdCgroups)
401
return systemdCgroups, nil
402
// Rootless with the cgroupfs manager, or rootless without cgroup v2.
case (rootless.IsRootless() && (cgroupManager == config.CgroupfsCgroupsManager || !unified)):
403
if c.config.CgroupParent == "" || !isRootlessCgroupSet(c.config.CgroupParent) {
407
case cgroupManager == config.CgroupfsCgroupsManager:
408
cgroupPath := filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID()))
409
logrus.Debugf("Setting Cgroup path for container %s to %s", c.ID(), cgroupPath)
410
return cgroupPath, nil
412
return "", fmt.Errorf("invalid cgroup manager %s requested: %w", cgroupManager, define.ErrInvalidArg)
416
func openDirectory(path string) (fd int, err error) {
417
return unix.Open(path, unix.O_RDONLY|unix.O_PATH, 0)
420
func (c *Container) addNetworkNamespace(g *generate.Generator) error {
421
if c.config.CreateNetNS {
422
if c.config.PostConfigureNetNS {
423
if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil {
427
if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS); err != nil {
435
// addSystemdMounts calls setupSystemd with the generator's current mounts.
// Note that setupSystemd receives the generator by value (*g).
func (c *Container) addSystemdMounts(g *generate.Generator) error {
437
if err := c.setupSystemd(g.Mounts(), *g); err != nil {
444
// addSharedNamespaces wires the namespaces this container joins from other
// containers (IPC, mount, network, PID, user, UTS, cgroup) into the spec
// generator. It also adjusts the UID/GID mappings, sets the hostname and the
// HOSTNAME environment variable, and configures a new user namespace when
// automatic user namespaces are requested.
func (c *Container) addSharedNamespaces(g *generate.Generator) error {
445
if c.config.IPCNsCtr != "" {
446
if err := c.addNamespaceContainer(g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil {
450
if c.config.MountNsCtr != "" {
451
if err := c.addNamespaceContainer(g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil {
455
if c.config.NetNsCtr != "" {
456
if err := c.addNamespaceContainer(g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil {
460
if c.config.PIDNsCtr != "" {
461
if err := c.addNamespaceContainer(g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil {
465
if c.config.UserNsCtr != "" {
466
if err := c.addNamespaceContainer(g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil {
469
if len(g.Config.Linux.UIDMappings) == 0 {
470
// runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping
471
g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1))
472
g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1))
476
availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps()
478
if os.IsNotExist(err) {
479
// The kernel-provided files only exist if user namespaces are supported
480
logrus.Debugf("User or group ID mappings not available: %s", err)
485
// Pass the configured mappings through MaybeSplitMappings together with the
// ID ranges reported as available.
g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs)
486
g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs)
489
// Hostname handling:
490
// If we have a UTS namespace, set Hostname in the OCI spec.
491
// Set the HOSTNAME environment variable unless explicitly overridden by
492
// the user (already present in OCI spec). If we don't have a UTS ns,
493
// set it to the host's hostname instead.
494
hostname := c.Hostname()
497
for _, i := range c.config.Spec.Linux.Namespaces {
498
// An empty Path is matched here; joined namespaces are handled through
// UTSNsCtr further down.
if i.Type == spec.UTSNamespace && i.Path == "" {
500
g.SetHostname(hostname)
505
tmpHostname, err := os.Hostname()
509
hostname = tmpHostname
512
// Look for an existing HOSTNAME entry in the spec environment.
for _, checkEnv := range g.Config.Process.Env {
513
if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" {
519
g.AddProcessEnv("HOSTNAME", hostname)
522
if c.config.UTSNsCtr != "" {
523
if err := c.addNamespaceContainer(g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil {
527
if c.config.CgroupNsCtr != "" {
528
if err := c.addNamespaceContainer(g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil {
533
// Automatic user namespace: add a new user namespace with an empty path and
// replace the spec's ID mappings with the container's configured ones.
if c.config.UserNsCtr == "" && c.config.IDMappings.AutoUserNs {
534
if err := g.AddOrReplaceLinuxNamespace(string(spec.UserNamespace), ""); err != nil {
537
g.ClearLinuxUIDMappings()
538
for _, uidmap := range c.config.IDMappings.UIDMap {
539
g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size))
541
g.ClearLinuxGIDMappings()
542
for _, gidmap := range c.config.IDMappings.GIDMap {
543
g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size))
549
// addRootPropagation derives the root filesystem propagation mode from the
// options of the given mounts and, when one was found, sets it on the spec
// generator. A shared option takes precedence over a slave option.
func (c *Container) addRootPropagation(g *generate.Generator, mounts []spec.Mount) error {
550
// Determine property of RootPropagation based on volume properties. If
551
// a volume is shared, then keep root propagation shared. This should
552
// work for slave and private volumes too.
554
// For slave volumes, it can be either [r]shared/[r]slave.
556
// For private volumes any root propagation value should work.
557
rootPropagation := ""
558
for _, m := range mounts {
559
for _, opt := range m.Options {
561
case MountShared, MountRShared:
562
// Upgrade to shared unless a shared mode was already chosen.
if rootPropagation != MountShared && rootPropagation != MountRShared {
563
rootPropagation = MountShared
565
case MountSlave, MountRSlave:
566
// Slave never overrides a previously chosen shared or slave mode.
if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave {
567
rootPropagation = MountRSlave
572
// An empty value means no mount asked for shared or slave propagation.
if rootPropagation != "" {
573
logrus.Debugf("Set root propagation to %q", rootPropagation)
574
if err := g.SetLinuxRootPropagation(rootPropagation); err != nil {
581
func (c *Container) setProcessLabel(g *generate.Generator) {
582
g.SetProcessSelinuxLabel(c.ProcessLabel())
585
func (c *Container) setMountLabel(g *generate.Generator) {
586
g.SetLinuxMountLabel(c.MountLabel())
589
func (c *Container) setCgroupsPath(g *generate.Generator) error {
590
cgroupPath, err := c.getOCICgroupPath()
594
g.SetLinuxCgroupsPath(cgroupPath)
598
// addSpecialDNS adds special dns servers for slirp4netns and pasta
599
func (c *Container) addSpecialDNS(nameservers []string) []string {
600
if c.pastaResult != nil {
601
nameservers = append(nameservers, c.pastaResult.DNSForwardIPs...)
604
// slirp4netns has a built in DNS forwarder.
605
if c.config.NetMode.IsSlirp4netns() {
606
slirp4netnsDNS, err := slirp4netns.GetDNS(c.slirp4netnsSubnet)
608
logrus.Warn("Failed to determine Slirp4netns DNS: ", err.Error())
610
nameservers = append(nameservers, slirp4netnsDNS.String())
616
// isSlirp4netnsIPv6 inspects the slirp4netns options for an explicit
// enable_ipv6 setting. The engine-wide NetworkCmdOptions come first, followed
// by the container's own slirp4netns options, and the combined list is scanned
// from the end.
func (c *Container) isSlirp4netnsIPv6() bool {
617
if c.config.NetMode.IsSlirp4netns() {
618
extraOptions := c.config.NetworkOptions[slirp4netns.BinaryName]
619
// Pre-sized to hold both option lists.
options := make([]string, 0, len(c.runtime.config.Engine.NetworkCmdOptions.Get())+len(extraOptions))
620
options = append(options, c.runtime.config.Engine.NetworkCmdOptions.Get()...)
621
options = append(options, extraOptions...)
623
// loop backwards as the last argument wins and we can exit early
624
// This should be kept in sync with c/common/libnetwork/slirp4netns.
625
for i := len(options) - 1; i >= 0; i-- {
627
case "enable_ipv6=true":
629
case "enable_ipv6=false":
641
// hasNetNone looks for a network namespace entry in the container spec when
// the container does not create its own network namespace (CreateNetNS is
// false).
// NOTE(review): presumably reports the "--network none" case - confirm.
func (c *Container) hasNetNone() bool {
642
if !c.config.CreateNetNS {
643
for _, ns := range c.config.Spec.Linux.Namespaces {
644
if ns.Type == spec.NetworkNamespace {
654
// setVolumeAtime restores the access time recorded in st on mountPoint while
// keeping st's modification time.
//
// st must come from a stat call whose Sys() value is a *syscall.Stat_t. Any
// other value (including nil) produces an error instead of a panic from an
// unchecked type assertion. Errors from os.Chtimes are returned unchanged.
func setVolumeAtime(mountPoint string, st os.FileInfo) error {
	stat, ok := st.Sys().(*syscall.Stat_t)
	if !ok {
		return fmt.Errorf("reading access time for %q: unexpected stat type %T", mountPoint, st.Sys())
	}
	atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert
	return os.Chtimes(mountPoint, atime, st.ModTime())
}
663
func (c *Container) makePlatformBindMounts() error {
664
// Make /etc/hostname
665
// This should never change, so no need to recreate if it exists
666
if _, ok := c.state.BindMounts["/etc/hostname"]; !ok {
667
hostnamePath, err := c.writeStringToRundir("hostname", c.Hostname())
669
return fmt.Errorf("creating hostname file for container %s: %w", c.ID(), err)
671
c.state.BindMounts["/etc/hostname"] = hostnamePath
676
// getConmonPidFd tries to open a pidfd for the conmon process recorded in
// c.state.ConmonPID (skipped when the PID is 0). ENOSYS and ESRCH failures are
// silent; any other pidfd_open error is logged at debug level.
func (c *Container) getConmonPidFd() int {
677
if c.state.ConmonPID != 0 {
678
// Track lifetime of conmon precisely using pidfd_open + poll.
679
// There are many cases for this to fail, for instance conmon is dead
680
// or pidfd_open is not supported (pre linux 5.3), so fall back to the
681
// traditional loop with poll + sleep
682
if fd, err := unix.PidfdOpen(c.state.ConmonPID, 0); err == nil {
684
// ENOSYS and ESRCH are not logged.
} else if err != unix.ENOSYS && err != unix.ESRCH {
685
logrus.Debugf("PidfdOpen(%d) failed: %v", c.state.ConmonPID, err)
691
// safeMountInfo describes a mount created by safeMountSubPath.
type safeMountInfo struct {
692
// file is the open File.
695
// mountPoint is the mount point; Close unmounts it.
699
// Close releases the resources allocated with the safe mount info.
// The mount point is detached lazily (MNT_DETACH) and the unmount error is
// deliberately discarded.
700
func (s *safeMountInfo) Close() {
701
_ = unix.Unmount(s.mountPoint, unix.MNT_DETACH)
705
// safeMountSubPath securely mounts a subpath inside a volume to a new temporary location.
706
// The function checks that the subpath is a valid subpath within the volume and that it
707
// does not escape the boundaries of the mount point (volume).
709
// The caller is responsible for closing the file descriptor and unmounting the subpath
710
// when it's no longer needed.
711
func (c *Container) safeMountSubPath(mountPoint, subpath string) (s *safeMountInfo, err error) {
712
joinedPath := filepath.Clean(filepath.Join(mountPoint, subpath))
713
// O_PATH yields a descriptor that only refers to the location.
fd, err := unix.Open(joinedPath, unix.O_RDONLY|unix.O_PATH, 0)
717
f := os.NewFile(uintptr(fd), joinedPath)
724
// Once we have the file descriptor, we need to check that the subpath is valid. We
725
// refer to the open FD so there won't be other path lookups (and no risk to follow a symlink).
726
fdPath := fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), f.Fd())
727
// Path that the descriptor points at, as reported by the /proc link.
p, err := os.Readlink(fdPath)
731
relPath, err := filepath.Rel(mountPoint, p)
735
// A relative path that climbs out of mountPoint means the subpath escaped the volume.
if relPath == ".." || strings.HasPrefix(relPath, "../") {
736
return nil, fmt.Errorf("subpath %q is outside of the volume %q", subpath, mountPoint)
739
fi, err := os.Stat(fdPath)
745
case fi.Mode()&fs.ModeSymlink != 0:
746
return nil, fmt.Errorf("file %q is a symlink", joinedPath)
748
// Temporary directory in the container's run directory as mount target.
npath, err = os.MkdirTemp(c.state.RunDir, "subpath")
753
// Temporary file in the container's run directory.
tmp, err := os.CreateTemp(c.state.RunDir, "subpath")
760
// Recursive bind mount of the verified descriptor path onto the temporary target.
if err := unix.Mount(fdPath, npath, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
763
return &safeMountInfo{
769
func (c *Container) makePlatformMtabLink(etcInTheContainerFd, rootUID, rootGID int) error {
770
// If /etc/mtab does not exist in container image, then we need to
771
// create it, so that mount command within the container will work.
772
err := unix.Symlinkat("/proc/mounts", etcInTheContainerFd, "mtab")
773
if err != nil && !os.IsExist(err) {
774
return fmt.Errorf("creating /etc/mtab symlink: %w", err)
776
// If the symlink was created, then also chown it to root in the container
777
if err == nil && (rootUID != 0 || rootGID != 0) {
778
err = unix.Fchownat(etcInTheContainerFd, "mtab", rootUID, rootGID, unix.AT_SYMLINK_NOFOLLOW)
780
return fmt.Errorf("chown /etc/mtab: %w", err)
786
// getPlatformRunPath returns a path string and an error.
// NOTE(review): presumably the platform's runtime directory - confirm
// against the implementation.
func (c *Container) getPlatformRunPath() (string, error) {
790
func (c *Container) addMaskedPaths(g *generate.Generator) {
791
if !c.config.Privileged && g.Config != nil && g.Config.Linux != nil && len(g.Config.Linux.MaskedPaths) > 0 {
792
g.AddLinuxMaskedPaths("/sys/devices/virtual/powercap")
796
func (c *Container) hasPrivateUTS() bool {
798
if c.config.Spec.Linux != nil {
799
for _, ns := range c.config.Spec.Linux.Namespaces {
800
if ns.Type == spec.UTSNamespace {