package libpod

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"

	runcconfig "github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"

	"github.com/containers/common/pkg/cgroups"
	"github.com/containers/common/pkg/config"
	"github.com/containers/common/pkg/systemd"
	"github.com/containers/podman/v5/pkg/errorhandling"
	"github.com/containers/podman/v5/pkg/rootless"
	pmount "github.com/containers/storage/pkg/mount"
	spec "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
	type result struct {
		restoreDuration int64
		err             error
	}
	ch := make(chan result)
	go func() {
		runtime.LockOSThread()
		restoreDuration, err := func() (int64, error) {
			fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
			if err != nil {
				return 0, err
			}
			defer errorhandling.CloseQuiet(fd)

			// create a new mountns on the current thread
			if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
				return 0, err
			}
			// restore the original mount namespace when we are done
			defer func() {
				if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
					logrus.Errorf("Unable to clone new namespace: %q", err)
				}
			}()

			// don't spread our mounts around.  We are setting only /sys to be slave
			// so that the cleanup process is still able to umount the storage and the
			// changes are propagated to the host.
			err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
			if err != nil {
				return 0, fmt.Errorf("cannot make /sys slave: %w", err)
			}

			mounts, err := pmount.GetMounts()
			if err != nil {
				return 0, err
			}
			for _, m := range mounts {
				if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
					continue
				}
				err = unix.Unmount(m.Mountpoint, 0)
				if err != nil && !os.IsNotExist(err) {
					return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err)
				}
			}
			return r.createOCIContainer(ctr, restoreOptions)
		}()
		ch <- result{
			restoreDuration: restoreDuration,
			err:             err,
		}
	}()
	res := <-ch
	return res.restoreDuration, res.err
}

// Run the closure with the container's socket label set
func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error {
	runtime.LockOSThread()
	if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
		return err
	}
	err := closure()
	// Ignore error returned from SetSocketLabel("") call,
	// can't do much about it anyway.
	if labelErr := label.SetSocketLabel(""); labelErr == nil {
		// Unlock the thread only if the process label could be restored
		// successfully.  Otherwise leave the thread locked and the Go runtime
		// will terminate it once it returns to the thread pool.
		runtime.UnlockOSThread()
	} else {
		logrus.Errorf("Unable to reset socket label: %q", labelErr)
	}
	return err
}

// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup
// it then signals for conmon to start by sending nonce data down the start fd
func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error {
	mustCreateCgroup := true

	if ctr.config.NoCgroups {
		mustCreateCgroup = false
	}

	// If cgroup creation is disabled - just signal.
	switch ctr.config.CgroupsMode {
	case "disabled", "no-conmon", cgroupSplit:
		mustCreateCgroup = false
	}

	// $INVOCATION_ID is set by systemd when running as a service.
	if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" {
		mustCreateCgroup = false
	}

	if mustCreateCgroup {
		// Usually rootless users are not allowed to configure cgroupfs.
		// There are cases though, where it is allowed, e.g. if the cgroup
		// is manually configured and chowned.  Avoid detecting all
		// such cases and simply use a lower log level.
		logLevel := logrus.WarnLevel
		if rootless.IsRootless() {
			logLevel = logrus.InfoLevel
		}
		// TODO: This should be a switch - we are not guaranteed that
		// there are only 2 valid cgroup managers
		cgroupParent := ctr.CgroupParent()
		cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
		cgroupResources, err := GetLimits(ctr.LinuxResources())
		if err != nil {
			logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
		}
		if ctr.CgroupManager() == config.SystemdCgroupsManager {
			unitName := createUnitName("libpod-conmon", ctr.ID())
			realCgroupParent := cgroupParent
			splitParent := strings.Split(cgroupParent, "/")
			if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
				realCgroupParent = splitParent[len(splitParent)-1]
			}

			logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName)
			if err := systemd.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil {
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
			}
		} else {
			control, err := cgroups.New(cgroupPath, &cgroupResources)
			if err != nil {
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
			} else if err := control.AddPid(cmd.Process.Pid); err != nil {
				// we need to remove this defer and delete the cgroup once conmon exits
				// maybe need a conmon monitor?
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
			}
		}
	}

	/* We set the cgroup, now the child can start creating children */
	return writeConmonPipeData(startFd)
}
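
// Net effect of the branches above: with the systemd cgroup manager, conmon
// lands in a transient scope named by createUnitName under the container's
// slice; with the cgroupfs manager it is placed in the "conmon" sub-cgroup
// created under ctr.config.CgroupParent. Either way, conmon is kept out of
// the container's own cgroup so the container's limits do not apply to it.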

// GetLimits converts spec resource limits to cgroup-consumable limits
func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
	if resource == nil {
		resource = &spec.LinuxResources{}
	}
	final := &runcconfig.Resources{}
	devs := []*devices.Rule{}

	// Devices
	for _, entry := range resource.Devices {
		if entry.Major == nil || entry.Minor == nil {
			continue
		}
		runeType := 'a'
		switch entry.Type {
		case "b":
			runeType = 'b'
		case "c":
			runeType = 'c'
		}

		devs = append(devs, &devices.Rule{
			Type:        devices.Type(runeType),
			Major:       *entry.Major,
			Minor:       *entry.Minor,
			Permissions: devices.Permissions(entry.Access),
			Allow:       entry.Allow,
		})
	}
	final.Devices = devs

	// HugepageLimits
	pageLimits := []*runcconfig.HugepageLimit{}
	for _, entry := range resource.HugepageLimits {
		pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
			Pagesize: entry.Pagesize,
			Limit:    entry.Limit,
		})
	}
	final.HugetlbLimit = pageLimits

	// Networking
	netPriorities := []*runcconfig.IfPrioMap{}
	if resource.Network != nil {
		for _, entry := range resource.Network.Priorities {
			netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
				Interface: entry.Name,
				Priority:  int64(entry.Priority),
			})
		}
	}
	final.NetPrioIfpriomap = netPriorities
	rdma := make(map[string]runcconfig.LinuxRdma)
	for name, entry := range resource.Rdma {
		rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
	}
	final.Rdma = rdma

	// Memory
	if resource.Memory != nil {
		if resource.Memory.Limit != nil {
			final.Memory = *resource.Memory.Limit
		}
		if resource.Memory.Reservation != nil {
			final.MemoryReservation = *resource.Memory.Reservation
		}
		if resource.Memory.Swap != nil {
			final.MemorySwap = *resource.Memory.Swap
		}
		if resource.Memory.Swappiness != nil {
			final.MemorySwappiness = resource.Memory.Swappiness
		}
	}

	// CPU
	if resource.CPU != nil {
		if resource.CPU.Period != nil {
			final.CpuPeriod = *resource.CPU.Period
		}
		if resource.CPU.Quota != nil {
			final.CpuQuota = *resource.CPU.Quota
		}
		if resource.CPU.RealtimePeriod != nil {
			final.CpuRtPeriod = *resource.CPU.RealtimePeriod
		}
		if resource.CPU.RealtimeRuntime != nil {
			final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
		}
		if resource.CPU.Shares != nil {
			final.CpuShares = *resource.CPU.Shares
		}
		final.CpusetCpus = resource.CPU.Cpus
		final.CpusetMems = resource.CPU.Mems
	}

	// BlkIO
	if resource.BlockIO != nil {
		if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
			}
		}
		if resource.BlockIO.LeafWeight != nil {
			final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
		}
		if resource.BlockIO.Weight != nil {
			final.BlkioWeight = *resource.BlockIO.Weight
		}
		if len(resource.BlockIO.WeightDevice) > 0 {
			for _, entry := range resource.BlockIO.WeightDevice {
				var w, lw uint16
				if entry.Weight != nil {
					w = *entry.Weight
				}
				if entry.LeafWeight != nil {
					lw = *entry.LeafWeight
				}
				weight := runcconfig.NewWeightDevice(entry.Major, entry.Minor, w, lw)
				final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
			}
		}
	}

	// Pids
	if resource.Pids != nil {
		final.PidsLimit = resource.Pids.Limit
	}

	// Networking
	if resource.Network != nil {
		if resource.Network.ClassID != nil {
			final.NetClsClassid = *resource.Network.ClassID
		}
	}

	// Unified state
	final.Unified = resource.Unified

	return *final, nil
}

func moveToRuntimeCgroup() error {
	return cgroups.MoveUnderCgroupSubtree("runtime")
}
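
// A minimal usage sketch for GetLimits, with illustrative values:
//
//	limit := int64(512 * 1024 * 1024)
//	res, err := GetLimits(&spec.LinuxResources{
//		Memory: &spec.LinuxMemory{Limit: &limit},
//	})
//	// on success, res.Memory == limit; res can then be handed to
//	// cgroups.New as done in moveConmonToCgroupAndSignal above.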