package libpod

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"

	runcconfig "github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"

	"github.com/containers/common/pkg/cgroups"
	"github.com/containers/common/pkg/config"
	"github.com/containers/common/pkg/systemd"
	"github.com/containers/podman/v5/pkg/errorhandling"
	"github.com/containers/podman/v5/pkg/rootless"
	pmount "github.com/containers/storage/pkg/mount"
	spec "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
	type result struct {
		restoreDuration int64
		err             error
	}
	ch := make(chan result)
	go func() {
		runtime.LockOSThread()
		restoreDuration, err := func() (int64, error) {
			fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
			if err != nil {
				return 0, err
			}
			defer errorhandling.CloseQuiet(fd)

			// create a new mountns on the current thread
			if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
				return 0, err
			}
			// restore the original mount namespace when we are done
			defer func() {
				if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
					logrus.Errorf("Unable to clone new namespace: %q", err)
				}
			}()

			// don't spread our mounts around.  We are setting only /sys to be slave
			// so that the cleanup process is still able to umount the storage and the
			// changes are propagated to the host.
			err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
			if err != nil {
				return 0, fmt.Errorf("cannot make /sys slave: %w", err)
			}

			mounts, err := pmount.GetMounts()
			if err != nil {
				return 0, err
			}
			for _, m := range mounts {
				if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
					continue
				}
				err = unix.Unmount(m.Mountpoint, 0)
				if err != nil && !os.IsNotExist(err) {
					return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err)
				}
			}
			return r.createOCIContainer(ctr, restoreOptions)
		}()
		ch <- result{
			restoreDuration: restoreDuration,
			err:             err,
		}
	}()
	res := <-ch
	return res.restoreDuration, res.err
}

// Run the closure with the container's socket label set
func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error {
	runtime.LockOSThread()
	if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
		return err
	}
	err := closure()
	// Ignore error returned from SetSocketLabel("") call,
	// can't do much about it anyway.
	if labelErr := label.SetSocketLabel(""); labelErr == nil {
		// Unlock the thread only if the process label could be restored
		// successfully.  Otherwise leave the thread locked and the Go runtime
		// will terminate it once it returns to the thread pool.
		runtime.UnlockOSThread()
	} else {
		logrus.Errorf("Unable to reset socket label: %q", labelErr)
	}
	return err
}

// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup
// it then signals for conmon to start by sending nonce data down the start fd
func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error {
	mustCreateCgroup := true

	if ctr.config.NoCgroups {
		mustCreateCgroup = false
	}

	// If cgroup creation is disabled - just signal.
	switch ctr.config.CgroupsMode {
	case "disabled", "no-conmon", cgroupSplit:
		mustCreateCgroup = false
	}

	// $INVOCATION_ID is set by systemd when running as a service.
	if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" {
		mustCreateCgroup = false
	}

	if mustCreateCgroup {
		// Usually rootless users are not allowed to configure cgroupfs.
		// There are cases though, where it is allowed, e.g. if the cgroup
		// is manually configured and chowned.  Avoid detecting all
		// such cases and simply use a lower log level.
		logLevel := logrus.WarnLevel
		if rootless.IsRootless() {
			logLevel = logrus.InfoLevel
		}
		// TODO: This should be a switch - we are not guaranteed that
		// there are only 2 valid cgroup managers
		cgroupParent := ctr.CgroupParent()
		cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
		cgroupResources, err := GetLimits(ctr.LinuxResources())
		if err != nil {
			logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
		}
		if ctr.CgroupManager() == config.SystemdCgroupsManager {
			unitName := createUnitName("libpod-conmon", ctr.ID())
			realCgroupParent := cgroupParent
			splitParent := strings.Split(cgroupParent, "/")
			if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
				realCgroupParent = splitParent[len(splitParent)-1]
			}

			logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName)
			if err := systemd.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil {
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
			}
		} else {
			control, err := cgroups.New(cgroupPath, &cgroupResources)
			if err != nil {
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
			} else if err := control.AddPid(cmd.Process.Pid); err != nil {
				// we need to remove this defer and delete the cgroup once conmon exits
				// maybe need a conmon monitor?
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
			}
		}
	}

	/* We set the cgroup, now the child can start creating children */
	return writeConmonPipeData(startFd)
}
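
// Net effect of the branches above: with the systemd cgroup manager, conmon
// lands in a transient scope named by createUnitName under the container's
// slice; with the cgroupfs manager it is placed in the "conmon" sub-cgroup
// created under ctr.config.CgroupParent. Either way, conmon is kept out of
// the container's own cgroup so the container's limits do not apply to it.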

// GetLimits converts spec resource limits to cgroup-consumable limits
func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
	if resource == nil {
		resource = &spec.LinuxResources{}
	}
	final := &runcconfig.Resources{}
	devs := []*devices.Rule{}

	// Devices
	for _, entry := range resource.Devices {
		if entry.Major == nil || entry.Minor == nil {
			continue
		}
		runeType := 'a'
		switch entry.Type {
		case "b":
			runeType = 'b'
		case "c":
			runeType = 'c'
		}

		devs = append(devs, &devices.Rule{
			Type:        devices.Type(runeType),
			Major:       *entry.Major,
			Minor:       *entry.Minor,
			Permissions: devices.Permissions(entry.Access),
			Allow:       entry.Allow,
		})
	}
	final.Devices = devs

	// HugepageLimits
	pageLimits := []*runcconfig.HugepageLimit{}
	for _, entry := range resource.HugepageLimits {
		pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
			Pagesize: entry.Pagesize,
			Limit:    entry.Limit,
		})
	}
	final.HugetlbLimit = pageLimits

	// Networking
	netPriorities := []*runcconfig.IfPrioMap{}
	if resource.Network != nil {
		for _, entry := range resource.Network.Priorities {
			netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
				Interface: entry.Name,
				Priority:  int64(entry.Priority),
			})
		}
	}
	final.NetPrioIfpriomap = netPriorities
	rdma := make(map[string]runcconfig.LinuxRdma)
	for name, entry := range resource.Rdma {
		rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
	}
	final.Rdma = rdma

	// Memory
	if resource.Memory != nil {
		if resource.Memory.Limit != nil {
			final.Memory = *resource.Memory.Limit
		}
		if resource.Memory.Reservation != nil {
			final.MemoryReservation = *resource.Memory.Reservation
		}
		if resource.Memory.Swap != nil {
			final.MemorySwap = *resource.Memory.Swap
		}
		if resource.Memory.Swappiness != nil {
			final.MemorySwappiness = resource.Memory.Swappiness
		}
	}

	// CPU
	if resource.CPU != nil {
		if resource.CPU.Period != nil {
			final.CpuPeriod = *resource.CPU.Period
		}
		if resource.CPU.Quota != nil {
			final.CpuQuota = *resource.CPU.Quota
		}
		if resource.CPU.RealtimePeriod != nil {
			final.CpuRtPeriod = *resource.CPU.RealtimePeriod
		}
		if resource.CPU.RealtimeRuntime != nil {
			final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
		}
		if resource.CPU.Shares != nil {
			final.CpuShares = *resource.CPU.Shares
		}
		final.CpusetCpus = resource.CPU.Cpus
		final.CpusetMems = resource.CPU.Mems
	}

	// BlkIO
	if resource.BlockIO != nil {
		if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
				throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
				final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
			}
		}
		if resource.BlockIO.LeafWeight != nil {
			final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
		}
		if resource.BlockIO.Weight != nil {
			final.BlkioWeight = *resource.BlockIO.Weight
		}
		if len(resource.BlockIO.WeightDevice) > 0 {
			for _, entry := range resource.BlockIO.WeightDevice {
				var w, lw uint16
				if entry.Weight != nil {
					w = *entry.Weight
				}
				if entry.LeafWeight != nil {
					lw = *entry.LeafWeight
				}
				weight := runcconfig.NewWeightDevice(entry.Major, entry.Minor, w, lw)
				final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
			}
		}
	}

	// Pids
	if resource.Pids != nil {
		final.PidsLimit = resource.Pids.Limit
	}

	// Networking
	if resource.Network != nil {
		if resource.Network.ClassID != nil {
			final.NetClsClassid = *resource.Network.ClassID
		}
	}

	// Unified state
	final.Unified = resource.Unified

	return *final, nil
}

func moveToRuntimeCgroup() error {
	return cgroups.MoveUnderCgroupSubtree("runtime")
}
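
// A minimal usage sketch for GetLimits, with illustrative values:
//
//	limit := int64(512 * 1024 * 1024)
//	res, err := GetLimits(&spec.LinuxResources{
//		Memory: &spec.LinuxMemory{Limit: &limit},
//	})
//	// on success, res.Memory == limit; res can then be handed to
//	// cgroups.New as done in moveConmonToCgroupAndSignal above.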