podman

Форк
0
/
container_top_linux.go 
408 строк · 12.3 Кб
1
//go:build !remote && linux && cgo
2

3
package libpod
4

5
import (
6
	"bufio"
7
	"bytes"
8
	"errors"
9
	"fmt"
10
	"os"
11
	"os/exec"
12
	"path/filepath"
13
	"runtime"
14
	"strconv"
15
	"strings"
16
	"syscall"
17
	"unsafe"
18

19
	"github.com/containers/podman/v5/libpod/define"
20
	"github.com/containers/podman/v5/pkg/rootless"
21
	"github.com/containers/psgo"
22
	"github.com/containers/storage/pkg/reexec"
23
	"github.com/google/shlex"
24
	"github.com/sirupsen/logrus"
25
	"golang.org/x/exp/slices"
26
	"golang.org/x/sys/unix"
27
)
28

29
/*
30
#include <stdlib.h>
31
void fork_exec_ps();
32
void create_argv(int len);
33
void set_argv(int pos, char *arg);
34
*/
35
import "C"
36

37
const (
	// podmanTopCommand is the reexec key to safely setup the environment for ps to be executed
	podmanTopCommand = "podman-top"

	// podmanTopExitCode is a special exit code to signal that podman failed to do something in
	// the reexec command, not ps. This is used to give a better error.
	podmanTopExitCode = 255
)
45

46
func init() {
	// Register the reexec handler: when this binary is re-invoked with
	// podmanTopCommand as argv[0], podmanTopMain runs instead of normal startup.
	reexec.Register(podmanTopCommand, podmanTopMain)
}
49

50
// podmanTopMain - main function for the reexec
51
func podmanTopMain() {
52
	if err := podmanTopInner(); err != nil {
53
		fmt.Fprint(os.Stderr, err.Error())
54
		os.Exit(podmanTopExitCode)
55
	}
56
	os.Exit(0)
57
}
58

59
// podmanTopInner os.Args = {command name} {pid} {psPath} [args...]
// We are reexec'd in a new mountns, then we need to set some security settings in order
// to safely execute ps in the container pid namespace. Most notably make sure podman and
// ps are read only to prevent a process from overwriting it.
// Returns an error only on setup failure; on success C.fork_exec_ps() exits the
// process and the final return is never reached.
func podmanTopInner() error {
	if len(os.Args) < 3 {
		return fmt.Errorf("internal error, need at least two arguments")
	}

	// We have to lock the thread as we a) switch namespace below and b) use PR_SET_PDEATHSIG
	// Also do not unlock as this thread should not be reused by go we exit anyway at the end.
	runtime.LockOSThread()

	// Die with the parent so no orphaned process is left inside the container pidns.
	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
		return fmt.Errorf("PR_SET_PDEATHSIG: %w", err)
	}
	// Not dumpable: prevents other processes from ptracing/inspecting us.
	if err := unix.Prctl(unix.PR_SET_DUMPABLE, 0, 0, 0, 0); err != nil {
		return fmt.Errorf("PR_SET_DUMPABLE: %w", err)
	}

	// Make sure the exec'd ps cannot gain privileges (setuid bits etc.).
	if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
		return fmt.Errorf("PR_SET_NO_NEW_PRIVS: %w", err)
	}

	// Make mounts private so the read-only changes below do not propagate to the host.
	if err := unix.Mount("none", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil {
		return fmt.Errorf("make / mount private: %w", err)
	}

	psPath := os.Args[2]

	// try to mount everything read only
	if err := unix.MountSetattr(0, "/", unix.AT_RECURSIVE, &unix.MountAttr{
		Attr_set: unix.MOUNT_ATTR_RDONLY,
	}); err != nil {
		if err != unix.ENOSYS {
			return fmt.Errorf("mount_setattr / readonly: %w", err)
		}
		// old kernel without mount_setattr, i.e. on RHEL 8.8
		// Bind mount the directories readonly for both podman and ps.
		psPath, err = remountReadOnly(psPath)
		if err != nil {
			return err
		}
		_, err = remountReadOnly(reexec.Self())
		if err != nil {
			return err
		}
	}

	// extra safety check make sure the ps path is actually read only
	err := unix.Access(psPath, unix.W_OK)
	if err == nil {
		return fmt.Errorf("%q was not mounted read only, this can be dangerous so we will not execute it", psPath)
	}

	pid := os.Args[1]
	// join the pid namespace of pid
	pidFD, err := os.Open(fmt.Sprintf("/proc/%s/ns/pid", pid))
	if err != nil {
		return fmt.Errorf("open pidns: %w", err)
	}
	// Setns only affects children; the fork in C.fork_exec_ps() below enters the pidns.
	if err := unix.Setns(int(pidFD.Fd()), unix.CLONE_NEWPID); err != nil {
		return fmt.Errorf("setns NEWPID: %w", err)
	}
	pidFD.Close()

	// Build the argv for ps on the C side, since the fork/exec happens in C.
	args := []string{psPath}
	args = append(args, os.Args[3:]...)

	C.create_argv(C.int(len(args)))
	for i, arg := range args {
		cArg := C.CString(arg)
		C.set_argv(C.int(i), cArg)
		defer C.free(unsafe.Pointer(cArg))
	}

	// Now try to close open fds except std streams
	// While golang open everything O_CLOEXEC it could still leak fds from
	// the parent, i.e. bash. In this case an attacker might be able to
	// read/write from them.
	// Do this as last step, it has to happen before to fork because the child
	// will be immediately in pid namespace so we cannot close them in the child.
	entries, err := os.ReadDir("/proc/self/fd")
	if err != nil {
		return err
	}
	for _, e := range entries {
		i, err := strconv.Atoi(e.Name())
		// IsFdInherited checks the we got the fd from a parent process and only close them,
		// when we close all that would include the ones from the go runtime which
		// then can panic because of that.
		if err == nil && i > unix.Stderr && rootless.IsFdInherited(i) {
			_ = unix.Close(i)
		}
	}

	// this function will always exit for us
	C.fork_exec_ps()
	return nil
}
159

160
// remountReadOnly remounts the parent directory of the given path read only
161
// return the resolved path or an error. The path can then be used to exec the
162
// binary as we know it is on a read only mount now.
163
func remountReadOnly(path string) (string, error) {
164
	resolvedPath, err := filepath.EvalSymlinks(path)
165
	if err != nil {
166
		return "", fmt.Errorf("resolve symlink for %s: %w", path, err)
167
	}
168
	dir := filepath.Dir(resolvedPath)
169
	// create mount point
170
	if err := unix.Mount(dir, dir, "", unix.MS_BIND, ""); err != nil {
171
		return "", fmt.Errorf("mount %s read only: %w", dir, err)
172
	}
173
	// remount readonly
174
	if err := unix.Mount(dir, dir, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
175
		return "", fmt.Errorf("mount %s read only: %w", dir, err)
176
	}
177
	return resolvedPath, nil
178
}
179

180
// Top gathers statistics about the running processes in a container. It returns a
// []string for output.
//
// It tries three strategies in order:
//  1. psgo in-process (GetContainerPidInformation),
//  2. on psgo.ErrUnknownDescriptor, host ps(1) executed inside the container
//     pid namespace (execPS) — only if the container lacks CAP_SYS_PTRACE,
//  3. ps(1) installed inside the container (execPSinContainer).
func (c *Container) Top(descriptors []string) ([]string, error) {
	// Top relies on reading the container's cgroup/processes, which does not
	// exist for NoCgroups containers.
	if c.config.NoCgroups {
		return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups)
	}

	conStat, err := c.State()
	if err != nil {
		return nil, fmt.Errorf("unable to look up state for %s: %w", c.ID(), err)
	}
	if conStat != define.ContainerStateRunning {
		return nil, errors.New("top can only be used on running containers")
	}

	// Also support comma-separated input.
	psgoDescriptors := []string{}
	for _, d := range descriptors {
		for _, s := range strings.Split(d, ",") {
			if s != "" {
				psgoDescriptors = append(psgoDescriptors, s)
			}
		}
	}

	// If we encountered an ErrUnknownDescriptor error, fallback to executing
	// ps(1). This ensures backwards compatibility to users depending on ps(1)
	// and makes sure we're ~compatible with docker.
	output, psgoErr := c.GetContainerPidInformation(psgoDescriptors)
	if psgoErr == nil {
		return output, nil
	}
	if !errors.Is(psgoErr, psgo.ErrUnknownDescriptor) {
		return nil, psgoErr
	}

	psDescriptors := descriptors
	if len(descriptors) == 1 {
		// Note that the descriptors to ps(1) must be shlexed (see #12452).
		psDescriptors = make([]string, 0, len(descriptors))
		shSplit, err := shlex.Split(descriptors[0])
		if err != nil {
			return nil, fmt.Errorf("parsing ps args: %w", err)
		}
		for _, s := range shSplit {
			if s != "" {
				psDescriptors = append(psDescriptors, s)
			}
		}
	}

	// Only use ps(1) from the host when we know the container was not started with CAP_SYS_PTRACE,
	// with it the container can access /proc/$pid/ files and potentially escape the container fs.
	if c.config.Spec.Process.Capabilities != nil &&
		!slices.Contains(c.config.Spec.Process.Capabilities.Effective, "CAP_SYS_PTRACE") {
		var retry bool
		// retry reports whether falling back to in-container ps(1) makes
		// sense (e.g. host ps missing) vs. a hard error (bad ps args).
		output, retry, err = c.execPS(psDescriptors)
		if err != nil {
			if !retry {
				return nil, err
			}
			logrus.Warnf("Falling back to container ps(1), could not execute ps(1) from the host: %v", err)
			output, err = c.execPSinContainer(psDescriptors)
			if err != nil {
				return nil, fmt.Errorf("executing ps(1) in container: %w", err)
			}
		}
	} else {
		output, err = c.execPSinContainer(psDescriptors)
		if err != nil {
			return nil, fmt.Errorf("executing ps(1) in container: %w", err)
		}
	}

	// Trick: filter the ps command from the output instead of
	// checking/requiring PIDs in the output.
	filtered := []string{}
	cmd := strings.Join(descriptors, " ")
	for _, line := range output {
		if !strings.Contains(line, cmd) {
			filtered = append(filtered, line)
		}
	}

	return filtered, nil
}
266

267
// GetContainerPidInformation returns process-related data of all processes in
268
// the container.  The output data can be controlled via the `descriptors`
269
// argument which expects format descriptors and supports all AIXformat
270
// descriptors of ps (1) plus some additional ones to for instance inspect the
271
// set of effective capabilities.  Each element in the returned string slice
272
// is a tab-separated string.
273
//
274
// For more details, please refer to github.com/containers/psgo.
275
func (c *Container) GetContainerPidInformation(descriptors []string) ([]string, error) {
276
	pid := strconv.Itoa(c.state.PID)
277
	// NOTE: psgo returns a [][]string to give users the ability to apply
278
	//       filters on the data.  We need to change the API here
279
	//       to return a [][]string if we want to make use of
280
	//       filtering.
281
	opts := psgo.JoinNamespaceOpts{FillMappings: rootless.IsRootless()}
282

283
	psgoOutput, err := psgo.JoinNamespaceAndProcessInfoWithOptions(pid, descriptors, &opts)
284
	if err != nil {
285
		return nil, err
286
	}
287
	res := []string{}
288
	for _, out := range psgoOutput {
289
		res = append(res, strings.Join(out, "\t"))
290
	}
291
	return res, nil
292
}
293

294
// execPS executes the host's ps(1) within the container pid namespace via the
// podman-top reexec command. It returns the collected stdout lines, a bool
// reporting whether the caller may retry with in-container ps(1), and an error.
func (c *Container) execPS(psArgs []string) ([]string, bool, error) {
	rPipe, wPipe, err := os.Pipe()
	if err != nil {
		return nil, false, err
	}
	defer rPipe.Close()

	// Collect stdout line by line in the background; the channel is closed
	// when the pipe hits EOF after wPipe is closed below.
	outErrChan := make(chan error)
	stdout := []string{}
	go func() {
		defer close(outErrChan)
		scanner := bufio.NewScanner(rPipe)
		for scanner.Scan() {
			stdout = append(stdout, scanner.Text())
		}
		if err := scanner.Err(); err != nil {
			outErrChan <- err
		}
	}()

	psPath, err := exec.LookPath("ps")
	if err != nil {
		// Close the write end so the reader goroutine terminates.
		wPipe.Close()
		return nil, true, err
	}
	// argv for the reexec: {key} {pid} {psPath} [ps args...] — see podmanTopInner.
	args := append([]string{podmanTopCommand, strconv.Itoa(c.state.PID), psPath}, psArgs...)

	cmd := reexec.Command(args...)
	// New mount namespace so podmanTopInner can remount everything read only
	// without affecting the host.
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Unshareflags: unix.CLONE_NEWNS,
	}
	var errBuf bytes.Buffer
	cmd.Stdout = wPipe
	cmd.Stderr = &errBuf
	// nil means use current env so explicitly unset all, to not leak any sensitive env vars
	cmd.Env = []string{fmt.Sprintf("HOME=%s", os.Getenv("HOME"))}

	retryContainerExec := true
	err = cmd.Run()
	wPipe.Close()
	if err != nil {
		exitError := &exec.ExitError{}
		if errors.As(err, &exitError) {
			// podmanTopExitCode is reserved by podmanTopMain for setup
			// failures; any other code came from ps(1) itself.
			if exitError.ExitCode() != podmanTopExitCode {
				// ps command failed
				err = fmt.Errorf("ps(1) failed with exit code %d: %s", exitError.ExitCode(), errBuf.String())
				// ps command itself failed: likely invalid args, no point in retrying.
				retryContainerExec = false
			} else {
				// podman-top reexec setup fails somewhere
				err = fmt.Errorf("could not execute ps(1) in the container pid namespace: %s", errBuf.String())
			}
		} else {
			err = fmt.Errorf("could not reexec podman-top command: %w", err)
		}
	}

	// Wait for the reader goroutine to finish before returning stdout.
	if err := <-outErrChan; err != nil {
		return nil, retryContainerExec, fmt.Errorf("failed to read ps stdout: %w", err)
	}
	return stdout, retryContainerExec, err
}
357

358
// execPS executes ps(1) with the specified args in the container via exec session.
359
// This should be a bit safer then execPS() but it requires ps(1) to be installed in the container.
360
func (c *Container) execPSinContainer(args []string) ([]string, error) {
361
	rPipe, wPipe, err := os.Pipe()
362
	if err != nil {
363
		return nil, err
364
	}
365
	defer rPipe.Close()
366

367
	var errBuf bytes.Buffer
368
	streams := new(define.AttachStreams)
369
	streams.OutputStream = wPipe
370
	streams.ErrorStream = &errBuf
371
	streams.AttachOutput = true
372
	streams.AttachError = true
373

374
	outErrChan := make(chan error)
375
	stdout := []string{}
376
	go func() {
377
		defer close(outErrChan)
378
		scanner := bufio.NewScanner(rPipe)
379
		for scanner.Scan() {
380
			stdout = append(stdout, scanner.Text())
381
		}
382
		if err := scanner.Err(); err != nil {
383
			outErrChan <- err
384
		}
385
	}()
386

387
	cmd := append([]string{"ps"}, args...)
388
	config := new(ExecConfig)
389
	config.Command = cmd
390
	ec, err := c.Exec(config, streams, nil)
391
	wPipe.Close()
392
	if err != nil {
393
		return nil, err
394
	} else if ec != 0 {
395
		return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, errBuf.String())
396
	}
397

398
	if logrus.GetLevel() >= logrus.DebugLevel {
399
		// If we're running in debug mode or higher, we might want to have a
400
		// look at stderr which includes debug logs from conmon.
401
		logrus.Debugf(errBuf.String())
402
	}
403

404
	if err := <-outErrChan; err != nil {
405
		return nil, fmt.Errorf("failed to read ps stdout: %w", err)
406
	}
407
	return stdout, nil
408
}
409

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.