1
//go:build !remote && linux && cgo
19
"github.com/containers/podman/v5/libpod/define"
20
"github.com/containers/podman/v5/pkg/rootless"
21
"github.com/containers/psgo"
22
"github.com/containers/storage/pkg/reexec"
23
"github.com/google/shlex"
24
"github.com/sirupsen/logrus"
25
"golang.org/x/exp/slices"
26
"golang.org/x/sys/unix"
32
void create_argv(int len);
33
void set_argv(int pos, char *arg);
38
// podmanTopCommand is the reexec key to safely setup the environment for ps to be executed
39
podmanTopCommand = "podman-top"
41
// podmanTopExitCode is a special exit code to signal that podman failed to do something in
42
// reexec command not ps. This is used to give a better error.
43
podmanTopExitCode = 255
47
reexec.Register(podmanTopCommand, podmanTopMain)
50
// podmanTopMain - main function for the reexec
52
if err := podmanTopInner(); err != nil {
53
fmt.Fprint(os.Stderr, err.Error())
54
os.Exit(podmanTopExitCode)
59
// podmanTopInner os.Args = {command name} {pid} {psPath} [args...]
60
// We are reexec'd in a new mountns, then we need to set some security settings in order
61
// to safely execute ps in the container pid namespace. Most notably make sure podman and
62
// ps are read only to prevent a process from overwriting it.
63
func podmanTopInner() error {
65
return fmt.Errorf("internal error, need at least two arguments")
68
// We have to lock the thread as we a) switch namespace below and b) use PR_SET_PDEATHSIG
69
// Also do not unlock, as this thread should not be reused by Go; we exit anyway at the end.
70
runtime.LockOSThread()
72
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
73
return fmt.Errorf("PR_SET_PDEATHSIG: %w", err)
75
if err := unix.Prctl(unix.PR_SET_DUMPABLE, 0, 0, 0, 0); err != nil {
76
return fmt.Errorf("PR_SET_DUMPABLE: %w", err)
79
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
80
return fmt.Errorf("PR_SET_NO_NEW_PRIVS: %w", err)
83
if err := unix.Mount("none", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil {
84
return fmt.Errorf("make / mount private: %w", err)
89
// try to mount everything read only
90
if err := unix.MountSetattr(0, "/", unix.AT_RECURSIVE, &unix.MountAttr{
91
Attr_set: unix.MOUNT_ATTR_RDONLY,
93
if err != unix.ENOSYS {
94
return fmt.Errorf("mount_setattr / readonly: %w", err)
96
// old kernel without mount_setattr, e.g. on RHEL 8.8
97
// Bind mount the directories readonly for both podman and ps.
98
psPath, err = remountReadOnly(psPath)
102
_, err = remountReadOnly(reexec.Self())
108
// extra safety check make sure the ps path is actually read only
109
err := unix.Access(psPath, unix.W_OK)
111
return fmt.Errorf("%q was not mounted read only, this can be dangerous so we will not execute it", psPath)
115
// join the pid namespace of pid
116
pidFD, err := os.Open(fmt.Sprintf("/proc/%s/ns/pid", pid))
118
return fmt.Errorf("open pidns: %w", err)
120
if err := unix.Setns(int(pidFD.Fd()), unix.CLONE_NEWPID); err != nil {
121
return fmt.Errorf("setns NEWPID: %w", err)
125
args := []string{psPath}
126
args = append(args, os.Args[3:]...)
128
C.create_argv(C.int(len(args)))
129
for i, arg := range args {
130
cArg := C.CString(arg)
131
C.set_argv(C.int(i), cArg)
132
defer C.free(unsafe.Pointer(cArg))
135
// Now try to close open fds except std streams
136
// While Go opens everything with O_CLOEXEC, it could still leak fds from
137
// the parent, e.g. bash. In this case an attacker might be able to
138
// read/write from them.
139
// Do this as the last step; it has to happen before we fork because the child
140
// will be immediately in pid namespace so we cannot close them in the child.
141
entries, err := os.ReadDir("/proc/self/fd")
145
for _, e := range entries {
146
i, err := strconv.Atoi(e.Name())
147
// IsFdInherited checks that we got the fd from a parent process, and we only close those.
148
// Closing all fds would include the ones from the Go runtime, which
149
// can then panic because of that.
150
if err == nil && i > unix.Stderr && rootless.IsFdInherited(i) {
155
// this function will always exit for us
160
// remountReadOnly remounts the parent directory of the given path read only
161
// and returns the resolved path or an error. The path can then be used to exec the
162
// binary as we know it is on a read only mount now.
163
func remountReadOnly(path string) (string, error) {
164
resolvedPath, err := filepath.EvalSymlinks(path)
166
return "", fmt.Errorf("resolve symlink for %s: %w", path, err)
168
dir := filepath.Dir(resolvedPath)
169
// create mount point
170
if err := unix.Mount(dir, dir, "", unix.MS_BIND, ""); err != nil {
171
return "", fmt.Errorf("mount %s read only: %w", dir, err)
174
if err := unix.Mount(dir, dir, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
175
return "", fmt.Errorf("mount %s read only: %w", dir, err)
177
return resolvedPath, nil
180
// Top gathers statistics about the running processes in a container. It returns a
181
// []string for output
182
func (c *Container) Top(descriptors []string) ([]string, error) {
183
if c.config.NoCgroups {
184
return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups)
187
conStat, err := c.State()
189
return nil, fmt.Errorf("unable to look up state for %s: %w", c.ID(), err)
191
if conStat != define.ContainerStateRunning {
192
return nil, errors.New("top can only be used on running containers")
195
// Also support comma-separated input.
196
psgoDescriptors := []string{}
197
for _, d := range descriptors {
198
for _, s := range strings.Split(d, ",") {
200
psgoDescriptors = append(psgoDescriptors, s)
205
// If we encountered an ErrUnknownDescriptor error, fall back to executing
206
// ps(1). This ensures backwards compatibility to users depending on ps(1)
207
// and makes sure we're ~compatible with docker.
208
output, psgoErr := c.GetContainerPidInformation(psgoDescriptors)
212
if !errors.Is(psgoErr, psgo.ErrUnknownDescriptor) {
216
psDescriptors := descriptors
217
if len(descriptors) == 1 {
218
// Note that the descriptors to ps(1) must be shlexed (see #12452).
219
psDescriptors = make([]string, 0, len(descriptors))
220
shSplit, err := shlex.Split(descriptors[0])
222
return nil, fmt.Errorf("parsing ps args: %w", err)
224
for _, s := range shSplit {
226
psDescriptors = append(psDescriptors, s)
231
// Only use ps(1) from the host when we know the container was not started with CAP_SYS_PTRACE,
232
// with it the container can access /proc/$pid/ files and potentially escape the container fs.
233
if c.config.Spec.Process.Capabilities != nil &&
234
!slices.Contains(c.config.Spec.Process.Capabilities.Effective, "CAP_SYS_PTRACE") {
236
output, retry, err = c.execPS(psDescriptors)
241
logrus.Warnf("Falling back to container ps(1), could not execute ps(1) from the host: %v", err)
242
output, err = c.execPSinContainer(psDescriptors)
244
return nil, fmt.Errorf("executing ps(1) in container: %w", err)
248
output, err = c.execPSinContainer(psDescriptors)
250
return nil, fmt.Errorf("executing ps(1) in container: %w", err)
254
// Trick: filter the ps command from the output instead of
255
// checking/requiring PIDs in the output.
256
filtered := []string{}
257
cmd := strings.Join(descriptors, " ")
258
for _, line := range output {
259
if !strings.Contains(line, cmd) {
260
filtered = append(filtered, line)
267
// GetContainerPidInformation returns process-related data of all processes in
268
// the container. The output data can be controlled via the `descriptors`
269
// argument which expects format descriptors and supports all AIXformat
270
// descriptors of ps (1) plus some additional ones to, for instance, inspect the
271
// set of effective capabilities. Each element in the returned string slice
272
// is a tab-separated string.
274
// For more details, please refer to github.com/containers/psgo.
275
func (c *Container) GetContainerPidInformation(descriptors []string) ([]string, error) {
276
pid := strconv.Itoa(c.state.PID)
277
// NOTE: psgo returns a [][]string to give users the ability to apply
278
// filters on the data. We need to change the API here
279
// to return a [][]string if we want to make use of
281
opts := psgo.JoinNamespaceOpts{FillMappings: rootless.IsRootless()}
283
psgoOutput, err := psgo.JoinNamespaceAndProcessInfoWithOptions(pid, descriptors, &opts)
288
for _, out := range psgoOutput {
289
res = append(res, strings.Join(out, "\t"))
294
// execute ps(1) from the host within the container pid namespace
295
func (c *Container) execPS(psArgs []string) ([]string, bool, error) {
296
rPipe, wPipe, err := os.Pipe()
298
return nil, false, err
302
outErrChan := make(chan error)
305
defer close(outErrChan)
306
scanner := bufio.NewScanner(rPipe)
308
stdout = append(stdout, scanner.Text())
310
if err := scanner.Err(); err != nil {
315
psPath, err := exec.LookPath("ps")
318
return nil, true, err
320
args := append([]string{podmanTopCommand, strconv.Itoa(c.state.PID), psPath}, psArgs...)
322
cmd := reexec.Command(args...)
323
cmd.SysProcAttr = &syscall.SysProcAttr{
324
Unshareflags: unix.CLONE_NEWNS,
326
var errBuf bytes.Buffer
329
// nil means use current env so explicitly unset all, to not leak any sensitive env vars
330
cmd.Env = []string{fmt.Sprintf("HOME=%s", os.Getenv("HOME"))}
332
retryContainerExec := true
336
exitError := &exec.ExitError{}
337
if errors.As(err, &exitError) {
338
if exitError.ExitCode() != podmanTopExitCode {
340
err = fmt.Errorf("ps(1) failed with exit code %d: %s", exitError.ExitCode(), errBuf.String())
341
// ps command itself failed: likely invalid args, no point in retrying.
342
retryContainerExec = false
344
// podman-top reexec setup fails somewhere
345
err = fmt.Errorf("could not execute ps(1) in the container pid namespace: %s", errBuf.String())
348
err = fmt.Errorf("could not reexec podman-top command: %w", err)
352
if err := <-outErrChan; err != nil {
353
return nil, retryContainerExec, fmt.Errorf("failed to read ps stdout: %w", err)
355
return stdout, retryContainerExec, err
358
// execPSinContainer executes ps(1) with the specified args in the container via exec session.
359
// This should be a bit safer than execPS() but it requires ps(1) to be installed in the container.
360
func (c *Container) execPSinContainer(args []string) ([]string, error) {
361
rPipe, wPipe, err := os.Pipe()
367
var errBuf bytes.Buffer
368
streams := new(define.AttachStreams)
369
streams.OutputStream = wPipe
370
streams.ErrorStream = &errBuf
371
streams.AttachOutput = true
372
streams.AttachError = true
374
outErrChan := make(chan error)
377
defer close(outErrChan)
378
scanner := bufio.NewScanner(rPipe)
380
stdout = append(stdout, scanner.Text())
382
if err := scanner.Err(); err != nil {
387
cmd := append([]string{"ps"}, args...)
388
config := new(ExecConfig)
390
ec, err := c.Exec(config, streams, nil)
395
return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, errBuf.String())
398
if logrus.GetLevel() >= logrus.DebugLevel {
399
// If we're running in debug mode or higher, we might want to have a
400
// look at stderr which includes debug logs from conmon.
401
logrus.Debugf(errBuf.String())
404
if err := <-outErrChan; err != nil {
405
return nil, fmt.Errorf("failed to read ps stdout: %w", err)