inspektor-gadget
788 строк · 21.9 Кб
// Copyright 2023 The Inspektor Gadget authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
// Package containerhook detects when a container is created or terminated.
//
// It uses two mechanisms to detect new containers:
//  1. fanotify with FAN_OPEN_EXEC_PERM.
//  2. ebpf on the sys_enter_execve tracepoint to get the execve arguments.
//
// Using fanotify with FAN_OPEN_EXEC_PERM makes it possible to call a callback
// function while the container is being created. The container is paused
// until the callback function returns.
//
// Using ebpf on the sys_enter_execve tracepoint makes it possible to get the
// execve arguments without the need to read /proc/$pid/cmdline or
// /proc/$pid/comm. Reading /proc/$pid/cmdline is not possible using only
// fanotify when the tracer is not in the same pidns as the process being
// traced. This is the case when Inspektor Gadget is started with
// hostPID=false.
//
// https://github.com/inspektor-gadget/inspektor-gadget/blob/main/docs/devel/fanotify-ebpf.png
32package containerhook33
34import (35"encoding/json"36"errors"37"fmt"38"io"39"math"40"os"41"path/filepath"42"strconv"43"strings"44"sync"45"sync/atomic"46"time"47
48"github.com/cilium/ebpf"49"github.com/cilium/ebpf/link"50ocispec "github.com/opencontainers/runtime-spec/specs-go"51"github.com/s3rj1k/go-fanotify/fanotify"52log "github.com/sirupsen/logrus"53"golang.org/x/sys/unix"54
55"github.com/inspektor-gadget/inspektor-gadget/pkg/btfgen"56"github.com/inspektor-gadget/inspektor-gadget/pkg/gadgets"57"github.com/inspektor-gadget/inspektor-gadget/pkg/utils/host"58)
59
60//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $TARGET -cc clang -cflags ${CFLAGS} -no-global-types -type record execruntime ./bpf/execruntime.bpf.c -- -I./bpf/
61
// EventType identifies the kind of container lifecycle change carried by a
// ContainerEvent.
type EventType int

const (
	// EventTypeAddContainer marks an event for a newly detected container.
	EventTypeAddContainer EventType = iota

	// EventTypeRemoveContainer marks an event for a container that is no
	// longer running.
	EventTypeRemoveContainer
)
68
69// ContainerEvent is the notification for container creation or termination
70type ContainerEvent struct {71// Type is whether the container was added or removed72Type EventType
73
74// ContainerID is the container id, typically a 64 hexadecimal string75ContainerID string76
77// ContainerName is the container name, typically two words with an underscore78ContainerName string79
80// ContainerPID is the process id of the container81ContainerPID uint3282
83// Container's configuration is the config.json from the OCI runtime84// spec85ContainerConfig *ocispec.Spec86
87// Bundle is the directory containing the config.json from the OCI88// runtime spec89// See https://github.com/opencontainers/runtime-spec/blob/main/bundle.md90Bundle string91}
92
93type ContainerNotifyFunc func(notif ContainerEvent)94
// watchedContainer is a container whose main process is polled for
// termination.
type watchedContainer struct {
	// id is the container ID.
	id string
	// pid is the process id that is looked up in procfs to decide whether
	// the container is still running.
	pid int
}
99
// futureContainer holds what is known about a container that was announced
// (e.g. on the conmon command line) before the OCI runtime created it.
type futureContainer struct {
	id, name           string // container ID and container name
	bundleDir, pidFile string // bundle directory and container pid file path
}
106
107type ContainerNotifier struct {108runtimeBinaryNotify *fanotify.NotifyFD109callback ContainerNotifyFunc
110
111// containers is the set of containers that are being watched for112// termination. This prevents duplicate calls to113// AddWatchContainerTermination.114//115// Keys: Container ID116containers map[string]*watchedContainer117containersMu sync.Mutex118
119// futureContainers is the set of containers that are detected before120// oci-runtime (runc/crun) creates the container e.g. detected via conmon121//122// Keys: Container ID123futureContainers map[string]*futureContainer124futureMu sync.Mutex125
126objs execruntimeObjects
127links []link.Link128
129// set to true when the notifier is closed is closed130closed atomic.Bool131// this channel is used in watchContainersTermination() to avoid having to wait for the132// ticker to trigger before returning133done chan bool134
135wg sync.WaitGroup136}
137
// runtimePaths lists the locations where a container runtime binary (runc,
// crun) or conmon could be installed. The location differs between Linux
// distributions.
//
// The entries are host paths: install() joins each of them with
// host.HostRoot before use, so they also resolve when this package runs
// inside a container.
var runtimePaths = []string{
	// runc
	"/bin/runc",
	"/usr/bin/runc",
	"/usr/sbin/runc",
	"/usr/local/bin/runc",
	"/usr/local/sbin/runc",
	"/usr/lib/cri-o-runc/sbin/runc",
	"/run/torcx/unpack/docker/bin/runc",
	// crun
	"/usr/bin/crun",
	// conmon
	"/usr/bin/conmon",
}
155
156// initFanotify initializes the fanotify API with the flags we need
157func initFanotify() (*fanotify.NotifyFD, error) {158fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS | unix.FAN_NONBLOCK)159openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC160return fanotify.Initialize(fanotifyFlags, openFlags)161}
162
163// Supported detects if RuncNotifier is supported in the current environment
164func Supported() bool {165notifier, err := NewContainerNotifier(func(notif ContainerEvent) {})166if notifier != nil {167notifier.Close()168}169if err != nil {170log.Warnf("ContainerNotifier: not supported: %s", err)171}172return err == nil173}
174
175// NewContainerNotifier uses fanotify and ebpf to detect when a container is
176// created or terminated, and call the callback on such event.
177//
178// Limitations:
179// - the container runtime must be installed in one of the paths listed by runtimePaths
180func NewContainerNotifier(callback ContainerNotifyFunc) (*ContainerNotifier, error) {181n := &ContainerNotifier{182callback: callback,183containers: make(map[string]*watchedContainer),184futureContainers: make(map[string]*futureContainer),185done: make(chan bool),186}187
188if err := n.install(); err != nil {189n.Close()190return nil, err191}192
193return n, nil194}
195
196func (n *ContainerNotifier) installEbpf(fanotifyFd int) error {197spec, err := loadExecruntime()198if err != nil {199return fmt.Errorf("load ebpf program for container-hook: %w", err)200}201
202fanotifyPrivateData, err := readPrivateDataFromFd(fanotifyFd)203if err != nil {204return fmt.Errorf("readPrivateDataFromFd: %w", err)205}206
207consts := map[string]interface{}{208"tracer_group": fanotifyPrivateData,209}210if err := spec.RewriteConstants(consts); err != nil {211return fmt.Errorf("RewriteConstants: %w", err)212}213
214opts := ebpf.CollectionOptions{215Programs: ebpf.ProgramOptions{216KernelTypes: btfgen.GetBTFSpec(),217},218}219
220if err := spec.LoadAndAssign(&n.objs, &opts); err != nil {221return fmt.Errorf("loading maps and programs: %w", err)222}223
224// Attach ebpf programs225l, err := link.Kprobe("fsnotify_remove_first_event", n.objs.IgFaPickE, nil)226if err != nil {227return fmt.Errorf("attaching kprobe fsnotify_remove_first_event: %w", err)228}229n.links = append(n.links, l)230
231l, err = link.Kretprobe("fsnotify_remove_first_event", n.objs.IgFaPickX, nil)232if err != nil {233return fmt.Errorf("attaching kretprobe fsnotify_remove_first_event: %w", err)234}235n.links = append(n.links, l)236
237l, err = link.Tracepoint("syscalls", "sys_enter_execve", n.objs.IgExecveE, nil)238if err != nil {239return fmt.Errorf("attaching tracepoint: %w", err)240}241n.links = append(n.links, l)242
243l, err = link.Tracepoint("syscalls", "sys_exit_execve", n.objs.IgExecveX, nil)244if err != nil {245return fmt.Errorf("attaching tracepoint: %w", err)246}247n.links = append(n.links, l)248
249return nil250}
251
252func (n *ContainerNotifier) install() error {253// Start fanotify254runtimeBinaryNotify, err := initFanotify()255if err != nil {256return err257}258n.runtimeBinaryNotify = runtimeBinaryNotify259
260// Load, initialize and attach ebpf program261err = n.installEbpf(runtimeBinaryNotify.Fd)262if err != nil {263return err264}265
266// Attach fanotify to various runtime binaries267runtimeFound := false268
269runtimePath := os.Getenv("RUNTIME_PATH")270if runtimePath != "" {271log.Debugf("container-hook: trying runtime from RUNTIME_PATH env variable at %s", runtimePath)272
273if _, err := os.Stat(runtimePath); errors.Is(err, os.ErrNotExist) {274return err275}276
277if err := runtimeBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runtimePath); err != nil {278return fmt.Errorf("fanotify marking of %s: %w", runtimePath, err)279}280runtimeFound = true281} else {282for _, r := range runtimePaths {283runtimePath := filepath.Join(host.HostRoot, r)284
285log.Debugf("container-hook: trying runtime at %s", runtimePath)286
287if _, err := os.Stat(runtimePath); errors.Is(err, os.ErrNotExist) {288log.Debugf("container-hook: runc at %s not found", runtimePath)289continue290}291
292if err := runtimeBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runtimePath); err != nil {293log.Warnf("container-hook: failed to fanotify mark: %s", err)294continue295}296runtimeFound = true297}298}299
300if !runtimeFound {301runtimeBinaryNotify.File.Close()302return fmt.Errorf("no container runtime can be monitored with fanotify. The following paths were tested: %s. You can use the RUNTIME_PATH env variable to specify a custom path. If you are successful doing so, please open a PR to add your custom path to runtimePaths", strings.Join(runtimePaths, ","))303}304
305n.wg.Add(2)306go n.watchContainersTermination()307go n.watchRuntimeBinary()308
309return nil310}
311
312// AddWatchContainerTermination watches a container for termination and
313// generates an event on the notifier. This is automatically called for new
314// containers detected by ContainerNotifier, but it can also be called for
315// containers detected externally such as initial containers.
316func (n *ContainerNotifier) AddWatchContainerTermination(containerID string, containerPID int) error {317n.containersMu.Lock()318defer n.containersMu.Unlock()319
320if _, ok := n.containers[containerID]; ok {321// This container is already being watched for termination322return nil323}324
325n.containers[containerID] = &watchedContainer{326id: containerID,327pid: containerPID,328}329
330return nil331}
332
333// watchContainerTermination waits until the container terminates
334func (n *ContainerNotifier) watchContainersTermination() {335defer n.wg.Done()336
337ticker := time.NewTicker(time.Second)338defer ticker.Stop()339
340for {341select {342case <-n.done:343return344case <-ticker.C:345if n.closed.Load() {346return347}348
349dirEntries, err := os.ReadDir(host.HostProcFs)350if err != nil {351log.Errorf("reading /proc: %s", err)352return353}354pids := make(map[int]bool)355for _, entry := range dirEntries {356pid, err := strconv.Atoi(entry.Name())357if err != nil {358// entry is not a process directory. Ignore.359continue360}361pids[pid] = true362}363
364n.containersMu.Lock()365for _, c := range n.containers {366if pids[c.pid] {367// container still running368continue369}370
371go n.callback(ContainerEvent{372Type: EventTypeRemoveContainer,373ContainerID: c.id,374ContainerPID: uint32(c.pid),375})376
377delete(n.containers, c.id)378}379n.containersMu.Unlock()380}381}382}
383
384func (n *ContainerNotifier) watchPidFileIterate(385pidFileDirNotify *fanotify.NotifyFD,386bundleDir string,387configJSONPath string,388pidFile string,389pidFileDir string,390) (bool, error) {391// Get the next event from fanotify.392// Even though the API allows to pass skipPIDs, we cannot use393// it here because ResponseAllow would not be called.394data, err := pidFileDirNotify.GetEvent()395if err != nil {396return false, fmt.Errorf("%w", err)397}398
399// data can be nil if the event received is from a process in skipPIDs.400// In that case, skip and get the next event.401if data == nil {402return false, nil403}404
405// Don't leak the fd received by GetEvent406defer data.Close()407dataFile := data.File()408defer dataFile.Close()409
410if !data.MatchMask(unix.FAN_ACCESS_PERM) {411// This should not happen: FAN_ACCESS_PERM is the only mask Marked412return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid)413}414
415// This unblocks whoever is accessing the pidfile416defer pidFileDirNotify.ResponseAllow(data)417
418path, err := data.GetPath()419if err != nil {420return false, err421}422path = filepath.Join(host.HostRoot, path)423
424// Consider files identical if they have the same device/inode,425// even if the paths differ due to symlinks (for example,426// the event's path is /run/... but the runc --pid-file argument427// uses /var/run/..., where /var/run is a symlink to /run).428filesAreIdentical, err := checkFilesAreIdentical(path, pidFile)429if err != nil {430return false, err431} else if !filesAreIdentical {432return false, nil433}434
435pidFileContent, err := io.ReadAll(dataFile)436if err != nil {437return false, err438}439if len(pidFileContent) == 0 {440return false, fmt.Errorf("empty pid file")441}442containerPID, err := strconv.Atoi(string(pidFileContent))443if err != nil {444return false, err445}446
447// Unfortunately, Linux 5.4 doesn't respect ignore masks448// See fix in Linux 5.9:449// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e450// Workaround: remove parent mask. We don't need it anymore :)451err = pidFileDirNotify.Mark(unix.FAN_MARK_REMOVE, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir)452if err != nil {453return false, nil454}455
456bundleConfigJSON, err := os.ReadFile(configJSONPath)457if err != nil {458return false, err459}460containerConfig := &ocispec.Spec{}461err = json.Unmarshal(bundleConfigJSON, containerConfig)462if err != nil {463return false, err464}465
466// cri-o appends userdata to bundleDir,467// so we trim it here to get the correct containerID468containerID := filepath.Base(filepath.Clean(strings.TrimSuffix(bundleDir, "userdata")))469
470err = n.AddWatchContainerTermination(containerID, containerPID)471if err != nil {472log.Errorf("container %s with pid %d terminated before we could watch it: %s", containerID, containerPID, err)473return true, nil474}475
476if containerPID > math.MaxUint32 {477log.Errorf("Container PID (%d) exceeds math.MaxUint32 (%d)", containerPID, math.MaxUint32)478return true, nil479}480
481var containerName string482n.futureMu.Lock()483fc, ok := n.futureContainers[containerID]484if ok {485containerName = fc.name486}487delete(n.futureContainers, containerID)488n.futureMu.Unlock()489
490n.callback(ContainerEvent{491Type: EventTypeAddContainer,492ContainerID: containerID,493ContainerPID: uint32(containerPID),494ContainerConfig: containerConfig,495Bundle: bundleDir,496ContainerName: containerName,497})498
499return true, nil500}
501
// checkFilesAreIdentical reports whether path1 and path2 refer to the same
// file, as decided by os.SameFile on the results of os.Stat.
//
// Paths with different base names are reported as different without
// touching the filesystem: since fanotify ignore masks don't work on Linux
// 5.4, a notification for an unrelated file can arrive before the pid file
// is created, and that must not be treated as an error.
// See fix in Linux 5.9:
// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
//
// An error is returned only when the base names match and one of the paths
// cannot be stat'ed.
func checkFilesAreIdentical(path1, path2 string) (bool, error) {
	if filepath.Base(path1) != filepath.Base(path2) {
		return false, nil
	}

	var infos [2]os.FileInfo
	for i, p := range [2]string{path1, path2} {
		info, err := os.Stat(p)
		if err != nil {
			return false, err
		}
		infos[i] = info
	}

	return os.SameFile(infos[0], infos[1]), nil
}
524
525func (n *ContainerNotifier) monitorRuntimeInstance(bundleDir string, pidFile string) error {526fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS)527openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC528
529pidFileDirNotify, err := fanotify.Initialize(fanotifyFlags, openFlags)530if err != nil {531return err532}533
534// The pidfile does not exist yet, so we cannot monitor it directly.535// Instead we monitor its parent directory with FAN_EVENT_ON_CHILD to536// get events on the directory's children.537pidFileDir := filepath.Dir(pidFile)538err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir)539if err != nil {540pidFileDirNotify.File.Close()541return fmt.Errorf("marking %s: %w", pidFileDir, err)542}543
544// watchPidFileIterate() will read config.json and it might be in the545// same directory as the pid file. To avoid getting events unrelated to546// the pidfile, add an ignore mask.547//548// This is best effort because the ignore mask is unfortunately not549// respected until a fix in Linux 5.9:550// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e551configJSONPath := filepath.Join(bundleDir, "config.json")552if _, err := os.Stat(configJSONPath); errors.Is(err, os.ErrNotExist) {553// podman might install config.json in the userdata directory554configJSONPath = filepath.Join(bundleDir, "userdata", "config.json")555if _, err := os.Stat(configJSONPath); errors.Is(err, os.ErrNotExist) {556pidFileDirNotify.File.Close()557return fmt.Errorf("config not found at %s", configJSONPath)558}559}560err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, configJSONPath)561if err != nil {562pidFileDirNotify.File.Close()563return fmt.Errorf("marking %s: %w", configJSONPath, err)564}565
566// similar to config.json, we ignore passwd file if it exists567passwdPath := filepath.Join(bundleDir, "passwd")568if _, err := os.Stat(passwdPath); !errors.Is(err, os.ErrNotExist) {569err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, passwdPath)570if err != nil {571pidFileDirNotify.File.Close()572return fmt.Errorf("marking passwd path: %w", err)573}574}575
576n.wg.Add(1)577go func() {578defer n.wg.Done()579defer pidFileDirNotify.File.Close()580for {581stop, err := n.watchPidFileIterate(pidFileDirNotify, bundleDir, configJSONPath, pidFile, pidFileDir)582if n.closed.Load() {583return584}585if err != nil {586log.Warnf("error watching pid: %v\n", err)587return588}589if stop {590return591}592}593}()594
595return nil596}
597
598func (n *ContainerNotifier) watchRuntimeBinary() {599defer n.wg.Done()600
601for {602stop, err := n.watchRuntimeIterate()603if n.closed.Load() {604n.runtimeBinaryNotify.File.Close()605return606}607if err != nil {608log.Errorf("error watching runtime binary: %v\n", err)609}610if stop {611n.runtimeBinaryNotify.File.Close()612return613}614}615}
616
617func (n *ContainerNotifier) parseConmonCmdline(cmdlineArr []string) {618containerName := ""619containerID := ""620bundleDir := ""621pidFile := ""622
623for i := 0; i < len(cmdlineArr); i++ {624verb := cmdlineArr[i]625arg := ""626if i+1 < len(cmdlineArr) {627arg = cmdlineArr[i+1]628}629switch verb {630case "-n", "--name":631containerName = arg632i++633case "-c", "--cid":634containerID = arg635i++636case "-b", "--bundle":637bundleDir = arg638i++639case "-p", "--container-pidfile":640pidFile = arg641i++642}643}644
645if containerName == "" || containerID == "" || bundleDir == "" || pidFile == "" {646return647}648
649n.futureMu.Lock()650n.futureContainers[containerID] = &futureContainer{651id: containerID,652pidFile: pidFile,653bundleDir: bundleDir,654name: containerName,655}656n.futureMu.Unlock()657}
658
659func (n *ContainerNotifier) parseOCIRuntime(comm string, cmdlineArr []string) {660// Parse oci-runtime (runc/crun) command line661createFound := false662bundleDir := ""663pidFile := ""664
665for i := 0; i < len(cmdlineArr); i++ {666if cmdlineArr[i] == "create" {667createFound = true668continue669}670if cmdlineArr[i] == "--bundle" && i+1 < len(cmdlineArr) {671i++672bundleDir = filepath.Join(host.HostRoot, cmdlineArr[i])673continue674}675if cmdlineArr[i] == "--pid-file" && i+1 < len(cmdlineArr) {676i++677pidFile = filepath.Join(host.HostRoot, cmdlineArr[i])678continue679}680}681
682if createFound && bundleDir != "" && pidFile != "" {683err := n.monitorRuntimeInstance(bundleDir, pidFile)684if err != nil {685log.Errorf("error monitoring runtime instance: %v\n", err)686}687}688}
689
690func (n *ContainerNotifier) watchRuntimeIterate() (bool, error) {691// Get the next event from fanotify.692// Even though the API allows to pass skipPIDs, we cannot use it here693// because ResponseAllow would not be called.694data, err := n.runtimeBinaryNotify.GetEvent()695if err != nil {696return true, err697}698
699// data can be nil if the event received is from a process in skipPIDs.700// In that case, skip and get the next event.701if data == nil {702return false, nil703}704
705// Don't leak the fd received by GetEvent706defer data.Close()707
708if !data.MatchMask(unix.FAN_OPEN_EXEC_PERM) {709// This should not happen: FAN_OPEN_EXEC_PERM is the only mask Marked710return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid)711}712
713// This unblocks the execution714defer n.runtimeBinaryNotify.ResponseAllow(data)715
716// Lookup entry in ebpf map ig_fa_records717var record execruntimeRecord718err = n.objs.IgFaRecords.LookupAndDelete(nil, &record)719if err != nil {720return false, fmt.Errorf("lookup record: %w", err)721}722
723// Skip empty record724// This can happen when the ebpf code didn't find the exec args725if record.Pid == 0 {726log.Debugf("skip event with pid=0")727return false, nil728}729if record.ArgsSize == 0 {730log.Debugf("skip event without args")731return false, nil732}733
734callerComm := strings.TrimRight(string(record.CallerComm[:]), "\x00")735
736cmdlineArr := []string{}737calleeComm := ""738for _, arg := range strings.Split(string(record.Args[0:record.ArgsSize]), "\x00") {739if arg != "" {740cmdlineArr = append(cmdlineArr, arg)741}742}743if len(cmdlineArr) == 0 {744log.Debugf("cannot get cmdline for pid %d", record.Pid)745return false, nil746}747if len(cmdlineArr) > 0 {748calleeComm = filepath.Base(cmdlineArr[0])749}750
751log.Debugf("got event with pid=%d caller=%q callee=%q args=%v",752record.Pid,753callerComm, calleeComm,754cmdlineArr)755
756// runc is executing itself with unix.Exec(), so fanotify receives two757// FAN_OPEN_EXEC_PERM events:758// 1. from containerd-shim (or similar)759// 2. from runc, by this re-execution.760// This filter takes the first one.761
762switch calleeComm {763case "conmon":764// Calling sequence: crio/podman -> conmon -> runc/crun765n.parseConmonCmdline(cmdlineArr)766case "runc", "crun":767n.parseOCIRuntime(calleeComm, cmdlineArr)768default:769return false, nil770}771
772return false, nil773}
774
775func (n *ContainerNotifier) Close() {776n.closed.Store(true)777close(n.done)778if n.runtimeBinaryNotify != nil {779n.runtimeBinaryNotify.File.Close()780}781n.wg.Wait()782
783for _, l := range n.links {784gadgets.CloseLink(l)785}786n.links = nil787n.objs.Close()788}
789