inspektor-gadget
734 строки · 20.9 Кб
1// Copyright 2019-2023 The Inspektor Gadget authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//go:build !withoutebpf
16
17package tracer
18
19import (
20"context"
21"errors"
22"fmt"
23"os"
24"sort"
25"strconv"
26"sync"
27"syscall"
28"time"
29"unsafe"
30
31"github.com/cilium/ebpf"
32"github.com/cilium/ebpf/link"
33"github.com/cilium/ebpf/perf"
34log "github.com/sirupsen/logrus"
35
36containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection"
37gadgetcontext "github.com/inspektor-gadget/inspektor-gadget/pkg/gadget-context"
38"github.com/inspektor-gadget/inspektor-gadget/pkg/gadgets"
39"github.com/inspektor-gadget/inspektor-gadget/pkg/gadgets/traceloop/types"
40eventtypes "github.com/inspektor-gadget/inspektor-gadget/pkg/types"
41"github.com/inspektor-gadget/inspektor-gadget/pkg/utils/syscalls"
42)
43
44//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -type syscall_event_t -type syscall_event_cont_t -target ${TARGET} -cc clang -cflags ${CFLAGS} traceloop ./bpf/traceloop.bpf.c -- -I./bpf/
45
// The constants below mirror definitions of traceloop.h: both sides must be
// kept in sync.
const (
	// Special values of the "length" of a syscall parameter.
	// useNullByteLength means the parameter content is a NUL terminated string.
	// NOTE(review): the three other values are not referenced by the Go code
	// of this file, their exact meaning has to be checked in traceloop.h.
	useNullByteLength        uint64 = 0x0fff_ffff_ffff_ffff
	useRetAsParamLength      uint64 = 0x0fff_ffff_ffff_fffe
	useArgIndexAsParamLength uint64 = 0x0fff_ffff_ffff_fff0
	paramProbeAtExitMask     uint64 = 0xf000_0000_0000_0000

	// Values of the type field of a syscall event: they tell whether the
	// event was recorded when entering or when exiting the syscall.
	syscallEventTypeEnter uint8 = 0
	syscallEventTypeExit  uint8 = 1

	// syscallArgs is the number of arguments copied from an enter event.
	syscallArgs uint8 = 6
)
58
var (
	// syscallsOnce ensures install() gathers the syscall declarations only
	// once for the whole process.
	syscallsOnce sync.Once
	// syscallsDeclarations holds the result of gatherSyscallsDeclarations().
	// It is passed to getSyscallDeclaration() together with a syscall name
	// when Read() builds the parameters of an event.
	syscallsDeclarations map[string]syscallDeclaration
)
63
// containerRingReader is the value stored in Tracer.readers for each attached
// container.
type containerRingReader struct {
	// perfReader reads the perf buffer created for the container by Attach().
	// Delete() sets it to nil once it is closed.
	perfReader *perf.Reader

	// mntnsID is the mount namespace ID given to Attach(); Read() copies it
	// into every event coming from this reader.
	mntnsID uint64
}
68
// Tracer loads the traceloop eBPF programs and reads, container by container,
// the syscall events they record into perf buffers.
type Tracer struct {
	// enricher, when not nil, is called by Read() to enrich each event from
	// its mount namespace ID.
	enricher gadgets.DataEnricherByMntNs

	// innerMapSpec is the spec of the inner map of "map_of_perf_buffers".
	// Attach() copies it to create one perf buffer map per container.
	innerMapSpec *ebpf.MapSpec

	// objs holds the eBPF objects loaded by install().
	objs traceloopObjects

	// enterLink is the attachment to the sys_enter raw tracepoint.
	enterLink link.Link

	// exitLink is the attachment to the sys_exit raw tracepoint.
	exitLink link.Link

	// readers maps a containerID (string) to its *containerRingReader.
	// A sync.Map is used because this map is designed to handle parallel
	// access.
	readers sync.Map

	// gadgetCtx is the gadget context received by Init().
	gadgetCtx gadgets.GadgetContext

	// ctx is created by Init(); the goroutines started by AttachContainer()
	// and Run() wait for it to be done.
	ctx context.Context

	// cancel cancels ctx, it is called by Close().
	cancel context.CancelFunc

	// eventCallback is set by SetEventHandler() and receives each event read
	// by the goroutines started by AttachContainer().
	eventCallback func(event *types.Event)

	// waitGroup tracks the goroutines started by AttachContainer().
	waitGroup sync.WaitGroup

	// syscallFilters contains syscall names. install() translates them to
	// syscall numbers to fill the "syscall_filters" map.
	syscallFilters []string
}
90
// syscallEvent is the Go representation of a raw syscall enter or exit event
// read from a perf buffer by Read().
type syscallEvent struct {
	// bootTimestamp is used to compute the event timestamp when the kernel
	// supports bpf_ktime_get_boot_ns (see timestampFromEvent()).
	bootTimestamp uint64

	// monotonicTimestamp is the key used by Read() to associate an enter
	// event with its exit and continued events. It is also the fallback to
	// compute the event timestamp.
	monotonicTimestamp uint64

	// typ is either syscallEventTypeEnter or syscallEventTypeExit.
	typ uint8

	// contNr is copied from the raw event but not used afterwards.
	// NOTE(review): presumably the number of continued events to expect for
	// this event, to be confirmed in traceloop.h.
	contNr uint8

	// cpu is copied from the raw event and ends up in the CPU field of the
	// published event.
	cpu uint16

	// id is the value given to syscallGetName() to get the syscall name.
	id uint16

	// pid and comm are copied from the raw event. With id, pid is used to
	// match an exit event with its enter event.
	pid  uint32
	comm string

	// args contains the syscallArgs arguments of the syscall, it is only set
	// for enter events.
	args []uint64

	// mountNsID is the mount namespace ID of the reader the event comes from.
	mountNsID uint64

	// retval is the syscall return value, it is only set for exit events.
	retval int
}
104
// syscallEventContinued is the Go representation of a raw "continued" event,
// which carries the content of one parameter of a syscall.
type syscallEventContinued struct {
	// monotonicTimestamp equals the one of the enter event this parameter
	// content belongs to: Read() uses it to associate them.
	monotonicTimestamp uint64

	// index is the position of the parameter among the syscall parameters.
	index uint8

	// param is the parameter content, quoted with strconv.Quote().
	param string
}
110
111func NewTracer(enricher gadgets.DataEnricherByMntNs, filters []string) (*Tracer, error) {
112t := &Tracer{
113enricher: enricher,
114syscallFilters: filters,
115}
116if err := t.install(); err != nil {
117t.close()
118return nil, err
119}
120return t, nil
121}
122
123func (t *Tracer) install() error {
124spec, err := loadTraceloop()
125if err != nil {
126return fmt.Errorf("loading ebpf program: %w", err)
127}
128
129gadgets.FixBpfKtimeGetBootNs(spec.Programs)
130
131syscallsOnce.Do(func() {
132syscallsDeclarations, err = gatherSyscallsDeclarations()
133})
134if err != nil {
135return fmt.Errorf("gathering syscall definitions: %w", err)
136}
137
138// Fill the syscall map with specific syscall signatures.
139syscallsMapSpec := spec.Maps["syscalls"]
140for name, def := range syscallDefs {
141number, ok := syscalls.GetSyscallNumberByName(name)
142if !ok {
143// It's possible that the syscall doesn't exist for this architecture, skip it
144continue
145}
146
147// We need to do so to avoid taking each time the same address.
148def := def
149syscallsMapSpec.Contents = append(syscallsMapSpec.Contents, ebpf.MapKV{
150Key: uint64(number),
151Value: def,
152})
153}
154
155// Fill the syscall filter map with the corresponding syscall numbers.
156syscallFiltersMapSpec := spec.Maps["syscall_filters"]
157for _, name := range t.syscallFilters {
158if name == "" {
159continue
160}
161
162number, ok := syscalls.GetSyscallNumberByName(name)
163if !ok {
164return fmt.Errorf("syscall %q does not exist", name)
165}
166
167syscallFiltersMapSpec.Contents = append(syscallFiltersMapSpec.Contents, ebpf.MapKV{
168Key: uint64(number),
169// We do not care about the value itself but we need to provide one.
170Value: true,
171})
172}
173
174consts := make(map[string]interface{})
175consts["filter_syscall"] = len(syscallFiltersMapSpec.Contents) > 0
176if err := gadgets.LoadeBPFSpec(nil, spec, consts, &t.objs); err != nil {
177return fmt.Errorf("loading ebpf program: %w", err)
178}
179
180t.enterLink, err = link.AttachRawTracepoint(link.RawTracepointOptions{
181Name: "sys_enter",
182Program: t.objs.IgTraceloopE,
183})
184if err != nil {
185return fmt.Errorf("opening enter tracepoint: %w", err)
186}
187
188t.exitLink, err = link.AttachRawTracepoint(link.RawTracepointOptions{
189Name: "sys_exit",
190Program: t.objs.IgTraceloopX,
191})
192if err != nil {
193return fmt.Errorf("opening exit tracepoint: %w", err)
194}
195
196t.innerMapSpec = spec.Maps["map_of_perf_buffers"].InnerMap
197
198return nil
199}
200
// Stop stops the tracer by calling close(), which releases the tracepoint
// links, the container readers and the eBPF objects.
// TODO: Remove after refactoring
func (t *Tracer) Stop() {
	t.close()
}
206
207func (t *Tracer) close() {
208t.enterLink = gadgets.CloseLink(t.enterLink)
209t.exitLink = gadgets.CloseLink(t.exitLink)
210
211t.readers.Range(func(key, _ any) bool {
212t.Delete(key.(string))
213
214return true
215})
216
217t.objs.Close()
218}
219
220func (t *Tracer) Attach(containerID string, mntnsID uint64) error {
221innerBufferSpec := t.innerMapSpec.Copy()
222innerBufferSpec.Name = fmt.Sprintf("perf_buffer_%d", mntnsID)
223
224// 1. Create inner Map as perf buffer.
225innerBuffer, err := ebpf.NewMap(innerBufferSpec)
226if err != nil {
227return fmt.Errorf("creating inner map: %w", err)
228}
229
230// 2. Use this inner Map to create the perf reader.
231perfReader, err := perf.NewReaderWithOptions(innerBuffer, gadgets.PerfBufferPages*os.Getpagesize(), perf.ReaderOptions{Overwritable: true})
232if err != nil {
233innerBuffer.Close()
234
235return fmt.Errorf("creating perf ring buffer: %w", err)
236}
237
238// 3. Add the inner map's file descriptor to outer map.
239err = t.objs.MapOfPerfBuffers.Put(mntnsID, innerBuffer)
240if err != nil {
241innerBuffer.Close()
242perfReader.Close()
243
244return fmt.Errorf("adding perf buffer to map with mntnsID %d: %w", mntnsID, err)
245}
246
247t.readers.Store(containerID, &containerRingReader{
248perfReader: perfReader,
249mntnsID: mntnsID,
250})
251
252return nil
253}
254
255func timestampFromEvent(event *syscallEvent) eventtypes.Time {
256if !gadgets.HasBpfKtimeGetBootNs() {
257// Traceloop works differently than other gadgets: if the
258// kernel does not support bpf_ktime_get_boot_ns, don't
259// generate a timestamp from userspace because traceloop reads
260// events from the ring buffer an arbitrary long time after
261// they are generated, so the timestamp would be meaningless.
262
263// However we need some kind of timestamp for sorting events
264return gadgets.WallTimeFromBootTime(event.monotonicTimestamp)
265}
266return gadgets.WallTimeFromBootTime(event.bootTimestamp)
267}
268
// roundUp returns x rounded up to the next multiple of y.
// It is adapted from the kernel macro round_up and, like it, is only meant to
// be used with y being a power of two:
// https://elixir.bootlin.com/linux/v6.0/source/include/linux/math.h#L25
func roundUp(x, y uintptr) uintptr {
	mask := y - 1

	return ((x - 1) | mask) + 1
}
274
275// The kernel aligns size of perf event with the following snippet:
276// void perf_prepare_sample(...)
277//
278// {
279// //...
280// size = round_up(sum + sizeof(u32), sizeof(u64));
281// raw->size = size - sizeof(u32);
282// frag->pad = raw->size - sum;
283// // ...
284// }
285//
286// (https://elixir.bootlin.com/linux/v6.0/source/kernel/events/core.c#L7353)
287// In the case of our structure of interest (i.e. struct_syscall_event_t and
288// struct_syscall_event_cont_t), their size will be increased by 4, here is
289// an example for struct_syscall_event_t which size is 88:
290// size = round_up(sum + sizeof(u32), sizeof(u64))
291//
292// = round_up(88 + 4, 8)
293// = round_up(92, 8)
294// = 96
295//
296// raw->size = size - sizeof(u32)
297//
298// = 96 - 4
299// = 92
300//
301// So, 4 bytes will be added as padding at the end of the event and the size we
302// will read getting perfEventSample will be 92 instead of 88.
303func alignSize(structSize uintptr) uintptr {
304var ret uintptr
305var foo uint64
306var bar uint32
307
308ret = roundUp(structSize+unsafe.Sizeof(bar), unsafe.Sizeof(foo))
309ret = ret - unsafe.Sizeof(bar)
310
311return ret
312}
313
// retToStr formats a syscall return value.
// A value between -4095 and -1 is taken as a negated error number and is
// rendered as "-1 (<error message>)", any other value is printed as a decimal
// number.
// See man syscalls:
// Note:
// system calls indicate a failure by returning a negative error
// number to the caller on architectures without a separate error
// register/flag, as noted in syscall(2); when this happens, the
// wrapper function negates the returned error number (to make it
// positive), copies it to errno, and returns -1 to the caller of
// the wrapper.
func retToStr(ret int) string {
	const maxErrno = 4095

	if ret < 0 && ret >= -maxErrno {
		errno := syscall.Errno(-ret)

		return fmt.Sprintf("-1 (%s)", errno.Error())
	}

	return fmt.Sprint(ret)
}
330
331func (t *Tracer) Read(containerID string) ([]*types.Event, error) {
332syscallContinuedEventsMap := make(map[uint64][]*syscallEventContinued)
333syscallEnterEventsMap := make(map[uint64][]*syscallEvent)
334syscallExitEventsMap := make(map[uint64][]*syscallEvent)
335events := make([]*types.Event, 0)
336
337r, ok := t.readers.Load(containerID)
338if !ok {
339return nil, fmt.Errorf("no perf reader for %q", containerID)
340}
341
342reader, ok := r.(*containerRingReader)
343if !ok {
344return nil, errors.New("the map should only contain *containerRingReader")
345}
346
347if reader.perfReader == nil {
348log.Infof("reader for %v is nil, it was surely detached", containerID)
349
350return nil, nil
351}
352
353err := reader.perfReader.Pause()
354if err != nil {
355return nil, err
356}
357
358reader.perfReader.SetDeadline(time.Now())
359
360records := make([]*perf.Record, 0)
361for {
362record, err := reader.perfReader.Read()
363if err != nil {
364if errors.Is(err, os.ErrDeadlineExceeded) {
365break
366} else {
367return nil, err
368}
369}
370records = append(records, &record)
371}
372
373err = reader.perfReader.Resume()
374if err != nil {
375return nil, err
376}
377
378for _, record := range records {
379size := len(record.RawSample)
380
381var sysEvent *traceloopSyscallEventT
382var sysEventCont *traceloopSyscallEventContT
383
384switch uintptr(size) {
385case alignSize(unsafe.Sizeof(*sysEvent)):
386sysEvent = (*traceloopSyscallEventT)(unsafe.Pointer(&record.RawSample[0]))
387
388event := &syscallEvent{
389bootTimestamp: sysEvent.BootTimestamp,
390monotonicTimestamp: sysEvent.MonotonicTimestamp,
391typ: sysEvent.Typ,
392contNr: sysEvent.ContNr,
393cpu: sysEvent.Cpu,
394id: sysEvent.Id,
395pid: sysEvent.Pid,
396comm: gadgets.FromCString(sysEvent.Comm[:]),
397mountNsID: reader.mntnsID,
398}
399
400var typeMap *map[uint64][]*syscallEvent
401switch event.typ {
402case syscallEventTypeEnter:
403event.args = make([]uint64, syscallArgs)
404for i := uint8(0); i < syscallArgs; i++ {
405event.args[i] = sysEvent.Args[i]
406}
407
408typeMap = &syscallEnterEventsMap
409case syscallEventTypeExit:
410// In the C structure, args is an array of uint64.
411// But in this particular case, we used it to store a C long, i.e. the
412// syscall return value, so it is safe to cast it to golang int.
413event.retval = int(sysEvent.Args[0])
414
415typeMap = &syscallExitEventsMap
416default:
417// Rather than returning an error, we skip this event.
418log.Debugf("type %d is not a valid type for syscallEvent, received data are: %v", event.typ, record.RawSample)
419
420continue
421}
422
423if _, ok := (*typeMap)[event.monotonicTimestamp]; !ok {
424(*typeMap)[event.monotonicTimestamp] = make([]*syscallEvent, 0)
425}
426
427(*typeMap)[event.monotonicTimestamp] = append((*typeMap)[event.monotonicTimestamp], event)
428case alignSize(unsafe.Sizeof(*sysEventCont)):
429sysEventCont = (*traceloopSyscallEventContT)(unsafe.Pointer(&record.RawSample[0]))
430
431event := &syscallEventContinued{
432monotonicTimestamp: sysEventCont.MonotonicTimestamp,
433index: sysEventCont.Index,
434}
435
436if sysEventCont.Failed != 0 {
437event.param = "(Failed to dereference pointer)"
438} else if sysEventCont.Length == useNullByteLength {
439// 0 byte at [C.PARAM_LENGTH - 1] is enforced in BPF code
440event.param = gadgets.FromCString(sysEventCont.Param[:])
441} else {
442event.param = gadgets.FromCStringN(sysEventCont.Param[:], int(sysEventCont.Length))
443}
444
445// Remove all non unicode character from the string.
446event.param = strconv.Quote(event.param)
447
448_, ok := syscallContinuedEventsMap[event.monotonicTimestamp]
449if !ok {
450// Just create a 0 elements slice for the moment, the ContNr will be
451// checked later.
452syscallContinuedEventsMap[event.monotonicTimestamp] = make([]*syscallEventContinued, 0)
453}
454
455syscallContinuedEventsMap[event.monotonicTimestamp] = append(syscallContinuedEventsMap[event.monotonicTimestamp], event)
456default:
457log.Debugf("size %d does not correspond to any expected element, which are %d and %d; received data are: %v", size, unsafe.Sizeof(sysEvent), unsafe.Sizeof(sysEventCont), record.RawSample)
458}
459}
460
461// Let's try to publish the events we gathered.
462for enterTimestamp, enterTimestampEvents := range syscallEnterEventsMap {
463for _, enterEvent := range enterTimestampEvents {
464event := &types.Event{
465Event: eventtypes.Event{
466Type: eventtypes.NORMAL,
467Timestamp: timestampFromEvent(enterEvent),
468},
469CPU: enterEvent.cpu,
470Pid: enterEvent.pid,
471Comm: enterEvent.comm,
472WithMountNsID: eventtypes.WithMountNsID{MountNsID: enterEvent.mountNsID},
473Syscall: syscallGetName(enterEvent.id),
474}
475
476syscallDeclaration, err := getSyscallDeclaration(syscallsDeclarations, event.Syscall)
477if err != nil {
478return nil, fmt.Errorf("getting syscall definition")
479}
480
481parametersNumber := syscallDeclaration.getParameterCount()
482event.Parameters = make([]types.SyscallParam, parametersNumber)
483log.Debugf("\tevent parametersNumber: %d", parametersNumber)
484
485for i := uint8(0); i < parametersNumber; i++ {
486paramName, err := syscallDeclaration.getParameterName(i)
487if err != nil {
488return nil, fmt.Errorf("getting syscall parameter name: %w", err)
489}
490log.Debugf("\t\tevent paramName: %q", paramName)
491
492isPointer, err := syscallDeclaration.paramIsPointer(i)
493if err != nil {
494return nil, fmt.Errorf("checking syscall parameter is a pointer: %w", err)
495}
496
497format := "%d"
498if isPointer {
499format = "0x%x"
500}
501paramValue := fmt.Sprintf(format, enterEvent.args[i])
502log.Debugf("\t\tevent paramValue: %q", paramValue)
503
504var paramContent *string
505
506for _, syscallContEvent := range syscallContinuedEventsMap[enterTimestamp] {
507if syscallContEvent.index == i {
508paramContent = &syscallContEvent.param
509log.Debugf("\t\t\tevent paramContent: %q", *paramContent)
510
511break
512}
513}
514
515event.Parameters[i] = types.SyscallParam{
516Name: paramName,
517Value: paramValue,
518Content: paramContent,
519}
520}
521
522delete(syscallContinuedEventsMap, enterTimestamp)
523
524// There is no exit event for exit(), exit_group() and rt_sigreturn().
525if event.Syscall == "exit" || event.Syscall == "exit_group" || event.Syscall == "rt_sigreturn" {
526delete(syscallEnterEventsMap, enterTimestamp)
527
528if t.enricher != nil {
529t.enricher.EnrichByMntNs(&event.CommonData, event.MountNsID)
530}
531
532// As there is no exit events for these syscalls,
533// then there is no return value.
534event.Retval = "X"
535
536log.Debugf("%v", event)
537events = append(events, event)
538
539continue
540}
541
542exitTimestampEvents, ok := syscallExitEventsMap[enterTimestamp]
543if !ok {
544log.Debugf("no exit event for timestamp %d", enterTimestamp)
545
546continue
547}
548
549for _, exitEvent := range exitTimestampEvents {
550if enterEvent.id != exitEvent.id || enterEvent.pid != exitEvent.pid {
551continue
552}
553
554event.Retval = retToStr(exitEvent.retval)
555
556delete(syscallEnterEventsMap, enterTimestamp)
557delete(syscallExitEventsMap, enterTimestamp)
558
559if t.enricher != nil {
560t.enricher.EnrichByMntNs(&event.CommonData, event.MountNsID)
561}
562log.Debugf("%v", event)
563events = append(events, event)
564
565break
566}
567}
568}
569
570log.Debugf("len(events): %d; len(syscallEnterEventsMap): %d; len(syscallExitEventsMap): %d; len(syscallContinuedEventsMap): %d\n", len(events), len(syscallEnterEventsMap), len(syscallExitEventsMap), len(syscallContinuedEventsMap))
571
572// It is possible there are some incomplete events for two mains reasons:
573// 1. Traceloop was started in the middle of a syscall, then we will only get
574// the exit but not the enter.
575// 2. The buffer is full and so it only remains some exit events and not the
576// corresponding enter.
577// Rather than dropping these incomplete events, we just add them to the
578// events to be published.
579for _, enterTimestampEvents := range syscallEnterEventsMap {
580for _, enterEvent := range enterTimestampEvents {
581syscallName := syscallGetName(enterEvent.id)
582
583incompleteEnterEvent := &types.Event{
584Event: eventtypes.Event{
585Type: eventtypes.NORMAL,
586Timestamp: timestampFromEvent(enterEvent),
587},
588CPU: enterEvent.cpu,
589Pid: enterEvent.pid,
590Comm: enterEvent.comm,
591WithMountNsID: eventtypes.WithMountNsID{MountNsID: enterEvent.mountNsID},
592Syscall: syscallName,
593Retval: "unfinished",
594}
595
596if t.enricher != nil {
597t.enricher.EnrichByMntNs(&incompleteEnterEvent.CommonData, incompleteEnterEvent.MountNsID)
598}
599
600events = append(events, incompleteEnterEvent)
601
602log.Debugf("enterEvent(%q): %v\n", syscallName, enterEvent)
603}
604}
605
606for _, exitTimestampEvents := range syscallExitEventsMap {
607for _, exitEvent := range exitTimestampEvents {
608syscallName := syscallGetName(exitEvent.id)
609
610incompleteExitEvent := &types.Event{
611Event: eventtypes.Event{
612Type: eventtypes.NORMAL,
613Timestamp: timestampFromEvent(exitEvent),
614},
615CPU: exitEvent.cpu,
616Pid: exitEvent.pid,
617Comm: exitEvent.comm,
618WithMountNsID: eventtypes.WithMountNsID{MountNsID: exitEvent.mountNsID},
619Syscall: syscallName,
620Retval: retToStr(exitEvent.retval),
621}
622
623if t.enricher != nil {
624t.enricher.EnrichByMntNs(&incompleteExitEvent.CommonData, incompleteExitEvent.MountNsID)
625}
626
627events = append(events, incompleteExitEvent)
628
629log.Debugf("exitEvent(%q): %v\n", syscallName, exitEvent)
630}
631}
632
633// Sort all events by ascending timestamp.
634sort.Slice(events, func(i, j int) bool {
635return events[i].Timestamp < events[j].Timestamp
636})
637
638// Remove timestamps if we couldn't get reliable ones
639if !gadgets.HasBpfKtimeGetBootNs() {
640for i := range events {
641events[i].Timestamp = 0
642}
643}
644
645return events, nil
646}
647
648func (t *Tracer) Detach(mntnsID uint64) error {
649err := t.objs.MapOfPerfBuffers.Delete(mntnsID)
650if err != nil {
651return fmt.Errorf("removing perf buffer from map with mntnsID %d", mntnsID)
652}
653
654return nil
655}
656
657func (t *Tracer) Delete(containerID string) error {
658r, ok := t.readers.LoadAndDelete(containerID)
659if !ok {
660return fmt.Errorf("no reader for containerID %s", containerID)
661}
662
663reader := r.(*containerRingReader)
664err := reader.perfReader.Close()
665reader.perfReader = nil
666
667return err
668}
669
670// --- Registry changes
671
// NewInstance returns a new, empty Tracer. Nothing is loaded at this point:
// the eBPF programs are installed when Init() is called on the returned value.
func (g *GadgetDesc) NewInstance() (gadgets.Gadget, error) {
	return &Tracer{}, nil
}
675
676func (t *Tracer) Init(gadgetCtx gadgets.GadgetContext) error {
677t.syscallFilters = gadgetCtx.GadgetParams().Get(ParamSyscallFilters).AsStringSlice()
678
679if err := t.install(); err != nil {
680t.close()
681return fmt.Errorf("installing tracer: %w", err)
682}
683
684// Context must be created before the first call to AttachContainer
685t.gadgetCtx = gadgetCtx
686t.ctx, t.cancel = gadgetcontext.WithTimeoutOrCancel(gadgetCtx.Context(), gadgetCtx.Timeout())
687return nil
688}
689
690func (t *Tracer) SetEventHandler(handler any) {
691nh, ok := handler.(func(ev *types.Event))
692if !ok {
693panic("event handler invalid")
694}
695t.eventCallback = nh
696}
697
698func (t *Tracer) AttachContainer(container *containercollection.Container) error {
699t.waitGroup.Add(1)
700err := t.Attach(container.Runtime.ContainerID, container.Mntns)
701if err != nil {
702t.waitGroup.Done()
703return err
704}
705go func() {
706defer t.waitGroup.Done()
707<-t.ctx.Done()
708evs, err := t.Read(container.Runtime.ContainerID)
709if err != nil {
710t.gadgetCtx.Logger().Debugf("error reading from container %s: %v", container.Runtime.ContainerID, err)
711return
712}
713for _, ev := range evs {
714ev.SetContainerMetadata(&container.K8s.BasicK8sMetadata, &container.Runtime.BasicRuntimeMetadata)
715t.eventCallback(ev)
716}
717}()
718return nil
719}
720
// DetachContainer calls Detach() with the mount namespace ID of container and
// returns its error.
func (t *Tracer) DetachContainer(container *containercollection.Container) error {
	return t.Detach(container.Mntns)
}
724
// Run blocks until the context created by Init() is done, then waits for the
// goroutines started by AttachContainer() to return.
// The gadgetCtx argument is not used and the returned error is always nil.
func (t *Tracer) Run(gadgetCtx gadgets.GadgetContext) error {
	// Events are only read once the context is done, see AttachContainer().
	<-t.ctx.Done()
	t.waitGroup.Wait()
	return nil
}
730
731func (t *Tracer) Close() {
732t.cancel()
733t.close()
734}
735