inspektor-gadget

Форк
0
531 строка · 14.8 Кб
1
// SPDX-License-Identifier: GPL-2.0
2
#include <vmlinux.h>
3
#include <bpf/bpf_helpers.h>
4
#include <bpf/bpf_core_read.h>
5
#include <bpf/bpf_tracing.h>
6
#include <gadget/mntns_filter.h>
7
#include "traceloop.h"
8

9
/*
10
 * Taken from:
11
 * https://github.com/seccomp/libseccomp/blob/afbde6ddaec7c58c3b281d43b0b287269ffca9bd/src/syscalls.csv
12
 */
13
#if defined(__TARGET_ARCH_arm64)
14
#define __NR_rt_sigreturn 139
15
#define __NR_exit_group 94
16
#define __NR_exit 93
17
#elif defined(__TARGET_ARCH_x86)
18
#define __NR_rt_sigreturn 15
19
#define __NR_exit_group 231
20
#define __NR_exit 60
21
#else
22
#error "Traceloop is not supported on your architecture."
23
#endif
24

25
/* Compile with -DSHOW_DEBUG to print debug messages. */
26
#if defined(SHOW_DEBUG)
27
#define bpf_debug_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
28
#else /* !defined(SHOW_DEBUG) */
29
#define bpf_debug_printk(fmt, ...)
30
#endif /* !defined(SHOW_DEBUG) */
31

32
/* Compile with -DSHOW_ERROR to print error messages. */
33
#if defined(SHOW_ERROR)
34
#define bpf_error_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
35
#else /* !defined(SHOW_ERROR) */
36
#define bpf_error_printk(fmt, ...)
37
#endif /* !defined(SHOW_ERROR) */
38

39
/* When true, only syscall numbers present in the syscall_filters map are
 * traced; when false, every syscall is traced. const volatile: intended to
 * be set by the loader before program load (rodata constant). */
const volatile bool filter_syscall = false;

/* Never referenced at runtime: these force BTF for the event structs into
 * the object file so userspace can decode the perf events. */
const struct syscall_event_t *unused_event __attribute__((unused));
const struct syscall_event_cont_t *unused_event_cont __attribute__((unused));

/*
 * We need this to avoid hitting the 512 bytes stack limit.
 * Indeed, pt_regs contains several u64 fields, so it is quite big.
 * (Used as the zeroed template value inserted into regs_map.)
 */
static const struct pt_regs empty;
/* Zero-initialized fallback for syscalls with no entry in the 'syscalls'
 * map: all args_len are 0, so no argument contents get dumped. */
static struct syscall_def_t default_definition;
50

51
/*
 * Outer hash keyed by mount namespace inode number; each value is the
 * per-container PERF_EVENT_ARRAY into which that container's syscall
 * events are emitted. Containers without an entry are not traced.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	/*
	 * We will use mount namespace ID to get the perf buffer corresponding
	 * to this container.
	 */
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(u32));
	__uint(max_entries, 1024);
	__array(
		values, struct {
			__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
			__uint(key_size, sizeof(u32));
			__uint(value_size, sizeof(u32));
		});
} map_of_perf_buffers SEC(".maps");
67

68
/*
 * Maps a syscall number to its syscall_def_t, which tells the handlers how
 * to dump each argument (length source, probe-at-exit flag, ...).
 * Presumably populated by userspace; not visible in this file.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct syscall_def_t));
	/*
	 * We have around 300 syscalls, let's use the immediate greater power of
	 * 2.
	 */
	__uint(max_entries, 512);
} syscalls SEC(".maps");
78

79
/*
 * Set of syscall numbers to trace when filter_syscall is true. Used purely
 * as a set: only key presence matters (see should_filter_out_syscall()).
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	/*
	 * We do not care about the value here, so let's use a bool to consume one
	 * byte per value.
	 */
	__uint(value_size, sizeof(bool));
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__uint(max_entries, SYSCALL_FILTERS);
} syscall_filters SEC(".maps");
90

91
/*
 * This key/value store maps thread PIDs to syscall arg arrays
 * that were remembered at sys_enter so that sys_exit can probe buffer
 * contents and generate syscall events showing the result content.
 * The key is the full bpf_get_current_pid_tgid() value (tgid << 32 | tid),
 * so entries are per-thread.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct remembered_args));
	__uint(max_entries, 1024);
} probe_at_sys_exit SEC(".maps");
102

103
/*
 * Per-thread scratch slot holding a copy of the tracepoint's pt_regs.
 * Staging it in map memory keeps the large struct off the 512-byte BPF
 * stack. Keyed by bpf_get_current_pid_tgid(); both handlers delete their
 * entry before returning.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(u64));
	__uint(value_size, sizeof(struct pt_regs));
	__uint(max_entries, 1024);
} regs_map SEC(".maps");
109

110
static __always_inline int skip_exit_probe(int nr)
111
{
112
	return !!(nr == __NR_exit || nr == __NR_exit_group ||
113
		  nr == __NR_rt_sigreturn);
114
}
115

116
/*
117
 * Highly inspired from ksnoop.bpf.c:
118
 * https://github.com/iovisor/bcc/blob/f90126bb3770ea1bdd915ff3b47e451c6dde5c40/libbpf-tools/ksnoop.bpf.c#L280
119
 */
120
static __always_inline u64 get_arg(struct pt_regs *regs, int i)
121
{
122
	switch (i) {
123
	case 1:
124
		return PT_REGS_PARM1_CORE_SYSCALL(regs);
125
	case 2:
126
		return PT_REGS_PARM2_CORE_SYSCALL(regs);
127
	case 3:
128
		return PT_REGS_PARM3_CORE_SYSCALL(regs);
129
	case 4:
130
		return PT_REGS_PARM4_CORE_SYSCALL(regs);
131
	case 5:
132
		return PT_REGS_PARM5_CORE_SYSCALL(regs);
133
	case 6:
134
		return PT_REGS_PARM6_CORE_SYSCALL(regs);
135
	default:
136
		bpf_error_printk(
137
			"There is no PT_REGS_PARM%d_SYSCALL macro, check the argument!\n",
138
			i);
139
		return 0;
140
	}
141
}
142

143
/*
 * Decides whether this syscall should be dropped: filtering must be
 * enabled AND the syscall number must be absent from the syscall_filters
 * set. With filtering disabled, nothing is filtered out.
 */
static __always_inline bool should_filter_out_syscall(u64 syscall_nr)
{
	if (!filter_syscall)
		return false;

	return bpf_map_lookup_elem(&syscall_filters, &syscall_nr) == NULL;
}
148

149
/*
 * sys_enter is defined as:
 * TP_PROTO(struct pt_regs *regs, long id)
 * (https://elixir.bootlin.com/linux/v5.19/source/include/trace/events/syscalls.h#L20)
 * So, ctx->args[0] contains a struct pt_regs and ctx->args[1] the syscall ID.
 *
 * This handler emits one SYSCALL_EVENT_TYPE_ENTER event, plus one
 * continuation (syscall_event_cont_t) event per argument that can already
 * be read at enter time, into the perf buffer of the current mount
 * namespace. It also remembers the raw argument values in
 * probe_at_sys_exit so ig_traceloop_x can dump output buffers.
 */
SEC("raw_tracepoint/sys_enter")
int ig_traceloop_e(struct bpf_raw_tracepoint_args *ctx)
{
	struct remembered_args remembered = {};
	u64 pid = bpf_get_current_pid_tgid();
	struct syscall_def_t *syscall_def;
	/*
	 * Initialize struct to empty to be sure all fields (even padding) are zeroed:
	 * https://github.com/iovisor/bcc/issues/2623#issuecomment-560214481
	 */
	struct syscall_event_t sc = {};
	struct task_struct *task;
	u64 nr = ctx->args[1];
	struct pt_regs *args;
	void *perf_buffer;
	u64 mntns_id;
	int ret;
	int i;

	if (should_filter_out_syscall(nr))
		return 0;

	/* The boot time timestamp is used to give the timestamp to users. It
	 * is converted to the wall-clock time in userspace. It only works
	 * from Linux 5.7. On older kernels, the BPF bytecode for
	 * bpf_ktime_get_boot_ns is automatically removed by the BPF loader,
	 * see FixBpfKtimeGetBootNs. In this way, this BPF program can still be
	 * loaded on older kernels. */
	u64 boot_ts = bpf_ktime_get_boot_ns();

	/* The monotonic timestamp is used by traceloop to match the sys_enter
	 * event with the cont and sys_exit events. This is an internal
	 * implementation detail not exposed to the user. */
	u64 monotonic_ts = bpf_ktime_get_ns();

	sc.boot_timestamp = boot_ts;
	sc.monotonic_timestamp = monotonic_ts;
	sc.cont_nr = 0;
	sc.cpu = bpf_get_smp_processor_id();
	sc.pid = pid >> 32;
	sc.typ = SYSCALL_EVENT_TYPE_ENTER;
	sc.id = nr;

	remembered.monotonic_timestamp = monotonic_ts;
	remembered.nr = nr;

	syscall_def = bpf_map_lookup_elem(&syscalls, &nr);
	/*
	 * syscalls map contains definition for specific syscall like read or
	 * write.
	 * All others syscalls, like nanosleep, are not in this map because
	 * their signature is not specific, in this case, we use the default
	 * definition.
	 */
	if (syscall_def == NULL)
		syscall_def = &default_definition;

	task = (struct task_struct *)bpf_get_current_task();
	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);

	/* No perf buffer registered for this mount namespace: container is
	 * not being traced, nothing to do. */
	perf_buffer = bpf_map_lookup_elem(&map_of_perf_buffers, &mntns_id);
	if (!perf_buffer)
		return 0;

	bpf_get_current_comm(sc.comm, sizeof(sc.comm));

	/* Insert a zeroed pt_regs then look it up: this yields map-backed
	 * storage for the registers, keeping the big struct off the 512-byte
	 * BPF stack (see the 'empty' template above). */
	ret = bpf_map_update_elem(&regs_map, &pid, &empty, BPF_NOEXIST);
	if (ret) {
		bpf_error_printk(
			"enter: there should not be any pt_regs for key %lu: %d\n",
			pid, ret);

		return 0;
	}

	args = bpf_map_lookup_elem(&regs_map, &pid);
	if (!args) {
		bpf_error_printk(
			"enter: there should be a pt_regs for key %lu\n", pid);

		goto end;
	}

	bpf_probe_read(args, sizeof(*args), (void *)ctx->args[0]);

	/* Record the raw argument values and count how many will get a
	 * continuation event (cont_nr lets userspace know what to expect). */
	for (i = 0; i < SYSCALL_ARGS; i++) {
		/* + 1 because PT_REGS_PARM begins from 1. */
		u64 arg = get_arg(args, i + 1);
		sc.args[i] = arg;
		remembered.args[i] = arg;
		if (syscall_def->args_len[i])
			sc.cont_nr++;
	}

	bpf_debug_printk(
		"Perf event output: sc.id: %d; sc.comm: %s; sizeof(sc): %d\n",
		sc.id, sc.comm, sizeof(sc));
	ret = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU, &sc,
				    sizeof(sc));
	if (ret != 0) {
		bpf_error_printk("Problem outputting perf event: %d", ret);
	}

	// Avoid using probe_at_sys_exit for exit() and exit_group() because sys_exit
	// would not be called and the map would not be cleaned up and would get full.
	// Note that a process can still get killed in the middle, so we would need
	// a userspace cleaner for this case (TODO).
	if (!skip_exit_probe(nr))
		bpf_map_update_elem(&probe_at_sys_exit, &pid, &remembered,
				    BPF_ANY);

// We need to unroll this loop to make this work on kernels 5.4.0-x on ubuntu, see
// https://github.com/inspektor-gadget/inspektor-gadget/issues/1465 for more details.
#pragma unroll
	for (i = 0; i < SYSCALL_ARGS; i++) {
		__u64 arg_len = syscall_def->args_len[i];

		/* Skip arguments with nothing to dump, and those that must be
		 * read at sys_exit (output buffers, ret-sized params). */
		if (!arg_len || (arg_len & PARAM_PROBE_AT_EXIT_MASK) ||
		    arg_len == USE_RET_AS_PARAM_LENGTH)
			continue;

		bool null_terminated = false;
		struct syscall_event_cont_t sc_cont = {};

		sc_cont.monotonic_timestamp = monotonic_ts;
		sc_cont.index = i;
		sc_cont.failed = false;

		if (arg_len == USE_NULL_BYTE_LENGTH) {
			null_terminated = true;
			arg_len = 0;
		} else if (arg_len >= USE_ARG_INDEX_AS_PARAM_LENGTH) {
			/* The length of this argument is carried by another
			 * argument, whose index is encoded in the low bits. */
			__u64 idx = arg_len &
				    USE_ARG_INDEX_AS_PARAM_LENGTH_MASK;

			/*
			 * Access args via the previously saved map entry instead of
			 * the ctx pointer or 'remembered' struct to avoid this verifier
			 * issue (which does not occur in sys_exit for the same code):
			 * "variable ctx access var_off=(0x0; 0x38) disallowed"
			 */
			struct remembered_args *remembered_ctx_workaround;
			if (idx < SYSCALL_ARGS) {
				remembered_ctx_workaround = bpf_map_lookup_elem(
					&probe_at_sys_exit, &pid);
				if (remembered_ctx_workaround)
					arg_len = remembered_ctx_workaround
							  ->args[idx];
				else
					arg_len = 0;
			} else {
				arg_len = PARAM_LEN;
			}
		}

		if (arg_len > sizeof(sc_cont.param))
			arg_len = sizeof(sc_cont.param);

		if (null_terminated)
			sc_cont.length = USE_NULL_BYTE_LENGTH;
		else
			sc_cont.length = arg_len;

		/* + 1 because PT_REGS_PARM begins from 1. */
		u64 arg = get_arg(args, i + 1);

		if (!arg_len &&
		    null_terminated /* NULL terminated argument like string */
		    && bpf_probe_read_user_str(sc_cont.param, PARAM_LEN,
					       (void *)(arg)) < 0)
			sc_cont.failed = true;
		else if (sizeof(u8) <= arg_len &&
			 arg_len <=
				 sizeof(u64) /* Conventional arguments like type (char, int, etc.) */
			 && bpf_probe_read_user(sc_cont.param, arg_len,
						(void *)(arg)))
			sc_cont.failed = true;
		else if (bpf_probe_read_user(
				 sc_cont.param, PARAM_LEN,
				 (void *)(arg))) /* TODO Struct arguments? */
			sc_cont.failed = true;

		bpf_debug_printk(
			"Perf event output: sc_cont.index: %d; sizeof(sc_cont): %d\n",
			sc_cont.index, sizeof(sc_cont));
		ret = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU,
					    &sc_cont, sizeof(sc_cont));
		if (ret != 0) {
			bpf_error_printk(
				"Problem outputting continued perf event: %d",
				ret);
		}
	}

end:
	/* Always release the per-thread pt_regs scratch slot. */
	bpf_map_delete_elem(&regs_map, &pid);

	return 0;
}
354

355
/*
356
 * syscall_get_nr() is defined for each architecture in the Linux kernel.
357
 * As we cannot use trace_event_raw_sys_exit, we need to get the current syscall
358
 * number from the register.
359
 * So, this function should be expanded with the code of the architecture we
360
 * support.
361
 */
362
static __always_inline int syscall_get_nr(struct pt_regs *regs)
363
{
364
#if defined(__TARGET_ARCH_arm64)
365
	return regs->syscallno;
366
#elif defined(__TARGET_ARCH_x86)
367
	return regs->orig_ax;
368
#else
369
#error "Traceloop is not supported on your architecture."
370
#endif
371
}
372

373
/*
374
 * sys_exit is defined as:
375
 * TP_PROTO(struct pt_regs *regs, long ret),
376
 * (https://elixir.bootlin.com/linux/v5.19/source/include/trace/events/syscalls.h#L46)
377
 * So, ctx->args[0] contains a struct pt_regs and ctx->args[1] the syscall
378
 * return value.
379
 */
380
SEC("raw_tracepoint/sys_exit")
381
int ig_traceloop_x(struct bpf_raw_tracepoint_args *ctx)
382
{
383
	u64 pid = bpf_get_current_pid_tgid();
384
	struct remembered_args *remembered;
385
	struct syscall_def_t *syscall_def;
386
	struct task_struct *task;
387
	long ret = ctx->args[1];
388
	struct pt_regs *args;
389
	void *perf_buffer;
390
	u64 mntns_id;
391
	int i, r;
392
	u64 nr;
393

394
	r = bpf_map_update_elem(&regs_map, &pid, &empty, BPF_NOEXIST);
395
	if (r) {
396
		bpf_error_printk(
397
			"exit: there should not be any pt_regs for key %lu: %d\n",
398
			pid, r);
399

400
		return 0;
401
	}
402

403
	args = bpf_map_lookup_elem(&regs_map, &pid);
404
	if (!args) {
405
		bpf_error_printk(
406
			"exit: there should be a pt_regs for key %lu\n", pid);
407

408
		goto end;
409
	}
410

411
	bpf_probe_read(args, sizeof(*args), (void *)ctx->args[0]);
412
	nr = syscall_get_nr(args);
413
	/* TODO Why this can occur? */
414
	if (nr == -1)
415
		goto end;
416

417
	struct syscall_event_t sc = {
418
		.boot_timestamp = bpf_ktime_get_boot_ns(),
419
		.cpu = bpf_get_smp_processor_id(),
420
		.pid = pid >> 32,
421
		.typ = SYSCALL_EVENT_TYPE_EXIT,
422
		.id = nr,
423
	};
424
	sc.args[0] = ret;
425

426
	syscall_def = bpf_map_lookup_elem(&syscalls, &nr);
427
	if (syscall_def == NULL)
428
		syscall_def = &default_definition;
429

430
	task = (struct task_struct *)bpf_get_current_task();
431
	mntns_id = (u64)BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);
432

433
	perf_buffer = bpf_map_lookup_elem(&map_of_perf_buffers, &mntns_id);
434
	if (!perf_buffer)
435
		goto end;
436

437
	remembered = bpf_map_lookup_elem(&probe_at_sys_exit, &pid);
438
	if (!remembered)
439
		goto end;
440

441
	/*
442
	 * This ensures all events (enter, exit and cont) related to a given
443
	 * syscall have the same timestamp.
444
	 */
445
	sc.monotonic_timestamp = remembered->monotonic_timestamp;
446

447
	for (i = 0; i < SYSCALL_ARGS; i++) {
448
		__u64 arg_len = syscall_def->args_len[i];
449

450
		if (!arg_len || !(arg_len & PARAM_PROBE_AT_EXIT_MASK))
451
			goto end_loop;
452

453
		bool null_terminated = false;
454
		struct syscall_event_cont_t sc_cont = {
455
			.monotonic_timestamp = remembered->monotonic_timestamp,
456
			.index = i,
457
			.failed = false,
458
		};
459

460
		arg_len &= ~PARAM_PROBE_AT_EXIT_MASK;
461

462
		if (arg_len == USE_RET_AS_PARAM_LENGTH) {
463
			if ((signed long)ret < 0)
464
				arg_len = 0;
465
			else
466
				arg_len = ret;
467
		} else if (arg_len == USE_NULL_BYTE_LENGTH) {
468
			null_terminated = true;
469
			arg_len = 0;
470
		} else if (arg_len >= USE_ARG_INDEX_AS_PARAM_LENGTH) {
471
			__u64 idx = arg_len &
472
				    USE_ARG_INDEX_AS_PARAM_LENGTH_MASK;
473
			if (idx < SYSCALL_ARGS)
474
				arg_len = remembered->args[idx];
475
			else
476
				arg_len = PARAM_LEN;
477
		}
478

479
		if (arg_len > sizeof(sc_cont.param))
480
			arg_len = sizeof(sc_cont.param);
481

482
		if (null_terminated)
483
			sc_cont.length = USE_NULL_BYTE_LENGTH;
484
		else
485
			sc_cont.length = arg_len;
486

487
		if (arg_len == 0 && null_terminated) {
488
			if (bpf_probe_read_user_str(
489
				    sc_cont.param, PARAM_LEN,
490
				    (void *)(remembered->args[i])) < 0)
491
				sc_cont.failed = true;
492
		} else if (sizeof(u8) <= arg_len && arg_len <= sizeof(u64) &&
493
			   bpf_probe_read_user(sc_cont.param, arg_len,
494
					       (void *)(remembered->args[i]))) {
495
			sc_cont.failed = true;
496
		} else if (bpf_probe_read_user(sc_cont.param, PARAM_LEN,
497
					       (void *)(remembered->args[i]))) {
498
			sc_cont.failed = true;
499
		}
500

501
		bpf_debug_printk(
502
			"Perf event output (exit): sc_cont.index: %d; sizeof(sc_cont): %d\n",
503
			sc_cont.index, sizeof(sc_cont));
504
		r = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU,
505
					  &sc_cont, sizeof(sc_cont));
506
		if (r != 0) {
507
			bpf_error_printk(
508
				"Problem outputting continued perf event: %d",
509
				ret);
510
		}
511
end_loop:
512
		bpf_map_delete_elem(&probe_at_sys_exit, &pid);
513
	}
514

515
	bpf_get_current_comm(sc.comm, sizeof(sc.comm));
516

517
	bpf_debug_printk(
518
		"Perf event output (exit): sc.id: %d; sc.comm: %s; sizeof(sc): %d\n",
519
		sc.id, sc.comm, sizeof(sc));
520
	r = bpf_perf_event_output(ctx, perf_buffer, BPF_F_CURRENT_CPU, &sc,
521
				  sizeof(sc));
522
	if (r != 0) {
523
		bpf_error_printk("Problem outputting perf event: %d", ret);
524
	}
525
end:
526
	bpf_map_delete_elem(&regs_map, &pid);
527

528
	return 0;
529
}
530

531
/* License declaration checked by the kernel at program load time. */
char LICENSE[] SEC("license") = "GPL";
532

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.