inspektor-gadget
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 The Inspektor Gadget authors */

4#include <linux/bpf.h>5#include <linux/if_ether.h>6#include <linux/ip.h>7#include <linux/in.h>8#include <linux/udp.h>9#include <sys/socket.h>10
11#include <bpf/bpf_helpers.h>12#include <bpf/bpf_endian.h>13
14#define GADGET_TYPE_NETWORKING15#include <gadget/sockets-map.h>16
17#include "dns-common.h"18
// Offset of the DNS header from the start of the frame: Ethernet header,
// a struct iphdr-sized IPv4 header and the UDP header.
#define DNS_OFF (ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))

// https://datatracker.ietf.org/doc/html/rfc1035#section-3.2.4
#define DNS_CLASS_IN 1
// https://datatracker.ietf.org/doc/html/rfc1035#section-3.2.2
#define DNS_TYPE_A 1
// https://www.rfc-editor.org/rfc/rfc3596#section-2.1
#define DNS_TYPE_AAAA 28

// skb->pkt_type values, defined here only when no header provided them.
#ifndef PACKET_HOST
#define PACKET_HOST 0x0
#endif

#ifndef PACKET_OUTGOING
#define PACKET_OUTGOING 0x4
#endif

// Values of the QR bit in the DNS header flags.
#define DNS_QR_QUERY 0
#define DNS_QR_RESP 1
// Never read: this pointer only exists so that the compiler does not drop
// struct event_t from the generated object.
const struct event_t *unusedevent __attribute__((unused));
41struct {42__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);43} events SEC(".maps");44
// Flags word of the DNS header.
// https://datatracker.ietf.org/doc/html/rfc1035#section-4.1.1
//
// `flags` holds the 16-bit word and the anonymous struct gives a per-field
// view of it. The field order is mirrored per host endianness so that the
// bit-fields line up once `flags` has been stored in host byte order (which
// is how ig_trace_dns() fills it, from load_half()).
union dnsflags {
	struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		__u8 rcode : 4; // response code
		__u8 z : 3; // reserved
		__u8 ra : 1; // recursion available
		__u8 rd : 1; // recursion desired
		__u8 tc : 1; // truncation
		__u8 aa : 1; // authoritative answer
		__u8 opcode : 4; // kind of query
		__u8 qr : 1; // 0=query; 1=response
// Must be the compiler-provided __BYTE_ORDER__ (with trailing underscores):
// plain __BYTE_ORDER is not predefined, and an undefined name evaluates to 0
// here, which would send big-endian builds to the #error below.
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
		__u8 qr : 1; // 0=query; 1=response
		__u8 opcode : 4; // kind of query
		__u8 aa : 1; // authoritative answer
		__u8 tc : 1; // truncation
		__u8 rd : 1; // recursion desired
		__u8 ra : 1; // recursion available
		__u8 z : 3; // reserved
		__u8 rcode : 4; // response code
#else
#error "Fix your compiler's __BYTE_ORDER__?!"
#endif
	};
	__u16 flags;
};
73struct dnshdr {74__u16 id;75
76union dnsflags flags;77
78__u16 qdcount; // number of question entries79__u16 ancount; // number of answer entries80__u16 nscount; // number of authority records81__u16 arcount; // number of additional records82};83
// Fixed part of a DNS resource record whose name is a two-octet compression
// pointer.
// https://datatracker.ietf.org/doc/html/rfc1035#section-4.1.3
//
// The struct is packed to 2 bytes so that `ttl` directly follows `class` and
// sizeof(struct dnsrr) can be used as the offset of the rdata.
// NOTE(review): no `#pragma pack()` reset is visible after this definition,
// so the 2-byte packing presumably stays in effect for the definitions that
// follow in this file.
#pragma pack(2)
struct dnsrr {
	// Two octets when using message compression, see
	// https://datatracker.ietf.org/doc/html/rfc1035#section-4.1.4
	__u16 name;
	__u16 type;
	__u16 class;
	__u32 ttl;
	__u16 rdlength;
	// Followed by rdata
};
96// The stack is limited, so use a map to build the event
97struct {98__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);99__uint(max_entries, 1);100__type(key, __u32);101__type(value, struct event_t);102} tmp_event SEC(".maps");103
// Key of query_map, the map of DNS query to timestamp that is used to
// calculate the latency from query sent to answer received.
struct query_key_t {
	__u64 pid_tgid; // pid_tgid taken from the socket lookup result
	__u16 id; // DNS transaction id
};
110struct {111__uint(type, BPF_MAP_TYPE_HASH);112__type(key, struct query_key_t);113__type(value, __u64); // timestamp of the query114__uint(max_entries, 1024);115} query_map SEC(".maps");116
117static __always_inline __u32 dns_name_length(struct __sk_buff *skb)118{
119// This loop iterates over the DNS labels to find the total DNS name120// length.121unsigned int i;122unsigned int skip = 0;123for (i = 0; i < MAX_DNS_NAME; i++) {124if (skip != 0) {125skip--;126} else {127int label_len = load_byte(128skb, DNS_OFF + sizeof(struct dnshdr) + i);129if (label_len == 0)130break;131// The simple solution "i += label_len" gives verifier132// errors, so work around with skip.133skip = label_len;134}135}136
137return i < MAX_DNS_NAME ? i : MAX_DNS_NAME;138}
139
140// Save the IPv4 and IPv6 addresses in event->anaddr. Returns the number of saved addresses.
141static __always_inline int load_addresses(struct __sk_buff *skb, int ancount,142int anoffset, struct event_t *event)143{
144int rroffset = anoffset;145int index = 0;146for (int i = 0; i < ancount && i < MAX_ADDR_ANSWERS; i++) {147__u16 rrname =148load_byte(skb, rroffset + offsetof(struct dnsrr, name));149
150// In most cases, the name will be compressed to two octets (indicated by first two bits 0b11).151// The offset calculations below assume compression, so exit early if the name isn't compressed.152if ((rrname & 0xf0) != 0xc0)153return 0;154
155// Safe to assume that all answers refer to the same domain name156// because we verified earlier that there's exactly one question.157
158__u16 rrtype =159load_half(skb, rroffset + offsetof(struct dnsrr, type));160__u16 rrclass = load_half(skb, rroffset + offsetof(struct dnsrr,161class));162__u16 rdlength = load_half(163skb, rroffset + offsetof(struct dnsrr, rdlength));164
165if (rrtype == DNS_TYPE_A && rrclass == DNS_CLASS_IN &&166rdlength == 4) {167// A record contains an IPv4 address.168// Encode this as IPv4-mapped-IPv6 in the BPF event (::ffff:<ipv4>)169// https://datatracker.ietf.org/doc/html/rfc4291#section-2.5.5.2170__builtin_memset(&event->anaddr[index][0], 0x0, 10);171__builtin_memset(&event->anaddr[index][10], 0xff, 2);172bpf_skb_load_bytes(skb, rroffset + sizeof(struct dnsrr),173&event->anaddr[index][12], rdlength);174index++;175} else if (rrtype == DNS_TYPE_AAAA && rrclass == DNS_CLASS_IN &&176rdlength == 16) {177// AAAA record contains an IPv6 address.178bpf_skb_load_bytes(skb, rroffset + sizeof(struct dnsrr),179&event->anaddr[index][0], rdlength);180index++;181}182rroffset += sizeof(struct dnsrr) + rdlength;183}184return index;185}
186
187static __always_inline int output_dns_event(struct __sk_buff *skb,188union dnsflags flags,189__u32 name_len, __u16 ancount)190{
191__u32 zero = 0;192struct event_t *event = bpf_map_lookup_elem(&tmp_event, &zero);193if (!event)194return 0;195
196__builtin_memset(event, 0, sizeof(*event));197
198event->netns = skb->cb[0]; // cb[0] initialized by dispatcher.bpf.c199event->timestamp = bpf_ktime_get_boot_ns();200event->id = load_half(skb, DNS_OFF + offsetof(struct dnshdr, id));201event->af = AF_INET;202event->daddr_v4 =203load_word(skb, ETH_HLEN + offsetof(struct iphdr, daddr));204event->saddr_v4 =205load_word(skb, ETH_HLEN + offsetof(struct iphdr, saddr));206// load_word converts from network to host endianness. Convert back to207// network endianness because inet_ntop() requires it.208event->daddr_v4 = bpf_htonl(event->daddr_v4);209event->saddr_v4 = bpf_htonl(event->saddr_v4);210
211// Check network protocol.212// This only works with IPv4.213// For IPv6, gadget_socket_lookup() in sockets-map.h214// provides an example how to parse ip/ports on IPv6.215event->proto =216load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));217if (event->proto == IPPROTO_TCP) {218event->sport =219load_half(skb, ETH_HLEN + sizeof(struct iphdr) +220offsetof(struct tcphdr, source));221event->dport =222load_half(skb, ETH_HLEN + sizeof(struct iphdr) +223offsetof(struct tcphdr, dest));224} else if (event->proto == IPPROTO_UDP) {225event->sport =226load_half(skb, ETH_HLEN + sizeof(struct iphdr) +227offsetof(struct udphdr, source));228event->dport =229load_half(skb, ETH_HLEN + sizeof(struct iphdr) +230offsetof(struct udphdr, dest));231}232
233event->qr = flags.qr;234
235if (flags.qr == 1) {236// Response code set only for replies.237event->rcode = flags.rcode;238}239
240bpf_skb_load_bytes(skb, DNS_OFF + sizeof(struct dnshdr), event->name,241name_len);242
243event->pkt_type = skb->pkt_type;244
245// Read QTYPE right after the QNAME (name_len + the zero length octet)246// https://datatracker.ietf.org/doc/html/rfc1035#section-4.1.2247event->qtype =248load_half(skb, DNS_OFF + sizeof(struct dnshdr) + name_len + 1);249
250// Enrich event with process metadata251struct sockets_value *skb_val = gadget_socket_lookup(skb);252if (skb_val != NULL) {253event->mount_ns_id = skb_val->mntns;254event->pid = skb_val->pid_tgid >> 32;255event->tid = (__u32)skb_val->pid_tgid;256__builtin_memcpy(&event->task, skb_val->task,257sizeof(event->task));258event->uid = (__u32)skb_val->uid_gid;259event->gid = (__u32)(skb_val->uid_gid >> 32);260}261
262event->ancount = ancount;263
264// DNS answers start immediately after qname (name_len octets)265// + the zero length octet + qtype (2 octets) + qclass (2 octets).266int anoffset = DNS_OFF + sizeof(struct dnshdr) + name_len + 5;267int anaddrcount = load_addresses(skb, ancount, anoffset, event);268event->anaddrcount = anaddrcount;269
270// Calculate latency:271//272// Track the latency from when a query is sent from a container273// to when a response to the query is received by that same container.274//275// * On DNS query sent from a container namespace (qr == DNS_QR_QUERY and pkt_type == OUTGOING),276// store the query timestamp in a map.277//278// * On DNS response received in the same container namespace (qr == DNS_QR_RESP and pkt_type == HOST)279// retrieve/delete the query timestamp and set the latency field on the event.280//281// A garbage collection thread running in userspace periodically scans for keys with old timestamps282// to free space occupied by queries that never receive a response.283//284// Skip this if skb_val == NULL (gadget_socket_lookup did not set pid_tgid we use in the query key)285// or if event->timestamp == 0 (kernels before 5.8 don't support bpf_ktime_get_boot_ns, and the patched286// version IG injects always returns zero).287if (skb_val != NULL && event->timestamp > 0) {288struct query_key_t query_key = {289.pid_tgid = skb_val->pid_tgid,290.id = event->id,291};292if (event->qr == DNS_QR_QUERY &&293event->pkt_type == PACKET_OUTGOING) {294bpf_map_update_elem(&query_map, &query_key,295&event->timestamp, BPF_NOEXIST);296} else if (event->qr == DNS_QR_RESP &&297event->pkt_type == PACKET_HOST) {298__u64 *query_ts =299bpf_map_lookup_elem(&query_map, &query_key);300if (query_ts != NULL) {301// query ts should always be less than the event ts, but check anyway to be safe.302if (*query_ts < event->timestamp) {303event->latency_ns =304event->timestamp - *query_ts;305}306bpf_map_delete_elem(&query_map, &query_key);307}308}309}310
311// size of full structure - addresses + only used addresses312unsigned long long size =313sizeof(*event) - MAX_ADDR_ANSWERS * 16 + anaddrcount * 16;314bpf_perf_event_output(skb, &events, BPF_F_CURRENT_CPU, event, size);315
316return 0;317}
318
319SEC("socket1")320int ig_trace_dns(struct __sk_buff *skb)321{
322// Skip non-IP packets323if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP)324return 0;325
326// Skip non-UDP packets327if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) !=328IPPROTO_UDP)329return 0;330
331union dnsflags flags;332flags.flags = load_half(skb, DNS_OFF + offsetof(struct dnshdr, flags));333
334// Skip DNS packets with more than 1 question335if (load_half(skb, DNS_OFF + offsetof(struct dnshdr, qdcount)) != 1)336return 0;337
338__u16 ancount =339load_half(skb, DNS_OFF + offsetof(struct dnshdr, ancount));340__u16 nscount =341load_half(skb, DNS_OFF + offsetof(struct dnshdr, nscount));342
343// Skip DNS queries with answers344if ((flags.qr == 0) && (ancount + nscount != 0))345return 0;346
347__u32 name_len = dns_name_length(skb);348if (name_len == 0)349return 0;350
351return output_dns_event(skb, flags, name_len, ancount);352}
353
354char _license[] SEC("license") = "GPL";355