tetragon
130 строк · 4.2 Кб
1// SPDX-License-Identifier: Apache-2.0
2// Copyright Authors of Tetragon
3
4package errormetrics
5
6import (
7"fmt"
8
9"github.com/cilium/tetragon/pkg/api/ops"
10"github.com/cilium/tetragon/pkg/metrics/consts"
11"github.com/prometheus/client_golang/prometheus"
12)
13
// ErrorType enumerates the kinds of errors counted by the errors_total
// metric. Each value is mapped to its "type" label in errorTypeLabelValues.
type ErrorType int

const (
	// ProcessCacheMissOnGet: the process was not found on a get() call.
	ProcessCacheMissOnGet ErrorType = iota
	// ProcessCacheEvicted: the process was evicted from the cache.
	ProcessCacheEvicted
	// ProcessCacheMissOnRemove: the process was not found on a remove() call.
	ProcessCacheMissOnRemove
	// ProcessPidTidMismatch: Tid and Pid differ, which could affect the BPF
	// and user space caching logic.
	ProcessPidTidMismatch
	// EventMissingProcessInfo: an event has no process info attached.
	EventMissingProcessInfo
	// HandlerError: an event handler reported an error.
	HandlerError
	// EventFinalizeProcessInfoFailed: an event finalizer on Process failed.
	EventFinalizeProcessInfoFailed
	// ProcessMetadataUsernameFailed: the Process uid could not be resolved to
	// a username.
	ProcessMetadataUsernameFailed
	// ProcessMetadataUsernameIgnoredNotInHost: username resolution was skipped
	// because the process is not in host namespaces.
	ProcessMetadataUsernameIgnoredNotInHost
)

// errorTypeLabelValues holds the label value reported for every ErrorType.
var errorTypeLabelValues = map[ErrorType]string{
	ProcessCacheMissOnGet:                   "process_cache_miss_on_get",
	ProcessCacheEvicted:                     "process_cache_evicted",
	ProcessCacheMissOnRemove:                "process_cache_miss_on_remove",
	ProcessPidTidMismatch:                   "process_pid_tid_mismatch",
	EventMissingProcessInfo:                 "event_missing_process_info",
	HandlerError:                            "handler_error",
	EventFinalizeProcessInfoFailed:          "event_finalize_process_info_failed",
	ProcessMetadataUsernameFailed:           "process_metadata_username_failed",
	ProcessMetadataUsernameIgnoredNotInHost: "process_metadata_username_ignored_not_in_host_namespaces",
}

// String returns the label value stored for e in errorTypeLabelValues, or the
// empty string when e has no entry there.
func (e ErrorType) String() string {
	if label, known := errorTypeLabelValues[e]; known {
		return label
	}
	return ""
}
53
// EventHandlerError enumerates the failure kinds reported through the
// "error_type" label of the handler_errors_total metric.
type EventHandlerError int

// TODO: Recognize different errors returned by individual handlers
const (
	// HandlePerfUnknownOp is reported with the "unknown_opcode" label value.
	HandlePerfUnknownOp EventHandlerError = iota
	// HandlePerfHandlerError is reported with the "event_handler_failed"
	// label value.
	HandlePerfHandlerError
)

// eventHandlerErrorLabelValues holds the label value reported for every
// EventHandlerError.
var eventHandlerErrorLabelValues = map[EventHandlerError]string{
	HandlePerfUnknownOp:    "unknown_opcode",
	HandlePerfHandlerError: "event_handler_failed",
}

// String returns the label value stored for e in
// eventHandlerErrorLabelValues, or the empty string when e has no entry there.
func (e EventHandlerError) String() string {
	if label, known := eventHandlerErrorLabelValues[e]; known {
		return label
	}
	return ""
}
70
71var (
72ErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
73Namespace: consts.MetricsNamespace,
74Name: "errors_total",
75Help: "The total number of Tetragon errors. For internal use only.",
76ConstLabels: nil,
77}, []string{"type"})
78
79HandlerErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
80Namespace: consts.MetricsNamespace,
81Name: "handler_errors_total",
82Help: "The total number of event handler errors. For internal use only.",
83ConstLabels: nil,
84}, []string{"opcode", "error_type"})
85)
86
87func InitMetrics(registry *prometheus.Registry) {
88registry.MustRegister(ErrorTotal)
89registry.MustRegister(HandlerErrors)
90
91// Initialize metrics with labels
92for er := range errorTypeLabelValues {
93GetErrorTotal(er).Add(0)
94}
95for opcode := range ops.OpCodeStrings {
96if opcode != ops.MsgOpUndef && opcode != ops.MsgOpTest {
97GetHandlerErrors(opcode, HandlePerfHandlerError).Add(0)
98}
99}
100// NB: We initialize only ops.MsgOpUndef here, but unknown_opcode can occur for any opcode
101// that is not explicitly handled.
102GetHandlerErrors(ops.MsgOpUndef, HandlePerfUnknownOp).Add(0)
103
104// NOTES:
105// * op, msg_op, opcode - standardize on a label (+ add human-readable label)
106// * error, error_type, type - standardize on a label
107// * Delete errors_total{type="handler_error"} - it duplicates handler_errors_total
108// * Consider further splitting errors_total
109// * Rename handler_errors_total to event_handler_errors_total?
110}
111
112// Get a new handle on an ErrorTotal metric for an ErrorType
113func GetErrorTotal(er ErrorType) prometheus.Counter {
114return ErrorTotal.WithLabelValues(er.String())
115}
116
117// Increment an ErrorTotal for an ErrorType
118func ErrorTotalInc(er ErrorType) {
119GetErrorTotal(er).Inc()
120}
121
122// Get a new handle on the HandlerErrors metric
123func GetHandlerErrors(opcode ops.OpCode, er EventHandlerError) prometheus.Counter {
124return HandlerErrors.WithLabelValues(fmt.Sprint(int32(opcode)), er.String())
125}
126
127// Increment the HandlerErrors metric
128func HandlerErrorsInc(opcode ops.OpCode, er EventHandlerError) {
129GetHandlerErrors(opcode, er).Inc()
130}
131