tetragon

Форк
0
/
errormetrics.go 
130 строк · 4.2 Кб
1
// SPDX-License-Identifier: Apache-2.0
2
// Copyright Authors of Tetragon
3

4
package errormetrics
5

6
import (
7
	"fmt"
8

9
	"github.com/cilium/tetragon/pkg/api/ops"
10
	"github.com/cilium/tetragon/pkg/metrics/consts"
11
	"github.com/prometheus/client_golang/prometheus"
12
)
13

14
// ErrorType identifies a category of internal error. Its String form is used
// as the value of the "type" label on the errors_total metric.
type ErrorType int

const (
	// ProcessCacheMissOnGet means a process was not found on a get() call.
	ProcessCacheMissOnGet ErrorType = iota
	// ProcessCacheEvicted means a process was evicted from the cache.
	ProcessCacheEvicted
	// ProcessCacheMissOnRemove means a process was not found on a remove() call.
	ProcessCacheMissOnRemove
	// ProcessPidTidMismatch means a Tid and Pid mismatch that could affect BPF
	// and user space caching logic.
	ProcessPidTidMismatch
	// EventMissingProcessInfo means an event is missing process info.
	EventMissingProcessInfo
	// HandlerError means an error occurred in an event handler.
	HandlerError
	// EventFinalizeProcessInfoFailed means an event finalizer on Process failed.
	EventFinalizeProcessInfoFailed
	// ProcessMetadataUsernameFailed means resolving a Process uid to a username
	// failed.
	ProcessMetadataUsernameFailed
	// ProcessMetadataUsernameIgnoredNotInHost means the username resolution was
	// skipped since the process is not in host namespaces.
	ProcessMetadataUsernameIgnoredNotInHost
)

// errorTypeLabelValues maps every defined ErrorType to its metric label value.
var errorTypeLabelValues = map[ErrorType]string{
	ProcessCacheMissOnGet:                   "process_cache_miss_on_get",
	ProcessCacheEvicted:                     "process_cache_evicted",
	ProcessCacheMissOnRemove:                "process_cache_miss_on_remove",
	ProcessPidTidMismatch:                   "process_pid_tid_mismatch",
	EventMissingProcessInfo:                 "event_missing_process_info",
	HandlerError:                            "handler_error",
	EventFinalizeProcessInfoFailed:          "event_finalize_process_info_failed",
	ProcessMetadataUsernameFailed:           "process_metadata_username_failed",
	ProcessMetadataUsernameIgnoredNotInHost: "process_metadata_username_ignored_not_in_host_namespaces",
}

// String returns the metric label value for e. A value with no entry in
// errorTypeLabelValues yields "unknown" rather than an empty string, so that
// an unexpected error type never produces an empty label.
func (e ErrorType) String() string {
	if label, ok := errorTypeLabelValues[e]; ok {
		return label
	}
	return "unknown"
}
53

54
// EventHandlerError identifies a category of event handler failure. Its
// String form is used as the value of the "error_type" label on the
// handler_errors_total metric.
type EventHandlerError int

// TODO: Recognize different errors returned by individual handlers
const (
	// HandlePerfUnknownOp is reported with the "unknown_opcode" label value.
	HandlePerfUnknownOp EventHandlerError = iota
	// HandlePerfHandlerError is reported with the "event_handler_failed" label
	// value.
	HandlePerfHandlerError
)

// eventHandlerErrorLabelValues maps every defined EventHandlerError to its
// metric label value.
var eventHandlerErrorLabelValues = map[EventHandlerError]string{
	HandlePerfUnknownOp:    "unknown_opcode",
	HandlePerfHandlerError: "event_handler_failed",
}

// String returns the metric label value for e. A value with no entry in
// eventHandlerErrorLabelValues yields "unknown" rather than an empty string,
// so that an unexpected error kind never produces an empty label.
func (e EventHandlerError) String() string {
	if label, ok := eventHandlerErrorLabelValues[e]; ok {
		return label
	}
	return "unknown"
}
70

71
var (
72
	ErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
73
		Namespace:   consts.MetricsNamespace,
74
		Name:        "errors_total",
75
		Help:        "The total number of Tetragon errors. For internal use only.",
76
		ConstLabels: nil,
77
	}, []string{"type"})
78

79
	HandlerErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
80
		Namespace:   consts.MetricsNamespace,
81
		Name:        "handler_errors_total",
82
		Help:        "The total number of event handler errors. For internal use only.",
83
		ConstLabels: nil,
84
	}, []string{"opcode", "error_type"})
85
)
86

87
// InitMetrics registers ErrorTotal and HandlerErrors with registry and then
// touches the expected label combinations with Add(0), so that those series
// are initialized before any error is actually counted.
func InitMetrics(registry *prometheus.Registry) {
	registry.MustRegister(ErrorTotal)
	registry.MustRegister(HandlerErrors)

	// Initialize metrics with labels
	for er := range errorTypeLabelValues {
		GetErrorTotal(er).Add(0)
	}
	for opcode := range ops.OpCodeStrings {
		// The undefined and test opcodes get no event_handler_failed series.
		if opcode != ops.MsgOpUndef && opcode != ops.MsgOpTest {
			GetHandlerErrors(opcode, HandlePerfHandlerError).Add(0)
		}
	}
	// NB: We initialize only ops.MsgOpUndef here, but unknown_opcode can occur for any opcode
	// that is not explicitly handled.
	GetHandlerErrors(ops.MsgOpUndef, HandlePerfUnknownOp).Add(0)

	// NOTES:
	// * op, msg_op, opcode - standardize on a label (+ add human-readable label)
	// * error, error_type, type - standardize on a label
	// * Delete errors_total{type="handler_error"} - it duplicates handler_errors_total
	// * Consider further splitting errors_total
	// * Rename handler_errors_total to event_handler_errors_total?
}
111

112
// Get a new handle on an ErrorTotal metric for an ErrorType
113
func GetErrorTotal(er ErrorType) prometheus.Counter {
114
	return ErrorTotal.WithLabelValues(er.String())
115
}
116

117
// Increment an ErrorTotal for an ErrorType
118
func ErrorTotalInc(er ErrorType) {
119
	GetErrorTotal(er).Inc()
120
}
121

122
// Get a new handle on the HandlerErrors metric
123
func GetHandlerErrors(opcode ops.OpCode, er EventHandlerError) prometheus.Counter {
124
	return HandlerErrors.WithLabelValues(fmt.Sprint(int32(opcode)), er.String())
125
}
126

127
// Increment the HandlerErrors metric
128
func HandlerErrorsInc(opcode ops.OpCode, er EventHandlerError) {
129
	GetHandlerErrors(opcode, er).Inc()
130
}
131

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.