11
"go.avito.ru/DO/moira"
12
"go.avito.ru/DO/moira/logging"
13
"go.avito.ru/DO/moira/netbox"
16
// name of user that creates silent patterns based on inactive servers info
18
autoCreateSPDuration = 60 * time.Minute
19
autoCreateSPUser = "moira-cmdb-auto"
20
checkInterval = 10 // seconds
21
silencerLockKey = "moira-silencer-full-update"
25
blacklist = []string{"servers", "network", "containers", "resources", "apps", "products", "complex", "offices"}
33
// Silencer is a worker that stores and periodically updates the cache of silent patterns
34
// it is an auxiliary component which is launched and maintained by any main component, such as checker or notifier
38
metrics moira.Maintenance
39
tags moira.Maintenance
41
database moira.Database
42
logger *logging.Logger
48
// NewSilencer returns Silencer singleton instance
49
// it also creates it if necessary
50
func NewSilencer(database moira.Database, config *netbox.Config) *Silencer {
53
defer initLock.Unlock()
57
metrics: moira.NewMaintenance(),
58
tags: moira.NewMaintenance(),
60
logger: logging.GetLogger(""),
65
if config != nil && !silencer.isFullUpdate {
66
silencer.isFullUpdate = true
67
silencer.netbox = netbox.CreateClient(config)
68
silencer.netbox.SetTimeout(5 * time.Second)
74
// Start begins the lifecycle of the Silencer
75
func (worker *Silencer) Start() {
76
// start the ticker that refreshes the silent pattern list every checkInterval seconds
77
worker.tomb.Go(func() error {
78
checkTicker := time.NewTicker(checkInterval * time.Second)
79
defer checkTicker.Stop()
83
case <-worker.tomb.Dying():
86
worker.updateSilentPatterns()
92
// Stop ends the lifecycle of the Silencer
93
func (worker *Silencer) Stop() error {
95
return worker.tomb.Wait()
98
// IsMetricSilenced reports whether the given metric matches any silent pattern and is reported as maintained at timestamp ts
99
func (worker *Silencer) IsMetricSilenced(metric string, ts int64) bool {
100
metrics := worker.metrics
101
for pattern := range metrics {
102
// see if the metric matches current pattern in the first place
103
if !worker.isPatternMatched(metric, pattern) {
107
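// the pattern matched, so now check that the maintenance window covers ts
// NOTE(review): the lookup below is keyed by `metric`, not by the matched `pattern` — confirm this is intended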
108
maintained, _ := metrics.Get(metric, ts)
116
// IsTagsSilenced reports whether at least one of the given tags is silenced at timestamp ts
117
func (worker *Silencer) IsTagsSilenced(tags []string, ts int64) bool {
118
maintenance := worker.tags
119
for _, tag := range tags {
120
if maintained, _ := maintenance.Get(tag, ts); maintained {
127
// getInactiveDeviceList requests the list of inactive devices using netbox client
128
// it also recovers from a panic raised by the netbox client and logs it as an error
129
func (worker *Silencer) getInactiveDeviceList() netbox.DeviceBriefList {
131
if r := recover(); r != nil {
132
worker.logger.Error(fmt.Sprintf("Silencer recovered from netbox client panic: %v (%T)", r, r))
136
return worker.netbox.InactiveDeviceList()
139
// getSilentMetricsQtyEstimate returns estimated quantity of silent metrics
140
// (both devices and containers) by inactive devices data
141
func (worker *Silencer) getSilentMetricsQtyEstimate(list netbox.DeviceBriefList) int {
142
result := 2 * len(list.List)
143
for _, deviceInfo := range list.List {
144
result += len(deviceInfo.Containers)
149
// isPatternMatched can tell whether or not the given metric matches the given silent pattern
150
func (worker *Silencer) isPatternMatched(metricName, silentPattern string) bool {
151
metricParts := strings.Split(metricName, ".")
152
patternParts := strings.Split(silentPattern, ".")
153
if len(patternParts) > len(metricParts) {
159
isLastMatched := false
162
for _, b := range blacklist { // skip blacklist prefix
163
if metricParts[0] == b {
169
for ; mIndex < len(metricParts); mIndex++ {
170
if metricParts[mIndex] == patternParts[pIndex] {
174
if matchCount == len(patternParts) {
180
isLastMatched = false
190
// silentMetricsFullUpdate performs a full update of silent metrics:
191
// it reloads current list from DB and merges it with inactive containers and/or servers
192
func (worker *Silencer) silentMetricsFullUpdate() error {
193
// reload current list
194
currentMetrics, err := worker.database.GetSilentPatternsTyped(moira.SPTMetric)
199
// get inactive containers and servers data from netbox
200
inactiveDeviceList := worker.getInactiveDeviceList()
201
inactiveDeviceQty := worker.getSilentMetricsQtyEstimate(inactiveDeviceList)
203
if err := worker.database.LockSilentPatterns(moira.SPTMetric); err != nil {
206
defer worker.database.UnlockSilentPatterns(moira.SPTMetric)
208
inactiveNames := make(map[string]bool, inactiveDeviceQty) // set for processed names
209
processedMetrics := make(map[string]*moira.SilentPatternData, inactiveDeviceQty) // map for processed metrics
210
metricsUpdateBatch := make([]*moira.SilentPatternData, 0, inactiveDeviceQty)
211
metricsRemoveBatch := make([]*moira.SilentPatternData, 0, inactiveDeviceQty)
212
result := make([]string, 0, len(currentMetrics)+inactiveDeviceQty)
214
// transform servers-and-containers list to the flat map
215
for _, deviceInfo := range inactiveDeviceList.List {
216
inactiveNames[deviceInfo.Name] = true // name of the server
217
if deviceInfo.NamePrevious != "" && deviceInfo.NamePrevious != deviceInfo.Name {
218
inactiveNames[deviceInfo.NamePrevious] = true // and its previous name
220
for _, containerInfo := range deviceInfo.Containers {
221
inactiveNames[containerInfo.Name] = true // names of its containers
229
// first, process current list
230
for _, currentMetric := range currentMetrics {
232
// leave only those patterns which aren't expired yet
233
expired = time.Now().Unix() > currentMetric.Until
234
// a pattern is obsolete when it was auto-created from the inactive device list but its device is no longer in that list
235
obsolete = currentMetric.Login == autoCreateSPUser && !inactiveNames[currentMetric.Pattern]
237
pattern = currentMetric.Pattern
240
if !expired && !obsolete {
241
processedMetrics[pattern] = currentMetric
242
result = append(result, pattern)
246
worker.logger.InfoE(fmt.Sprintf("Silent metric '%s' added to remove batch", pattern), map[string]interface{}{
248
"obsolete": obsolete,
251
metricsRemoveBatch = append(metricsRemoveBatch, currentMetric)
254
worker.logger.InfoE(fmt.Sprintf("Got remove batch of %d metrics", len(metricsRemoveBatch)), metricsRemoveBatch)
255
if len(metricsRemoveBatch) > 0 {
256
if err = worker.database.RemoveSilentPatterns(moira.SPTMetric, metricsRemoveBatch...); err != nil {
261
// next, add or update metrics based on inactive servers-and-containers
263
autoPatternCreated := now.Unix()
264
autoPatternExpiration := now.Add(autoCreateSPDuration).Unix()
265
for inactiveName := range inactiveNames {
266
spd, ok := processedMetrics[inactiveName]
268
// this container or server hasn't been added yet, so add it now
269
spd = &moira.SilentPatternData{
270
Pattern: inactiveName,
271
Login: autoCreateSPUser,
272
Created: autoPatternCreated,
274
result = append(result, inactiveName)
277
if spd.Login == autoCreateSPUser {
278
// if it is auto-created pattern then its expiration will be extended
279
spd.Until = autoPatternExpiration
280
metricsUpdateBatch = append(metricsUpdateBatch, spd)
284
worker.logger.InfoE(fmt.Sprintf("Got update batch of %d metrics", len(metricsUpdateBatch)), metricsUpdateBatch)
285
if len(metricsUpdateBatch) > 0 {
286
if err = worker.database.SaveSilentPatterns(moira.SPTMetric, metricsUpdateBatch...); err != nil {
294
// updateSilentMetrics refreshes silent metrics list and returns new list
295
func (worker *Silencer) updateSilentMetrics(isFullUpdate bool) (moira.Maintenance, error) {
297
if err := worker.silentMetricsFullUpdate(); err != nil {
301
return worker.database.GetOrCreateMaintenanceSilent(moira.SPTMetric)
304
// updateSilentTags refreshes silent tags list and returns new map
305
func (worker *Silencer) updateSilentTags(isFullUpdate bool) (moira.Maintenance, error) {
307
silentTags, err := worker.database.GetSilentPatternsTyped(moira.SPTTag)
313
newSilentTags := make(map[string]bool, len(silentTags))
315
removeTags := make([]*moira.SilentPatternData, 0, len(silentTags))
317
// set the lock in case tags are going to be updated
319
if err := worker.database.LockSilentPatterns(moira.SPTTag); err != nil {
322
defer worker.database.UnlockSilentPatterns(moira.SPTTag)
325
for _, silentTag := range silentTags {
326
if isFullUpdate && time.Now().Unix() > silentTag.Until {
327
worker.logger.InfoE("Remove silent tag (obsolete)", silentTag)
328
removeTags = append(removeTags, silentTag)
330
newSilentTags[silentTag.Pattern] = true
334
if len(removeTags) > 0 {
335
if err = worker.database.RemoveSilentPatterns(moira.SPTTag, removeTags...); err != nil {
340
return worker.database.GetOrCreateMaintenanceSilent(moira.SPTTag)
343
// updateSilentPatterns refreshes silent patterns data
344
func (worker *Silencer) updateSilentPatterns() {
346
defer worker.mu.Unlock()
348
isFullUpdate := worker.isFullUpdate
350
isFullUpdate, _ = worker.database.SetLock(silencerLockKey, checkInterval)
353
metrics, err := worker.updateSilentMetrics(isFullUpdate)
355
worker.logger.ErrorF("Failed to update silent metrics: %v", err)
358
worker.metrics = metrics
360
tags, err := worker.updateSilentTags(isFullUpdate)
362
worker.logger.ErrorF("Failed to update silent tags: %v", err)