moira

Форк
0
/
silencer.go 
366 строк · 10.3 Кб
1
package silencer
2

3
import (
4
	"fmt"
5
	"strings"
6
	"sync"
7
	"time"
8

9
	"gopkg.in/tomb.v2"
10

11
	"go.avito.ru/DO/moira"
12
	"go.avito.ru/DO/moira/logging"
13
	"go.avito.ru/DO/moira/netbox"
14
)
15

16
// Fixed settings of the silencer worker.
const (
	// autoCreateSPDuration is how far ahead of "now" the expiration of an
	// auto-created silent pattern is set on every full update.
	autoCreateSPDuration = 60 * time.Minute

	// autoCreateSPUser is the login stored on silent patterns that are created
	// automatically from the inactive devices list.
	autoCreateSPUser = "moira-cmdb-auto"

	// checkInterval is the refresh period of the silent patterns cache, in seconds.
	// The same value is passed as the second argument of the database SetLock call.
	checkInterval = 10

	// silencerLockKey is the key passed to the database SetLock call
	// before a full update is attempted.
	silencerLockKey = "moira-silencer-full-update"
)
23

24
// blacklist lists the leading metric name parts that isPatternMatched skips:
// when the first dot-separated part of a metric equals one of these values,
// pattern matching starts from the second part.
var blacklist = []string{
	"servers",
	"network",
	"containers",
	"resources",
	"apps",
	"products",
	"complex",
	"offices",
}
27

28
var (
29
	initLock sync.Mutex
30
	silencer *Silencer
31
)
32

33
// Silencer represents worker that stores and periodically updates silent patterns' cache
34
// it is an auxiliary component which is launched and maintained by any main component, such as checker or notifier
35
type Silencer struct {
36
	mu           sync.Mutex
37
	isFullUpdate bool
38
	metrics      moira.Maintenance
39
	tags         moira.Maintenance
40

41
	database moira.Database
42
	logger   *logging.Logger
43
	netbox   *netbox.Client
44

45
	tomb tomb.Tomb
46
}
47

48
// NewSilencer returns Silencer singleton instance
49
// it also creates it if necessary
50
func NewSilencer(database moira.Database, config *netbox.Config) *Silencer {
51
	if silencer == nil {
52
		initLock.Lock()
53
		defer initLock.Unlock()
54

55
		if silencer == nil {
56
			silencer = &Silencer{
57
				metrics:  moira.NewMaintenance(),
58
				tags:     moira.NewMaintenance(),
59
				database: database,
60
				logger:   logging.GetLogger(""),
61
			}
62
		}
63
	}
64

65
	if config != nil && !silencer.isFullUpdate {
66
		silencer.isFullUpdate = true
67
		silencer.netbox = netbox.CreateClient(config)
68
		silencer.netbox.SetTimeout(5 * time.Second)
69
	}
70

71
	return silencer
72
}
73

74
// Start begins the lifecycle of the Silencer
75
func (worker *Silencer) Start() {
76
	// starting the timer that updates silent pattern list every 10 seconds
77
	worker.tomb.Go(func() error {
78
		checkTicker := time.NewTicker(checkInterval * time.Second)
79
		defer checkTicker.Stop()
80

81
		for {
82
			select {
83
			case <-worker.tomb.Dying():
84
				return nil
85
			case <-checkTicker.C:
86
				worker.updateSilentPatterns()
87
			}
88
		}
89
	})
90
}
91

92
// Stop ends the lifecycle of the Silencer
93
func (worker *Silencer) Stop() error {
94
	worker.tomb.Kill(nil)
95
	return worker.tomb.Wait()
96
}
97

98
// IsMetricSilenced can tell whether or not the given metric matches any silent pattern
99
func (worker *Silencer) IsMetricSilenced(metric string, ts int64) bool {
100
	metrics := worker.metrics
101
	for pattern := range metrics {
102
		// see if the metric matches current pattern in the first place
103
		if !worker.isPatternMatched(metric, pattern) {
104
			continue
105
		}
106

107
		// if it does then make sure that timing is correct
108
		maintained, _ := metrics.Get(metric, ts)
109
		if maintained {
110
			return true
111
		}
112
	}
113
	return false
114
}
115

116
// IsTagsSilenced can tell whether or not any of given tags must be silenced
117
func (worker *Silencer) IsTagsSilenced(tags []string, ts int64) bool {
118
	maintenance := worker.tags
119
	for _, tag := range tags {
120
		if maintained, _ := maintenance.Get(tag, ts); maintained {
121
			return true
122
		}
123
	}
124
	return false
125
}
126

127
// getInactiveDeviceList requests the list of inactive devices using netbox client
128
// it also handles panic
129
func (worker *Silencer) getInactiveDeviceList() netbox.DeviceBriefList {
130
	defer func() {
131
		if r := recover(); r != nil {
132
			worker.logger.Error(fmt.Sprintf("Silencer recovered from netbox client panic: %v (%T)", r, r))
133
		}
134
	}()
135

136
	return worker.netbox.InactiveDeviceList()
137
}
138

139
// getSilentMetricsQtyEstimate returns estimated quantity of silent metrics
140
// (both devices and containers) by inactive devices data
141
func (worker *Silencer) getSilentMetricsQtyEstimate(list netbox.DeviceBriefList) int {
142
	result := 2 * len(list.List)
143
	for _, deviceInfo := range list.List {
144
		result += len(deviceInfo.Containers)
145
	}
146
	return result
147
}
148

149
// isPatternMatched can tell whether or not the given metric matches the given silent pattern
150
func (worker *Silencer) isPatternMatched(metricName, silentPattern string) bool {
151
	metricParts := strings.Split(metricName, ".")
152
	patternParts := strings.Split(silentPattern, ".")
153
	if len(patternParts) > len(metricParts) {
154
		return false
155
	}
156

157
	pIndex := 0
158
	matchCount := 0
159
	isLastMatched := false
160

161
	mIndex := 0
162
	for _, b := range blacklist { // skip blacklist prefix
163
		if metricParts[0] == b {
164
			mIndex = 1
165
			break
166
		}
167
	}
168

169
	for ; mIndex < len(metricParts); mIndex++ {
170
		if metricParts[mIndex] == patternParts[pIndex] {
171
			pIndex++
172
			matchCount++
173
			isLastMatched = true
174
			if matchCount == len(patternParts) {
175
				return true
176
			}
177
		} else {
178
			pIndex = 0
179
			if isLastMatched {
180
				isLastMatched = false
181
				mIndex -= matchCount
182
			}
183
			matchCount = 0
184
		}
185
	}
186

187
	return false
188
}
189

190
// silentMetricsFullUpdate makes full update of silent metrics:
191
// it reloads current list from DB and merges it with inactive containers and/or servers
192
func (worker *Silencer) silentMetricsFullUpdate() error {
193
	// reload current list
194
	currentMetrics, err := worker.database.GetSilentPatternsTyped(moira.SPTMetric)
195
	if err != nil {
196
		return err
197
	}
198

199
	// get inactive containers and servers data from netbox
200
	inactiveDeviceList := worker.getInactiveDeviceList()
201
	inactiveDeviceQty := worker.getSilentMetricsQtyEstimate(inactiveDeviceList)
202

203
	if err := worker.database.LockSilentPatterns(moira.SPTMetric); err != nil {
204
		return err
205
	}
206
	defer worker.database.UnlockSilentPatterns(moira.SPTMetric)
207

208
	inactiveNames := make(map[string]bool, inactiveDeviceQty)                        // set for processed names
209
	processedMetrics := make(map[string]*moira.SilentPatternData, inactiveDeviceQty) // map for processed metrics
210
	metricsUpdateBatch := make([]*moira.SilentPatternData, 0, inactiveDeviceQty)
211
	metricsRemoveBatch := make([]*moira.SilentPatternData, 0, inactiveDeviceQty)
212
	result := make([]string, 0, len(currentMetrics)+inactiveDeviceQty)
213

214
	// transform servers-and-containers list to the flat map
215
	for _, deviceInfo := range inactiveDeviceList.List {
216
		inactiveNames[deviceInfo.Name] = true // name of the server
217
		if deviceInfo.NamePrevious != "" && deviceInfo.NamePrevious != deviceInfo.Name {
218
			inactiveNames[deviceInfo.NamePrevious] = true // and its previous name
219
		}
220
		for _, containerInfo := range deviceInfo.Containers {
221
			inactiveNames[containerInfo.Name] = true // names of its containers
222
		}
223
	}
224

225
	//
226
	// start processing
227
	//
228

229
	// first, process current list
230
	for _, currentMetric := range currentMetrics {
231
		var (
232
			// leave only those patterns which aren't expired yet
233
			expired = time.Now().Unix() > currentMetric.Until
234
			// also pattern could be auto-created based on inactive device list, but this device is inactive no longer
235
			obsolete = currentMetric.Login == autoCreateSPUser && !inactiveNames[currentMetric.Pattern]
236

237
			pattern = currentMetric.Pattern
238
		)
239

240
		if !expired && !obsolete {
241
			processedMetrics[pattern] = currentMetric
242
			result = append(result, pattern)
243
			continue
244
		}
245

246
		worker.logger.InfoE(fmt.Sprintf("Silent metric '%s' added to remove batch", pattern), map[string]interface{}{
247
			"expired":  expired,
248
			"obsolete": obsolete,
249
			"pattern":  pattern,
250
		})
251
		metricsRemoveBatch = append(metricsRemoveBatch, currentMetric)
252
	}
253

254
	worker.logger.InfoE(fmt.Sprintf("Got remove batch of %d metrics", len(metricsRemoveBatch)), metricsRemoveBatch)
255
	if len(metricsRemoveBatch) > 0 {
256
		if err = worker.database.RemoveSilentPatterns(moira.SPTMetric, metricsRemoveBatch...); err != nil {
257
			return err
258
		}
259
	}
260

261
	// next, add or update metrics based on inactive servers-and-containers
262
	now := time.Now()
263
	autoPatternCreated := now.Unix()
264
	autoPatternExpiration := now.Add(autoCreateSPDuration).Unix()
265
	for inactiveName := range inactiveNames {
266
		spd, ok := processedMetrics[inactiveName]
267
		if !ok {
268
			// this container or server hasn't been added yet, so add it now
269
			spd = &moira.SilentPatternData{
270
				Pattern: inactiveName,
271
				Login:   autoCreateSPUser,
272
				Created: autoPatternCreated,
273
			}
274
			result = append(result, inactiveName)
275
		}
276

277
		if spd.Login == autoCreateSPUser {
278
			// if it is auto-created pattern then its expiration will be extended
279
			spd.Until = autoPatternExpiration
280
			metricsUpdateBatch = append(metricsUpdateBatch, spd)
281
		}
282
	}
283

284
	worker.logger.InfoE(fmt.Sprintf("Got update batch of %d metrics", len(metricsUpdateBatch)), metricsUpdateBatch)
285
	if len(metricsUpdateBatch) > 0 {
286
		if err = worker.database.SaveSilentPatterns(moira.SPTMetric, metricsUpdateBatch...); err != nil {
287
			return err
288
		}
289
	}
290

291
	return nil
292
}
293

294
// updateSilentMetrics refreshes silent metrics list and returns new list
295
func (worker *Silencer) updateSilentMetrics(isFullUpdate bool) (moira.Maintenance, error) {
296
	if isFullUpdate {
297
		if err := worker.silentMetricsFullUpdate(); err != nil {
298
			return nil, err
299
		}
300
	}
301
	return worker.database.GetOrCreateMaintenanceSilent(moira.SPTMetric)
302
}
303

304
// updateSilentTags refreshes silent tags list and returns new map
305
func (worker *Silencer) updateSilentTags(isFullUpdate bool) (moira.Maintenance, error) {
306
	// current list
307
	silentTags, err := worker.database.GetSilentPatternsTyped(moira.SPTTag)
308
	if err != nil {
309
		return nil, err
310
	}
311

312
	// updated map
313
	newSilentTags := make(map[string]bool, len(silentTags))
314
	// tags to remove
315
	removeTags := make([]*moira.SilentPatternData, 0, len(silentTags))
316

317
	// set the lock in case tags are going to be updated
318
	if isFullUpdate {
319
		if err := worker.database.LockSilentPatterns(moira.SPTTag); err != nil {
320
			return nil, err
321
		}
322
		defer worker.database.UnlockSilentPatterns(moira.SPTTag)
323
	}
324

325
	for _, silentTag := range silentTags {
326
		if isFullUpdate && time.Now().Unix() > silentTag.Until {
327
			worker.logger.InfoE("Remove silent tag (obsolete)", silentTag)
328
			removeTags = append(removeTags, silentTag)
329
		} else {
330
			newSilentTags[silentTag.Pattern] = true
331
		}
332
	}
333

334
	if len(removeTags) > 0 {
335
		if err = worker.database.RemoveSilentPatterns(moira.SPTTag, removeTags...); err != nil {
336
			return nil, err
337
		}
338
	}
339

340
	return worker.database.GetOrCreateMaintenanceSilent(moira.SPTTag)
341
}
342

343
// updateSilentPatterns refreshes silent patterns data
344
func (worker *Silencer) updateSilentPatterns() {
345
	worker.mu.Lock()
346
	defer worker.mu.Unlock()
347

348
	isFullUpdate := worker.isFullUpdate
349
	if isFullUpdate {
350
		isFullUpdate, _ = worker.database.SetLock(silencerLockKey, checkInterval)
351
	}
352

353
	metrics, err := worker.updateSilentMetrics(isFullUpdate)
354
	if err != nil {
355
		worker.logger.ErrorF("Failed to update silent metrics: %v", err)
356
		return
357
	}
358
	worker.metrics = metrics
359

360
	tags, err := worker.updateSilentTags(isFullUpdate)
361
	if err != nil {
362
		worker.logger.ErrorF("Failed to update silent tags: %v", err)
363
		return
364
	}
365
	worker.tags = tags
366
}
367

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.