Dragonfly2

Форк
0
/
scheduler.go 
467 строк · 12.9 Кб
/*
 *     Copyright 2020 The Dragonfly Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package scheduler

import (
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"net"
	"net/http"
	"path/filepath"
	"time"

	"github.com/go-redis/redis/v8"
	"github.com/johanbrandhorst/certify"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/credentials/insecure"
	zapadapter "logur.dev/adapter/zap"

	logger "d7y.io/dragonfly/v2/internal/dflog"
	"d7y.io/dragonfly/v2/internal/dynconfig"
	"d7y.io/dragonfly/v2/pkg/cache"
	"d7y.io/dragonfly/v2/pkg/dfpath"
	"d7y.io/dragonfly/v2/pkg/gc"
	"d7y.io/dragonfly/v2/pkg/issuer"
	"d7y.io/dragonfly/v2/pkg/net/ip"
	pkgredis "d7y.io/dragonfly/v2/pkg/redis"
	"d7y.io/dragonfly/v2/pkg/rpc"
	managerclient "d7y.io/dragonfly/v2/pkg/rpc/manager/client"
	securityclient "d7y.io/dragonfly/v2/pkg/rpc/security/client"
	trainerclient "d7y.io/dragonfly/v2/pkg/rpc/trainer/client"
	"d7y.io/dragonfly/v2/pkg/types"
	"d7y.io/dragonfly/v2/scheduler/announcer"
	"d7y.io/dragonfly/v2/scheduler/config"
	"d7y.io/dragonfly/v2/scheduler/job"
	"d7y.io/dragonfly/v2/scheduler/metrics"
	"d7y.io/dragonfly/v2/scheduler/networktopology"
	"d7y.io/dragonfly/v2/scheduler/resource"
	"d7y.io/dragonfly/v2/scheduler/rpcserver"
	"d7y.io/dragonfly/v2/scheduler/scheduling"
	"d7y.io/dragonfly/v2/scheduler/scheduling/evaluator"
)

const (
	// gracefulStopTimeout specifies a time limit for
	// grpc server to complete a graceful shutdown.
	gracefulStopTimeout = 10 * time.Minute
)

67
// Server is the scheduler server.
68
type Server struct {
69
	// Server configuration.
70
	config *config.Config
71

72
	// GRPC server.
73
	grpcServer *grpc.Server
74

75
	// Metrics server.
76
	metricsServer *http.Server
77

78
	// Manager client.
79
	managerClient managerclient.V2
80

81
	// Security client.
82
	securityClient securityclient.V1
83

84
	// Trainer client.
85
	trainerClient trainerclient.V1
86

87
	// Resource interface.
88
	resource resource.Resource
89

90
	// Dynamic config.
91
	dynconfig config.DynconfigInterface
92

93
	// Async job.
94
	job job.Job
95

96
	// Storage interface.
97
	storage storage.Storage
98

99
	// Announcer interface.
100
	announcer announcer.Announcer
101

102
	// Network topology interface.
103
	networkTopology networktopology.NetworkTopology
104

105
	// GC service.
106
	gc gc.GC
107
}
108

109
// New creates a new scheduler server.
110
func New(ctx context.Context, cfg *config.Config, d dfpath.Dfpath) (*Server, error) {
111
	s := &Server{config: cfg}
112

113
	// Initialize Storage.
114
	storage, err := storage.New(
115
		d.DataDir(),
116
		cfg.Storage.MaxSize,
117
		cfg.Storage.MaxBackups,
118
		cfg.Storage.BufferSize,
119
	)
120
	if err != nil {
121
		return nil, err
122
	}
123
	s.storage = storage
124

125
	// Initialize dial options of manager grpc client.
126
	managerDialOptions := []grpc.DialOption{}
127
	if cfg.Security.AutoIssueCert {
128
		clientTransportCredentials, err := rpc.NewClientCredentials(cfg.Security.TLSPolicy, nil, []byte(cfg.Security.CACert))
129
		if err != nil {
130
			return nil, err
131
		}
132

133
		managerDialOptions = append(managerDialOptions, grpc.WithTransportCredentials(clientTransportCredentials))
134
	} else {
135
		managerDialOptions = append(managerDialOptions, grpc.WithTransportCredentials(insecure.NewCredentials()))
136
	}
137

138
	// Initialize manager client.
139
	managerClient, err := managerclient.GetV2ByAddr(ctx, cfg.Manager.Addr, managerDialOptions...)
140
	if err != nil {
141
		return nil, err
142
	}
143
	s.managerClient = managerClient
144

145
	// Initialize dial options of trainer grpc client.
146
	if cfg.Trainer.Enable {
147
		trainerDialOptions := []grpc.DialOption{}
148
		if cfg.Security.AutoIssueCert {
149
			clientTransportCredentials, err := rpc.NewClientCredentials(cfg.Security.TLSPolicy, nil, []byte(cfg.Security.CACert))
150
			if err != nil {
151
				return nil, err
152
			}
153

154
			trainerDialOptions = append(trainerDialOptions, grpc.WithTransportCredentials(clientTransportCredentials))
155
		} else {
156
			trainerDialOptions = append(trainerDialOptions, grpc.WithTransportCredentials(insecure.NewCredentials()))
157
		}
158

159
		// Initialize trainer client.
160
		trainerClient, err := trainerclient.GetV1ByAddr(ctx, cfg.Trainer.Addr, trainerDialOptions...)
161
		if err != nil {
162
			return nil, err
163
		}
164
		s.trainerClient = trainerClient
165
	}
166

167
	// Initialize dial options of announcer.
168
	announcerOptions := []announcer.Option{}
169
	if s.trainerClient != nil {
170
		announcerOptions = append(announcerOptions, announcer.WithTrainerClient(s.trainerClient))
171
	}
172

173
	// Initialize announcer.
174
	announcer, err := announcer.New(cfg, s.managerClient, storage, announcerOptions...)
175
	if err != nil {
176
		return nil, err
177
	}
178
	s.announcer = announcer
179

180
	// Initialize certify client.
181
	var (
182
		certifyClient              *certify.Certify
183
		clientTransportCredentials credentials.TransportCredentials
184
	)
185
	if cfg.Security.AutoIssueCert {
186
		// Initialize security client.
187
		securityClient, err := securityclient.GetV1(ctx, cfg.Manager.Addr, managerDialOptions...)
188
		if err != nil {
189
			return nil, err
190
		}
191
		s.securityClient = securityClient
192

193
		certifyClient = &certify.Certify{
194
			CommonName:  types.SchedulerName,
195
			Issuer:      issuer.NewDragonflyIssuer(s.securityClient, issuer.WithValidityPeriod(cfg.Security.CertSpec.ValidityPeriod)),
196
			RenewBefore: time.Hour,
197
			CertConfig: &certify.CertConfig{
198
				SubjectAlternativeNames:   cfg.Security.CertSpec.DNSNames,
199
				IPSubjectAlternativeNames: append(cfg.Security.CertSpec.IPAddresses, cfg.Server.AdvertiseIP),
200
			},
201
			IssueTimeout: 0,
202
			Logger:       zapadapter.New(logger.CoreLogger.Desugar()),
203
			Cache: cache.NewCertifyMutliCache(
204
				certify.NewMemCache(),
205
				certify.DirCache(filepath.Join(d.CacheDir(), cache.CertifyCacheDirName, types.SchedulerName))),
206
		}
207

208
		clientTransportCredentials, err = rpc.NewClientCredentialsByCertify(cfg.Security.TLSPolicy, []byte(cfg.Security.CACert), certifyClient)
209
		if err != nil {
210
			return nil, err
211
		}
212

213
		// Issue a certificate to reduce first time delay.
214
		if _, err := certifyClient.GetCertificate(&tls.ClientHelloInfo{
215
			ServerName: cfg.Server.AdvertiseIP.String(),
216
		}); err != nil {
217
			return nil, err
218
		}
219
	}
220

221
	// Initialize dynconfig client.
222
	dynconfig, err := config.NewDynconfig(s.managerClient, filepath.Join(d.CacheDir(), dynconfig.CacheDirName), cfg, config.WithTransportCredentials(clientTransportCredentials))
223
	if err != nil {
224
		return nil, err
225
	}
226
	s.dynconfig = dynconfig
227

228
	// Initialize GC.
229
	s.gc = gc.New(gc.WithLogger(logger.GCLogger))
230

231
	// Initialize resource.
232
	resource, err := resource.New(cfg, s.gc, dynconfig, resource.WithTransportCredentials(clientTransportCredentials))
233
	if err != nil {
234
		return nil, err
235
	}
236
	s.resource = resource
237

238
	// Initialize redis client.
239
	var rdb redis.UniversalClient
240
	if pkgredis.IsEnabled(cfg.Database.Redis.Addrs) {
241
		rdb, err = pkgredis.NewRedis(&redis.UniversalOptions{
242
			Addrs:      cfg.Database.Redis.Addrs,
243
			MasterName: cfg.Database.Redis.MasterName,
244
			DB:         cfg.Database.Redis.NetworkTopologyDB,
245
			Username:   cfg.Database.Redis.Username,
246
			Password:   cfg.Database.Redis.Password,
247
		})
248
		if err != nil {
249
			return nil, err
250
		}
251
	}
252

253
	// Initialize job service.
254
	if cfg.Job.Enable && pkgredis.IsEnabled(cfg.Database.Redis.Addrs) {
255
		s.job, err = job.New(cfg, resource)
256
		if err != nil {
257
			return nil, err
258
		}
259
	}
260

261
	// Initialize options of evaluator.
262
	evaluatorNetworkTopologyOptions := []evaluator.NetworkTopologyOption{}
263
	// Initialize network topology service.
264
	if cfg.Scheduler.Algorithm == evaluator.NetworkTopologyAlgorithm {
265
		cache := cache.New(cfg.Scheduler.NetworkTopology.Cache.TTL, cfg.Scheduler.NetworkTopology.Cache.Interval)
266
		s.networkTopology, err = networktopology.NewNetworkTopology(cfg.Scheduler.NetworkTopology, rdb, cache, resource, s.storage)
267
		if err != nil {
268
			return nil, err
269
		}
270

271
		evaluatorNetworkTopologyOptions = append(evaluatorNetworkTopologyOptions, evaluator.WithNetworkTopology(s.networkTopology))
272
	}
273

274
	// Initialize scheduling.
275
	scheduling := scheduling.New(&cfg.Scheduler, dynconfig, d.PluginDir(), evaluatorNetworkTopologyOptions...)
276

277
	// Initialize server options of scheduler grpc server.
278
	schedulerServerOptions := []grpc.ServerOption{}
279
	if certifyClient != nil {
280
		serverTransportCredentials, err := rpc.NewServerCredentialsByCertify(cfg.Security.TLSPolicy, cfg.Security.TLSVerify, []byte(cfg.Security.CACert), certifyClient)
281
		if err != nil {
282
			return nil, err
283
		}
284

285
		schedulerServerOptions = append(schedulerServerOptions, grpc.Creds(serverTransportCredentials))
286
	} else {
287
		schedulerServerOptions = append(schedulerServerOptions, grpc.Creds(insecure.NewCredentials()))
288
	}
289

290
	svr := rpcserver.New(cfg, resource, scheduling, dynconfig, s.storage, s.networkTopology, schedulerServerOptions...)
291
	s.grpcServer = svr
292

293
	// Initialize metrics.
294
	if cfg.Metrics.Enable {
295
		s.metricsServer = metrics.New(&cfg.Metrics, s.grpcServer)
296
	}
297

298
	return s, nil
299
}
300

301
// Serve starts the scheduler server.
302
func (s *Server) Serve() error {
303
	// Serve dynconfig.
304
	go func() {
305
		if err := s.dynconfig.Serve(); err != nil {
306
			logger.Fatalf("dynconfig start failed %s", err.Error())
307
		}
308

309
		logger.Info("dynconfig start successfully")
310
	}()
311

312
	// Serve GC.
313
	s.gc.Start()
314
	logger.Info("gc start successfully")
315

316
	// Serve Job.
317
	if s.job != nil {
318
		s.job.Serve()
319
		logger.Info("job start successfully")
320
	}
321

322
	// Started metrics server.
323
	if s.metricsServer != nil {
324
		go func() {
325
			logger.Infof("started metrics server at %s", s.metricsServer.Addr)
326
			if err := s.metricsServer.ListenAndServe(); err != nil {
327
				if err == http.ErrServerClosed {
328
					return
329
				}
330

331
				logger.Fatalf("metrics server closed unexpect: %s", err.Error())
332
			}
333
		}()
334
	}
335

336
	// Serve announcer.
337
	go func() {
338
		s.announcer.Serve()
339
		logger.Info("announcer start successfully")
340
	}()
341

342
	// Serve network topology.
343
	if s.networkTopology != nil {
344
		go func() {
345
			s.networkTopology.Serve()
346
			logger.Info("network topology start successfully")
347
		}()
348
	}
349

350
	// Generate GRPC limit listener.
351
	ip, ok := ip.FormatIP(s.config.Server.ListenIP.String())
352
	if !ok {
353
		return errors.New("format ip failed")
354
	}
355

356
	listener, err := net.Listen("tcp", fmt.Sprintf("%s:%d", ip, s.config.Server.Port))
357
	if err != nil {
358
		logger.Fatalf("net listener failed to start: %s", err.Error())
359
	}
360
	defer listener.Close()
361

362
	// Started GRPC server.
363
	logger.Infof("started grpc server at %s://%s", listener.Addr().Network(), listener.Addr().String())
364
	if err := s.grpcServer.Serve(listener); err != nil {
365
		logger.Errorf("stoped grpc server: %s", err.Error())
366
		return err
367
	}
368

369
	return nil
370
}
371

372
// Stop stops the scheduler server.
373
func (s *Server) Stop() {
374
	// Stop dynconfig.
375
	if err := s.dynconfig.Stop(); err != nil {
376
		logger.Errorf("stop dynconfig failed %s", err.Error())
377
	} else {
378
		logger.Info("stop dynconfig closed")
379
	}
380

381
	// Stop resource.
382
	if err := s.resource.Stop(); err != nil {
383
		logger.Errorf("stop resource failed %s", err.Error())
384
	} else {
385
		logger.Info("stop resource closed")
386
	}
387

388
	// Clean download storage.
389
	if err := s.storage.ClearDownload(); err != nil {
390
		logger.Errorf("clean download storage failed %s", err.Error())
391
	} else {
392
		logger.Info("clean download storage completed")
393
	}
394

395
	// Clean network topology storage.
396
	if err := s.storage.ClearNetworkTopology(); err != nil {
397
		logger.Errorf("clean network topology storage failed %s", err.Error())
398
	} else {
399
		logger.Info("clean network topology storage completed")
400
	}
401

402
	// Stop GC.
403
	s.gc.Stop()
404
	logger.Info("gc closed")
405

406
	// Stop metrics server.
407
	if s.metricsServer != nil {
408
		if err := s.metricsServer.Shutdown(context.Background()); err != nil {
409
			logger.Errorf("metrics server failed to stop: %s", err.Error())
410
		} else {
411
			logger.Info("metrics server closed under request")
412
		}
413
	}
414

415
	// Stop announcer.
416
	s.announcer.Stop()
417
	logger.Info("stop announcer closed")
418

419
	// Stop manager client.
420
	if s.managerClient != nil {
421
		if err := s.managerClient.Close(); err != nil {
422
			logger.Errorf("manager client failed to stop: %s", err.Error())
423
		} else {
424
			logger.Info("manager client closed")
425
		}
426
	}
427

428
	// Stop trainer client.
429
	if s.trainerClient != nil {
430
		if err := s.trainerClient.Close(); err != nil {
431
			logger.Errorf("trainer client failed to stop: %s", err.Error())
432
		} else {
433
			logger.Info("trainer client closed")
434
		}
435
	}
436

437
	// Stop security client.
438
	if s.securityClient != nil {
439
		if err := s.securityClient.Close(); err != nil {
440
			logger.Errorf("security client failed to stop: %s", err.Error())
441
		} else {
442
			logger.Info("security client closed")
443
		}
444
	}
445

446
	// Stop network topology.
447
	if s.networkTopology != nil {
448
		s.networkTopology.Stop()
449
		logger.Info("network topology closed")
450
	}
451

452
	// Stop GRPC server.
453
	stopped := make(chan struct{})
454
	go func() {
455
		s.grpcServer.GracefulStop()
456
		logger.Info("grpc server closed under request")
457
		close(stopped)
458
	}()
459

460
	t := time.NewTimer(gracefulStopTimeout)
461
	select {
462
	case <-t.C:
463
		s.grpcServer.Stop()
464
	case <-stopped:
465
		t.Stop()
466
	}
467
}
468

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.