talos

Форк
0
576 строк · 15.2 Кб
1
// This Source Code Form is subject to the terms of the Mozilla Public
2
// License, v. 2.0. If a copy of the MPL was not distributed with this
3
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4

5
package qemu
6

7
import (
8
	"context"
9
	"errors"
10
	"fmt"
11
	"log"
12
	"net/netip"
13
	"os"
14
	"os/exec"
15
	"path/filepath"
16
	"strconv"
17
	"strings"
18

19
	"github.com/alexflint/go-filemutex"
20
	"github.com/containernetworking/cni/libcni"
21
	"github.com/containernetworking/cni/pkg/types"
22
	types100 "github.com/containernetworking/cni/pkg/types/100"
23
	"github.com/containernetworking/plugins/pkg/ns"
24
	"github.com/containernetworking/plugins/pkg/testutils"
25
	"github.com/containernetworking/plugins/pkg/utils"
26
	"github.com/coreos/go-iptables/iptables"
27
	"github.com/google/uuid"
28
	"github.com/siderolabs/gen/xslices"
29
	"github.com/siderolabs/go-blockdevice/blockdevice/partition/gpt"
30
	sideronet "github.com/siderolabs/net"
31

32
	"github.com/siderolabs/talos/pkg/provision"
33
	"github.com/siderolabs/talos/pkg/provision/internal/cniutils"
34
	"github.com/siderolabs/talos/pkg/provision/providers/vm"
35
)
36

37
// LaunchConfig is passed in to the Launch function over stdin.
38
type LaunchConfig struct {
39
	StatePath string
40

41
	// VM options
42
	DiskPaths         []string
43
	DiskDrivers       []string
44
	VCPUCount         int64
45
	MemSize           int64
46
	QemuExecutable    string
47
	KernelImagePath   string
48
	InitrdPath        string
49
	ISOPath           string
50
	PFlashImages      []string
51
	KernelArgs        string
52
	MachineType       string
53
	MonitorPath       string
54
	DefaultBootOrder  string
55
	EnableKVM         bool
56
	BootloaderEnabled bool
57
	TPM2Config        tpm2Config
58
	NodeUUID          uuid.UUID
59
	BadRTC            bool
60

61
	// Talos config
62
	Config string
63

64
	// Network
65
	BridgeName        string
66
	NetworkConfig     *libcni.NetworkConfigList
67
	CNI               provision.CNIConfig
68
	IPs               []netip.Addr
69
	CIDRs             []netip.Prefix
70
	NoMasqueradeCIDRs []netip.Prefix
71
	Hostname          string
72
	GatewayAddrs      []netip.Addr
73
	MTU               int
74
	Nameservers       []netip.Addr
75

76
	// PXE
77
	TFTPServer       string
78
	BootFilename     string
79
	IPXEBootFileName string
80

81
	// API
82
	APIPort int
83

84
	// filled by CNI invocation
85
	tapName string
86
	vmMAC   string
87
	ns      ns.NetNS
88

89
	// signals
90
	c chan os.Signal
91

92
	// controller
93
	controller *Controller
94
}
95

96
// tpm2Config holds the settings of the software TPM2 (swtpm) attached to the VM.
type tpm2Config struct {
	// NodeName enables the TPM2 emulation when non-empty.
	NodeName string
	// StateDir is the directory holding swtpm state, control socket,
	// pid file and log file.
	StateDir string
}
100

101
// withCNIOperationLocked ensures that CNI operations don't run concurrently.
102
//
103
// There are race conditions in the CNI plugins that can cause a failure if called concurrently.
104
func withCNIOperationLocked[T any](config *LaunchConfig, f func() (T, error)) (T, error) {
105
	var zeroT T
106

107
	lock, err := filemutex.New(filepath.Join(config.StatePath, "cni.lock"))
108
	if err != nil {
109
		return zeroT, fmt.Errorf("failed to create CNI lock: %w", err)
110
	}
111

112
	if err = lock.Lock(); err != nil {
113
		return zeroT, fmt.Errorf("failed to acquire CNI lock: %w", err)
114
	}
115

116
	defer func() {
117
		if err := lock.Close(); err != nil {
118
			log.Printf("failed to release CNI lock: %s", err)
119
		}
120
	}()
121

122
	return f()
123
}
124

125
// withCNIOperationLockedNoResult ensures that CNI operations don't run concurrently.
126
func withCNIOperationLockedNoResult(config *LaunchConfig, f func() error) error {
127
	_, err := withCNIOperationLocked(config, func() (struct{}, error) {
128
		return struct{}{}, f()
129
	})
130

131
	return err
132
}
133

134
// withCNI creates network namespace, launches CNI and passes control to the next function
135
// filling config with netNS and interface details.
136
//
137
//nolint:gocyclo
138
func withCNI(ctx context.Context, config *LaunchConfig, f func(config *LaunchConfig) error) error {
139
	// random ID for the CNI, maps to single VM
140
	containerID := uuid.New().String()
141

142
	cniConfig := libcni.NewCNIConfigWithCacheDir(config.CNI.BinPath, config.CNI.CacheDir, nil)
143

144
	// create a network namespace
145
	ns, err := testutils.NewNS()
146
	if err != nil {
147
		return err
148
	}
149

150
	defer func() {
151
		ns.Close()              //nolint:errcheck
152
		testutils.UnmountNS(ns) //nolint:errcheck
153
	}()
154

155
	ips := make([]string, len(config.IPs))
156
	for j := range ips {
157
		ips[j] = sideronet.FormatCIDR(config.IPs[j], config.CIDRs[j])
158
	}
159

160
	gatewayAddrs := xslices.Map(config.GatewayAddrs, netip.Addr.String)
161

162
	runtimeConf := libcni.RuntimeConf{
163
		ContainerID: containerID,
164
		NetNS:       ns.Path(),
165
		IfName:      "veth0",
166
		Args: [][2]string{
167
			{"IP", strings.Join(ips, ",")},
168
			{"GATEWAY", strings.Join(gatewayAddrs, ",")},
169
			{"IgnoreUnknown", "1"},
170
		},
171
	}
172

173
	// attempt to clean up network in case it was deployed previously
174
	err = withCNIOperationLockedNoResult(
175
		config,
176
		func() error {
177
			return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
178
		},
179
	)
180
	if err != nil {
181
		return fmt.Errorf("error deleting CNI network: %w", err)
182
	}
183

184
	res, err := withCNIOperationLocked(
185
		config,
186
		func() (types.Result, error) {
187
			return cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
188
		},
189
	)
190
	if err != nil {
191
		return fmt.Errorf("error provisioning CNI network: %w", err)
192
	}
193

194
	defer func() {
195
		if e := withCNIOperationLockedNoResult(
196
			config,
197
			func() error {
198
				return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
199
			},
200
		); e != nil {
201
			log.Printf("error cleaning up CNI: %s", e)
202
		}
203
	}()
204

205
	currentResult, err := types100.NewResultFromResult(res)
206
	if err != nil {
207
		return fmt.Errorf("failed to parse cni result: %w", err)
208
	}
209

210
	vmIface, tapIface, err := cniutils.VMTapPair(currentResult, containerID)
211
	if err != nil {
212
		return errors.New(
213
			"failed to parse VM network configuration from CNI output, ensure CNI is configured with a plugin " +
214
				"that supports automatic VM network configuration such as tc-redirect-tap")
215
	}
216

217
	cniChain := utils.FormatChainName(config.NetworkConfig.Name, containerID)
218

219
	ipt, err := iptables.New()
220
	if err != nil {
221
		return fmt.Errorf("failed to initialize iptables: %w", err)
222
	}
223

224
	// don't masquerade traffic with "broadcast" destination from the VM
225
	//
226
	// no need to clean up the rule, as CNI drops the whole chain
227
	if err = ipt.InsertUnique("nat", cniChain, 1, "--destination", "255.255.255.255/32", "-j", "ACCEPT"); err != nil {
228
		return fmt.Errorf("failed to insert iptables rule to allow broadcast traffic: %w", err)
229
	}
230

231
	for _, cidr := range config.NoMasqueradeCIDRs {
232
		if err = ipt.InsertUnique("nat", cniChain, 1, "--destination", cidr.String(), "-j", "ACCEPT"); err != nil {
233
			return fmt.Errorf("failed to insert iptables rule to allow non-masquerade traffic to cidr %q: %w", cidr.String(), err)
234
		}
235
	}
236

237
	config.tapName = tapIface.Name
238
	config.vmMAC = vmIface.Mac
239
	config.ns = ns
240

241
	for j := range config.CIDRs {
242
		nameservers := make([]netip.Addr, 0, len(config.Nameservers))
243

244
		// filter nameservers by IPv4/IPv6 matching IPs
245
		for i := range config.Nameservers {
246
			if config.IPs[j].Is6() {
247
				if config.Nameservers[i].Is6() {
248
					nameservers = append(nameservers, config.Nameservers[i])
249
				}
250
			} else {
251
				if config.Nameservers[i].Is4() {
252
					nameservers = append(nameservers, config.Nameservers[i])
253
				}
254
			}
255
		}
256

257
		// dump node IP/mac/hostname for dhcp
258
		if err = vm.DumpIPAMRecord(config.StatePath, vm.IPAMRecord{
259
			IP:               config.IPs[j],
260
			Netmask:          byte(config.CIDRs[j].Bits()),
261
			MAC:              vmIface.Mac,
262
			Hostname:         config.Hostname,
263
			Gateway:          config.GatewayAddrs[j],
264
			MTU:              config.MTU,
265
			Nameservers:      nameservers,
266
			TFTPServer:       config.TFTPServer,
267
			IPXEBootFilename: config.IPXEBootFileName,
268
		}); err != nil {
269
			return err
270
		}
271
	}
272

273
	return f(config)
274
}
275

276
// checkPartitions reports whether the first disk of the VM has a GPT with
// at least one partition.
//
// It returns false without an error when no disks are configured or when the
// disk has no partition table (empty/wiped disk). Any other failure to open
// or read the disk is returned as an error.
func checkPartitions(config *LaunchConfig) (bool, error) {
	// no disks means nothing to boot from, and nothing to index below
	if len(config.DiskPaths) == 0 {
		return false, nil
	}

	disk, err := os.Open(config.DiskPaths[0])
	if err != nil {
		return false, fmt.Errorf("failed to open disk file: %w", err)
	}

	defer disk.Close() //nolint:errcheck

	diskTable, err := gpt.Open(disk)
	if err != nil {
		if errors.Is(err, gpt.ErrPartitionTableDoesNotExist) {
			return false, nil
		}

		return false, fmt.Errorf("error creating GPT object: %w", err)
	}

	if err = diskTable.Read(); err != nil {
		return false, fmt.Errorf("error reading GPT: %w", err)
	}

	return len(diskTable.Partitions().Items()) > 0, nil
}
299

300
// launchVM runs qemu with args built based on config.
301
//
302
//nolint:gocyclo,cyclop
303
func launchVM(config *LaunchConfig) error {
304
	bootOrder := config.DefaultBootOrder
305

306
	if config.controller.ForcePXEBoot() {
307
		bootOrder = "nc"
308
	}
309

310
	cpuArg := "max"
311

312
	if config.BadRTC {
313
		cpuArg += ",-kvmclock"
314
	}
315

316
	args := []string{
317
		"-m", strconv.FormatInt(config.MemSize, 10),
318
		"-smp", fmt.Sprintf("cpus=%d", config.VCPUCount),
319
		"-cpu", cpuArg,
320
		"-nographic",
321
		"-netdev", fmt.Sprintf("tap,id=net0,ifname=%s,script=no,downscript=no", config.tapName),
322
		"-device", fmt.Sprintf("virtio-net-pci,netdev=net0,mac=%s", config.vmMAC),
323
		// TODO: uncomment the following line to get another eth interface not connected to anything
324
		// "-nic", "tap,model=virtio-net-pci",
325
		"-device", "virtio-rng-pci",
326
		"-device", "virtio-balloon,deflate-on-oom=on",
327
		"-monitor", fmt.Sprintf("unix:%s,server,nowait", config.MonitorPath),
328
		"-no-reboot",
329
		"-boot", fmt.Sprintf("order=%s,reboot-timeout=5000", bootOrder),
330
		"-smbios", fmt.Sprintf("type=1,uuid=%s", config.NodeUUID),
331
		"-chardev", fmt.Sprintf("socket,path=%s/%s.sock,server=on,wait=off,id=qga0", config.StatePath, config.Hostname),
332
		"-device", "virtio-serial",
333
		"-device", "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0",
334
		"-device", "i6300esb,id=watchdog0",
335
		"-watchdog-action",
336
		"pause",
337
	}
338

339
	var (
340
		scsiAttached, ahciAttached, nvmeAttached bool
341
		ahciBus                                  int
342
	)
343

344
	for i, disk := range config.DiskPaths {
345
		driver := config.DiskDrivers[i]
346

347
		switch driver {
348
		case "virtio":
349
			args = append(args, "-drive", fmt.Sprintf("format=raw,if=virtio,file=%s,cache=none,", disk))
350
		case "ide":
351
			args = append(args, "-drive", fmt.Sprintf("format=raw,if=ide,file=%s,cache=none,", disk))
352
		case "ahci":
353
			if !ahciAttached {
354
				args = append(args, "-device", "ahci,id=ahci0")
355
				ahciAttached = true
356
			}
357

358
			args = append(args,
359
				"-drive", fmt.Sprintf("id=ide%d,format=raw,if=none,file=%s", i, disk),
360
				"-device", fmt.Sprintf("ide-hd,drive=ide%d,bus=ahci0.%d", i, ahciBus),
361
			)
362

363
			ahciBus++
364
		case "scsi":
365
			if !scsiAttached {
366
				args = append(args, "-device", "virtio-scsi-pci,id=scsi0")
367
				scsiAttached = true
368
			}
369

370
			args = append(args,
371
				"-drive", fmt.Sprintf("id=scsi%d,format=raw,if=none,file=%s,discard=unmap,aio=native,cache=none", i, disk),
372
				"-device", fmt.Sprintf("scsi-hd,drive=scsi%d,bus=scsi0.0", i),
373
			)
374
		case "nvme":
375
			if !nvmeAttached {
376
				// [TODO]: once Talos is fixed, use multipath NVME: https://qemu-project.gitlab.io/qemu/system/devices/nvme.html
377
				args = append(args,
378
					"-device", "nvme,id=nvme-ctrl-0,serial=deadbeef",
379
				)
380
				nvmeAttached = true
381
			}
382

383
			args = append(args,
384
				"-drive", fmt.Sprintf("id=nvme%d,format=raw,if=none,file=%s,discard=unmap,aio=native,cache=none", i, disk),
385
				"-device", fmt.Sprintf("nvme-ns,drive=nvme%d", i),
386
			)
387
		default:
388
			return fmt.Errorf("unsupported disk driver %q", driver)
389
		}
390
	}
391

392
	machineArg := config.MachineType
393

394
	if config.EnableKVM {
395
		machineArg += ",accel=kvm,smm=on"
396
	}
397

398
	args = append(args, "-machine", machineArg)
399

400
	pflashArgs := make([]string, 2*len(config.PFlashImages))
401
	for i := range config.PFlashImages {
402
		pflashArgs[2*i] = "-drive"
403
		pflashArgs[2*i+1] = fmt.Sprintf("file=%s,format=raw,if=pflash", config.PFlashImages[i])
404
	}
405

406
	args = append(args, pflashArgs...)
407

408
	// check if disk is empty/wiped
409
	diskBootable, err := checkPartitions(config)
410
	if err != nil {
411
		return err
412
	}
413

414
	if config.TPM2Config.NodeName != "" {
415
		tpm2SocketPath := filepath.Join(config.TPM2Config.StateDir, "swtpm.sock")
416

417
		cmd := exec.Command("swtpm", []string{
418
			"socket",
419
			"--tpmstate",
420
			fmt.Sprintf("dir=%s,mode=0644", config.TPM2Config.StateDir),
421
			"--ctrl",
422
			fmt.Sprintf("type=unixio,path=%s", tpm2SocketPath),
423
			"--tpm2",
424
			"--pid",
425
			fmt.Sprintf("file=%s", filepath.Join(config.TPM2Config.StateDir, "swtpm.pid")),
426
			"--log",
427
			fmt.Sprintf("file=%s,level=20", filepath.Join(config.TPM2Config.StateDir, "swtpm.log")),
428
		}...)
429

430
		log.Printf("starting swtpm: %s", cmd.String())
431

432
		if err := cmd.Start(); err != nil {
433
			return err
434
		}
435

436
		args = append(args,
437
			"-chardev",
438
			fmt.Sprintf("socket,id=chrtpm,path=%s", tpm2SocketPath),
439
			"-tpmdev",
440
			"emulator,id=tpm0,chardev=chrtpm",
441
			"-device",
442
			"tpm-tis,tpmdev=tpm0",
443
		)
444
	}
445

446
	if !diskBootable || !config.BootloaderEnabled {
447
		if config.ISOPath != "" {
448
			args = append(args,
449
				"-cdrom", config.ISOPath,
450
			)
451
		} else if config.KernelImagePath != "" {
452
			args = append(args,
453
				"-kernel", config.KernelImagePath,
454
				"-initrd", config.InitrdPath,
455
				"-append", config.KernelArgs,
456
			)
457
		}
458
	}
459

460
	if config.BadRTC {
461
		args = append(args,
462
			"-rtc",
463
			"base=2011-11-11T11:11:00,clock=rt",
464
		)
465
	}
466

467
	fmt.Fprintf(os.Stderr, "starting %s with args:\n%s\n", config.QemuExecutable, strings.Join(args, " "))
468
	cmd := exec.Command(
469
		config.QemuExecutable,
470
		args...,
471
	)
472

473
	cmd.Stdout = os.Stdout
474
	cmd.Stderr = os.Stderr
475

476
	if err := ns.WithNetNSPath(config.ns.Path(), func(_ ns.NetNS) error {
477
		return cmd.Start()
478
	}); err != nil {
479
		return err
480
	}
481

482
	done := make(chan error)
483

484
	go func() {
485
		done <- cmd.Wait()
486
	}()
487

488
	for {
489
		select {
490
		case sig := <-config.c:
491
			fmt.Fprintf(os.Stderr, "exiting VM as signal %s was received\n", sig)
492

493
			if err := cmd.Process.Kill(); err != nil {
494
				return fmt.Errorf("failed to kill process %w", err)
495
			}
496

497
			<-done
498

499
			return errors.New("process stopped")
500
		case err := <-done:
501
			if err != nil {
502
				return fmt.Errorf("process exited with error %s", err)
503
			}
504

505
			// graceful exit
506
			return nil
507
		case command := <-config.controller.CommandsCh():
508
			if command == VMCommandStop {
509
				fmt.Fprintf(os.Stderr, "exiting VM as stop command via API was received\n")
510

511
				if err := cmd.Process.Kill(); err != nil {
512
					return fmt.Errorf("failed to kill process %w", err)
513
				}
514

515
				<-done
516

517
				return nil
518
			}
519
		}
520
	}
521
}
522

523
// Launch a control process around qemu VM manager.
524
//
525
// This function is invoked from 'talosctl qemu-launch' hidden command
526
// and wraps starting, controlling 'qemu' VM process.
527
//
528
// Launch restarts VM forever until control process is stopped itself with a signal.
529
//
530
// Process is expected to receive configuration on stdin. Current working directory
531
// should be cluster state directory, process output should be redirected to the
532
// logfile in state directory.
533
//
534
// When signals SIGINT, SIGTERM are received, control process stops qemu and exits.
535
func Launch() error {
536
	var config LaunchConfig
537

538
	ctx := context.Background()
539

540
	if err := vm.ReadConfig(&config); err != nil {
541
		return err
542
	}
543

544
	config.c = vm.ConfigureSignals()
545
	config.controller = NewController()
546

547
	httpServer, err := vm.NewHTTPServer(config.GatewayAddrs[0], config.APIPort, []byte(config.Config), config.controller)
548
	if err != nil {
549
		return err
550
	}
551

552
	httpServer.Serve()
553
	defer httpServer.Shutdown(ctx) //nolint:errcheck
554

555
	// patch kernel args
556
	config.KernelArgs = strings.ReplaceAll(config.KernelArgs, "{TALOS_CONFIG_URL}", fmt.Sprintf("http://%s/config.yaml", httpServer.GetAddr()))
557

558
	return withCNI(ctx, &config, func(config *LaunchConfig) error {
559
		for {
560
			for config.controller.PowerState() != PoweredOn {
561
				select {
562
				case <-config.controller.CommandsCh():
563
					// machine might have been powered on
564
				case sig := <-config.c:
565
					fmt.Fprintf(os.Stderr, "exiting stopped launcher as signal %s was received\n", sig)
566

567
					return errors.New("process stopped")
568
				}
569
			}
570

571
			if err := launchVM(config); err != nil {
572
				return err
573
			}
574
		}
575
	})
576
}
577

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.