// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.
// Based on work by Yann Collet, released under BSD License.
13
"github.com/klauspost/compress/huff0"
14
"github.com/klauspost/compress/zstd/internal/xxhash"
19
//go:generate stringer -type=blockType,literalsBlockType,seqCompMode,tableIndex
22
blockTypeRaw blockType = iota
28
type literalsBlockType uint8
31
literalsBlockRaw literalsBlockType = iota
33
literalsBlockCompressed
38
// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
39
maxCompressedBlockSize = 128 << 10
41
// Maximum possible block size (all Raw+Uncompressed).
42
maxBlockSize = (1 << 21) - 1
44
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
45
maxCompressedLiteralSize = 1 << 18
46
maxRLELiteralSize = 1 << 20
48
maxSequences = 0x7f00 + 0xffff
50
// We support slightly less than the reference decoder to be able to
51
// use ints on 32 bit archs.
56
huffDecoderPool = sync.Pool{New: func() interface{} {
57
return &huff0.Scratch{}
60
fseDecoderPool = sync.Pool{New: func() interface{} {
66
// Raw source data of the block.
70
// Destination of the decoded data.
73
// Buffer for literals data.
76
// Window size of the block.
81
// Check against this crc
84
// Frame to use for singlethreaded decoding.
85
// Should not be used by the decoder itself since parent may be another frame.
94
seqSize int // Size of uncompressed sequences
98
// Block is RLE, this is the size.
104
// Is this the last block of a frame?
111
func (b *blockDec) String() string {
115
return fmt.Sprintf("Steam Size: %d, Type: %v, Last: %t, Window: %d", len(b.data), b.Type, b.Last, b.WindowSize)
118
func newBlockDec(lowMem bool) *blockDec {
125
// reset will reset the block.
126
// Input must be a start of a block and will be at the end of the block when returned.
127
func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
128
b.WindowSize = windowSize
129
tmp, err := br.readSmall(3)
131
println("Reading block header:", err)
134
bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
136
b.Type = blockType((bh >> 1) & 3)
138
cSize := int(bh >> 3)
139
maxSize := maxBlockSize
141
case blockTypeReserved:
142
return ErrReservedBlockType
144
if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
146
printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
148
return ErrWindowSizeExceeded
150
b.RLESize = uint32(cSize)
155
case blockTypeCompressed:
157
println("Data size on stream:", cSize)
160
maxSize = maxCompressedBlockSize
161
if windowSize < maxCompressedBlockSize && b.lowMem {
162
maxSize = int(windowSize)
164
if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
166
printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
168
return ErrCompressedSizeTooBig
171
if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
173
printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
175
return ErrWindowSizeExceeded
179
// We do not need a destination for raw blocks.
182
panic("Invalid block type")
186
if cap(b.dataStorage) < cSize {
187
if b.lowMem || cSize > maxCompressedBlockSize {
188
b.dataStorage = make([]byte, 0, cSize)
190
b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
193
if cap(b.dst) <= maxSize {
194
b.dst = make([]byte, 0, maxSize+1)
196
b.data, err = br.readBig(cSize, b.dataStorage)
199
println("Reading block:", err, "(", cSize, ")", len(b.data))
207
// sendEOF will make the decoder send EOF on this frame.
208
func (b *blockDec) sendErr(err error) {
210
b.Type = blockTypeReserved
214
// Close will release resources.
215
// Closed blockDec cannot be reset.
216
func (b *blockDec) Close() {
220
func (b *blockDec) decodeBuf(hist *history) error {
223
if cap(b.dst) < int(b.RLESize) {
225
b.dst = make([]byte, b.RLESize)
227
b.dst = make([]byte, maxBlockSize)
230
b.dst = b.dst[:b.RLESize]
232
for i := range b.dst {
235
hist.appendKeep(b.dst)
238
hist.appendKeep(b.data)
240
case blockTypeCompressed:
242
// Append directly to history
243
if hist.ignoreBuffer == 0 {
249
err := b.decodeCompressed(hist)
251
println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
253
if hist.ignoreBuffer == 0 {
257
hist.appendKeep(b.dst)
260
case blockTypeReserved:
261
// Used for returning errors.
264
panic("Invalid block type")
268
func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) {
269
// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
271
return in, ErrBlockTooSmall
274
litType := literalsBlockType(in[0] & 3)
277
sizeFormat := (in[0] >> 2) & 3
281
case literalsBlockRaw, literalsBlockRLE:
284
// Regenerated_Size uses 5 bits (0-31). Literals_Section_Header uses 1 byte.
285
litRegenSize = int(in[0] >> 3)
288
// Regenerated_Size uses 12 bits (0-4095). Literals_Section_Header uses 2 bytes.
289
litRegenSize = int(in[0]>>4) + (int(in[1]) << 4)
292
// Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
294
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
295
return in, ErrBlockTooSmall
297
litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
300
case literalsBlockCompressed, literalsBlockTreeless:
303
// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
305
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
306
return in, ErrBlockTooSmall
308
n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
309
litRegenSize = int(n & 1023)
310
litCompSize = int(n >> 10)
311
fourStreams = sizeFormat == 1
316
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
317
return in, ErrBlockTooSmall
319
n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
320
litRegenSize = int(n & 16383)
321
litCompSize = int(n >> 14)
326
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
327
return in, ErrBlockTooSmall
329
n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
330
litRegenSize = int(n & 262143)
331
litCompSize = int(n >> 18)
336
println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
338
if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize {
339
return in, ErrWindowSizeExceeded
343
case literalsBlockRaw:
344
if len(in) < litRegenSize {
345
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
346
return in, ErrBlockTooSmall
348
literals = in[:litRegenSize]
349
in = in[litRegenSize:]
350
//printf("Found %d uncompressed literals\n", litRegenSize)
351
case literalsBlockRLE:
353
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
354
return in, ErrBlockTooSmall
356
if cap(b.literalBuf) < litRegenSize {
358
b.literalBuf = make([]byte, litRegenSize)
360
if litRegenSize > maxCompressedLiteralSize {
362
b.literalBuf = make([]byte, litRegenSize)
364
b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
368
literals = b.literalBuf[:litRegenSize]
370
for i := range literals {
375
printf("Found %d RLE compressed literals\n", litRegenSize)
377
case literalsBlockTreeless:
378
if len(in) < litCompSize {
379
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
380
return in, ErrBlockTooSmall
382
// Store compressed literals, so we defer decoding until we get history.
383
literals = in[:litCompSize]
384
in = in[litCompSize:]
386
printf("Found %d compressed literals\n", litCompSize)
388
huff := hist.huffTree
390
return in, errors.New("literal block was treeless, but no history was defined")
392
// Ensure we have space to store it.
393
if cap(b.literalBuf) < litRegenSize {
395
b.literalBuf = make([]byte, 0, litRegenSize)
397
b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
401
// Use our out buffer.
402
huff.MaxDecodedSize = maxCompressedBlockSize
404
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
406
literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
408
// Make sure we don't leak our literals buffer
410
println("decompressing literals:", err)
413
if len(literals) != litRegenSize {
414
return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
417
case literalsBlockCompressed:
418
if len(in) < litCompSize {
419
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
420
return in, ErrBlockTooSmall
422
literals = in[:litCompSize]
423
in = in[litCompSize:]
424
// Ensure we have space to store it.
425
if cap(b.literalBuf) < litRegenSize {
427
b.literalBuf = make([]byte, 0, litRegenSize)
429
b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
432
huff := hist.huffTree
433
if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) {
434
huff = huffDecoderPool.Get().(*huff0.Scratch)
436
huff = &huff0.Scratch{}
440
huff, literals, err = huff0.ReadTable(literals, huff)
442
println("reading huffman table:", err)
446
huff.MaxDecodedSize = maxCompressedBlockSize
447
// Use our out buffer.
449
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
451
literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
454
println("decoding compressed literals:", err)
457
// Make sure we don't leak our literals buffer
458
if len(literals) != litRegenSize {
459
return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
462
printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
465
hist.decoders.literals = literals
469
// decodeCompressed will start decompressing a block.
470
func (b *blockDec) decodeCompressed(hist *history) error {
472
in, err := b.decodeLiterals(in, hist)
476
err = b.prepareSequences(in, hist)
480
if hist.decoders.nSeqs == 0 {
481
b.dst = append(b.dst, hist.decoders.literals...)
484
err = hist.decoders.decodeSync(hist)
488
b.dst = hist.decoders.out
489
hist.recentOffsets = hist.decoders.prevOffset
493
func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
495
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
497
return ErrBlockTooSmall
504
case seqHeader < 128:
505
nSeqs = int(seqHeader)
507
case seqHeader < 255:
509
return ErrBlockTooSmall
511
nSeqs = int(seqHeader-128)<<8 | int(in[1])
513
case seqHeader == 255:
515
return ErrBlockTooSmall
517
nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
521
var seqs = &hist.decoders
525
return ErrBlockTooSmall
527
br := byteReader{b: in, off: 0}
528
compMode := br.Uint8()
531
printf("Compression modes: 0b%b", compMode)
533
for i := uint(0); i < 3; i++ {
534
mode := seqCompMode((compMode >> (6 - i*2)) & 3)
536
println("Table", tableIndex(i), "is", mode)
539
switch tableIndex(i) {
540
case tableLiteralLengths:
541
seq = &seqs.litLengths
544
case tableMatchLengths:
545
seq = &seqs.matchLengths
547
panic("unknown table")
550
case compModePredefined:
551
if seq.fse != nil && !seq.fse.preDefined {
552
fseDecoderPool.Put(seq.fse)
554
seq.fse = &fsePredef[i]
557
return ErrBlockTooSmall
561
if seq.fse == nil || seq.fse.preDefined {
562
seq.fse = fseDecoderPool.Get().(*fseDecoder)
564
symb, err := decSymbolValue(v, symbolTableX[i])
566
printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
571
printf("RLE set to %+v, code: %v", symb, v)
574
println("Reading table for", tableIndex(i))
575
if seq.fse == nil || seq.fse.preDefined {
576
seq.fse = fseDecoderPool.Get().(*fseDecoder)
578
err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i]))
580
println("Read table error:", err)
583
err = seq.fse.transform(symbolTableX[i])
585
println("Transform table error:", err)
589
println("Read table ok", "symbolLen:", seq.fse.symbolLen)
595
return io.ErrUnexpectedEOF
601
println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.")
605
if len(b.sequence) > 0 {
606
b.sequence = b.sequence[:0]
614
if err := br.init(in); err != nil {
618
if err := seqs.initialize(br, hist, b.dst); err != nil {
619
println("initializing sequences:", err)
625
func (b *blockDec) decodeSequences(hist *history) error {
626
if cap(b.sequence) < hist.decoders.nSeqs {
628
b.sequence = make([]seqVals, 0, hist.decoders.nSeqs)
630
b.sequence = make([]seqVals, 0, 0x7F00+0xffff)
633
b.sequence = b.sequence[:hist.decoders.nSeqs]
634
if hist.decoders.nSeqs == 0 {
635
hist.decoders.seqSize = len(hist.decoders.literals)
638
hist.decoders.prevOffset = hist.recentOffsets
639
err := hist.decoders.decode(b.sequence)
640
hist.recentOffsets = hist.decoders.prevOffset
644
func (b *blockDec) executeSequences(hist *history) error {
646
if len(hbytes) > hist.windowSize {
647
hbytes = hbytes[len(hbytes)-hist.windowSize:]
648
// We do not need history anymore.
649
if hist.dict != nil {
650
hist.dict.content = nil
653
hist.decoders.windowSize = hist.windowSize
654
hist.decoders.out = b.dst[:0]
655
err := hist.decoders.execute(b.sequence, hbytes)
659
return b.updateHistory(hist)
662
func (b *blockDec) updateHistory(hist *history) error {
663
if len(b.data) > maxCompressedBlockSize {
664
return fmt.Errorf("compressed block size too large (%d)", len(b.data))
666
// Set output and release references.
667
b.dst = hist.decoders.out
668
hist.recentOffsets = hist.decoders.prevOffset
671
// if last block we don't care about history.
672
println("Last block, no history returned")
678
println("Finished block with ", len(b.sequence), "sequences. Added", len(b.dst), "to history, now length", len(hist.b))
681
hist.decoders.out, hist.decoders.literals = nil, nil