// cubefs
// 85 lines · 1.8 KB
package compress

import "math"
// Estimate returns a normalized compressibility estimate of block b.
// Values close to zero are likely uncompressible.
// Values above 0.1 are likely to be compressible.
// Values above 0.5 are very compressible.
// Very small lengths will return 0.
func Estimate(b []byte) float64 {
	if len(b) < 16 {
		return 0
	}

	// Correctly predicted order 1: count how often the byte following c1
	// repeats the byte that followed c1 last time.
	hits := 0
	lastMatch := false
	var o1 [256]byte
	var hist [256]int
	c1 := byte(0)
	for _, c := range b {
		if c == o1[c1] {
			// We only count a hit if there was two correct predictions in a row.
			if lastMatch {
				hits++
			}
			lastMatch = true
		} else {
			lastMatch = false
		}
		o1[c1] = c
		c1 = c
		hist[c]++
	}

	n := float64(len(b))

	// Use x^0.6 to give better spread
	prediction := math.Pow(float64(hits)/n, 0.6)

	// Calculate histogram distribution variance around a uniform distribution.
	variance := float64(0)
	avg := n / 256

	for _, v := range hist {
		delta := float64(v) - avg
		variance += delta * delta
	}

	stddev := math.Sqrt(variance) / n
	exp := math.Sqrt(1 / n)

	// Subtract expected stddev of a uniformly distributed input.
	stddev -= exp
	if stddev < 0 {
		stddev = 0
	}
	stddev *= 1 + exp

	// Use x^0.4 to give better spread
	entropy := math.Pow(stddev, 0.4)

	// 50/50 weight between prediction and histogram distribution
	return math.Pow((prediction+entropy)/2, 0.9)
}
// ShannonEntropyBits returns the number of bits minimum required to represent
// an entropy encoding of the input bytes.
// https://en.wiktionary.org/wiki/Shannon_entropy
func ShannonEntropyBits(b []byte) int {
	if len(b) == 0 {
		return 0
	}
	// Tally occurrences of every byte value.
	var counts [256]int
	for _, c := range b {
		counts[c]++
	}
	total := 0.0
	scale := 1.0 / float64(len(b))
	for _, cnt := range counts {
		if cnt == 0 {
			continue
		}
		occ := float64(cnt)
		// Bits contributed by this symbol value, rounded up per value.
		total += math.Ceil(-math.Log2(occ*scale) * occ)
	}
	return int(math.Ceil(total))
}