cubefs
591 строка · 13.1 Кб
1//go:build !noasm && !appengine && !gccgo
2// +build !noasm,!appengine,!gccgo
3
4// Copyright 2015, Klaus Post, see LICENSE for details.
5
6package reedsolomon
7
// galMulSSSE3 is declared without a Go body. According to the reference
// implementation kept in the comment further down in this file, it sets
// out[n] = low[in[n]&0xf] ^ high[in[n]>>4], working in blocks of 16 bytes.
// Callers in this file assume it covers len(in) rounded down to a multiple
// of 16 and finish the remaining tail themselves.
//
//go:noescape
func galMulSSSE3(low, high, in, out []byte)
10
// galMulSSSE3Xor is declared without a Go body. According to the reference
// implementation kept in the comment further down in this file, it computes
// out[n] ^= low[in[n]&0xf] ^ high[in[n]>>4], working in blocks of 16 bytes.
// Callers in this file assume it covers len(in) rounded down to a multiple
// of 16 and finish the remaining tail themselves.
//
//go:noescape
func galMulSSSE3Xor(low, high, in, out []byte)
13
// galMulAVX2Xor is declared without a Go body. Presumably the AVX2
// counterpart of galMulSSSE3Xor (same low/high table arguments) — confirm
// against the assembly. galMulSliceXor assumes it covers len(in) rounded down
// to a multiple of 32 bytes.
//
//go:noescape
func galMulAVX2Xor(low, high, in, out []byte)
16
// galMulAVX2 is declared without a Go body. Presumably the AVX2 counterpart
// of galMulSSSE3 (same low/high table arguments) — confirm against the
// assembly. galMulSlice assumes it covers len(in) rounded down to a multiple
// of 32 bytes.
//
//go:noescape
func galMulAVX2(low, high, in, out []byte)
19
// sSE2XorSlice is declared without a Go body. sliceXor uses it for inputs of
// at least 16 bytes and assumes it covers len(in) rounded down to a multiple
// of 16; the scalar tail there does out[i] ^= in[i], so this routine
// presumably does the same per block — confirm against the assembly.
//
//go:noescape
func sSE2XorSlice(in, out []byte)
22
// galMulAVX2Xor_64 is declared without a Go body. Callers in this file
// assume it covers len(in) rounded down to a multiple of 64 bytes; it is
// presumably the 64-bytes-per-loop form of galMulAVX2Xor — confirm against
// the assembly.
//
//go:noescape
func galMulAVX2Xor_64(low, high, in, out []byte)
25
// galMulAVX2_64 is declared without a Go body. Callers in this file assume
// it covers len(in) rounded down to a multiple of 64 bytes; it is presumably
// the 64-bytes-per-loop form of galMulAVX2 — confirm against the assembly.
//
//go:noescape
func galMulAVX2_64(low, high, in, out []byte)
28
// sSE2XorSlice_64 is declared without a Go body. sliceXor calls it for
// inputs of at least bigSwitchover bytes when AVX2 is not enabled and
// assumes it covers len(in) rounded down to a multiple of 64 bytes.
//
//go:noescape
func sSE2XorSlice_64(in, out []byte)
31
// avx2XorSlice_64 is declared without a Go body. sliceXor calls it for
// inputs of at least bigSwitchover bytes when AVX2 is enabled and assumes it
// covers len(in) rounded down to a multiple of 64 bytes.
//
//go:noescape
func avx2XorSlice_64(in, out []byte)
34
35// This is what the assembler routines do in blocks of 16 bytes:
36/*
37func galMulSSSE3(low, high, in, out []byte) {
38for n, input := range in {
39l := input & 0xf
40h := input >> 4
41out[n] = low[l] ^ high[h]
42}
43}
44
45func galMulSSSE3Xor(low, high, in, out []byte) {
46for n, input := range in {
47l := input & 0xf
48h := input >> 4
49out[n] ^= low[l] ^ high[h]
50}
51}
52*/
53
// bigSwitchover is the minimum input length, in bytes, at which the callers
// in this file use the assembler routines that process 64 bytes per loop
// (the *_64 variants).
const bigSwitchover = 128
56
57func galMulSlice(c byte, in, out []byte, o *options) {
58if c == 1 {
59copy(out, in)
60return
61}
62if o.useAVX2 {
63if len(in) >= bigSwitchover {
64galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
65done := (len(in) >> 6) << 6
66in = in[done:]
67out = out[done:]
68}
69if len(in) > 32 {
70galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
71done := (len(in) >> 5) << 5
72in = in[done:]
73out = out[done:]
74}
75} else if o.useSSSE3 {
76galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
77done := (len(in) >> 4) << 4
78in = in[done:]
79out = out[done:]
80}
81out = out[:len(in)]
82mt := mulTable[c][:256]
83for i := range in {
84out[i] = mt[in[i]]
85}
86}
87
88func galMulSliceXor(c byte, in, out []byte, o *options) {
89if c == 1 {
90sliceXor(in, out, o)
91return
92}
93
94if o.useAVX2 {
95if len(in) >= bigSwitchover {
96galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
97done := (len(in) >> 6) << 6
98in = in[done:]
99out = out[done:]
100}
101if len(in) >= 32 {
102galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
103done := (len(in) >> 5) << 5
104in = in[done:]
105out = out[done:]
106}
107} else if o.useSSSE3 {
108galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
109done := (len(in) >> 4) << 4
110in = in[done:]
111out = out[done:]
112}
113if len(in) == 0 {
114return
115}
116out = out[:len(in)]
117mt := mulTable[c][:256]
118for i := range in {
119out[i] ^= mt[in[i]]
120}
121}
122
123// simple slice xor
124func sliceXor(in, out []byte, o *options) {
125if o.useSSE2 {
126if len(in) >= bigSwitchover {
127if o.useAVX2 {
128avx2XorSlice_64(in, out)
129done := (len(in) >> 6) << 6
130in = in[done:]
131out = out[done:]
132} else {
133sSE2XorSlice_64(in, out)
134done := (len(in) >> 6) << 6
135in = in[done:]
136out = out[done:]
137}
138}
139if len(in) >= 16 {
140sSE2XorSlice(in, out)
141done := (len(in) >> 4) << 4
142in = in[done:]
143out = out[done:]
144}
145} else {
146sliceXorGo(in, out, o)
147return
148}
149out = out[:len(in)]
150for i := range in {
151out[i] ^= in[i]
152}
153}
154
155// 4-way butterfly
156func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
157if len(work[0]) == 0 {
158return
159}
160
161t01 := &multiply256LUT[log_m01]
162t23 := &multiply256LUT[log_m23]
163t02 := &multiply256LUT[log_m02]
164if o.useAVX512 {
165if log_m01 == modulus {
166if log_m23 == modulus {
167if log_m02 == modulus {
168ifftDIT4_avx512_7(work, dist*24, t01, t23, t02)
169} else {
170ifftDIT4_avx512_3(work, dist*24, t01, t23, t02)
171}
172} else {
173if log_m02 == modulus {
174ifftDIT4_avx512_5(work, dist*24, t01, t23, t02)
175} else {
176ifftDIT4_avx512_1(work, dist*24, t01, t23, t02)
177}
178}
179} else {
180if log_m23 == modulus {
181if log_m02 == modulus {
182ifftDIT4_avx512_6(work, dist*24, t01, t23, t02)
183} else {
184ifftDIT4_avx512_2(work, dist*24, t01, t23, t02)
185}
186} else {
187if log_m02 == modulus {
188ifftDIT4_avx512_4(work, dist*24, t01, t23, t02)
189} else {
190ifftDIT4_avx512_0(work, dist*24, t01, t23, t02)
191}
192}
193}
194return
195} else if o.useAVX2 {
196if log_m01 == modulus {
197if log_m23 == modulus {
198if log_m02 == modulus {
199ifftDIT4_avx2_7(work, dist*24, t01, t23, t02)
200} else {
201ifftDIT4_avx2_3(work, dist*24, t01, t23, t02)
202}
203} else {
204if log_m02 == modulus {
205ifftDIT4_avx2_5(work, dist*24, t01, t23, t02)
206} else {
207ifftDIT4_avx2_1(work, dist*24, t01, t23, t02)
208}
209}
210} else {
211if log_m23 == modulus {
212if log_m02 == modulus {
213ifftDIT4_avx2_6(work, dist*24, t01, t23, t02)
214} else {
215ifftDIT4_avx2_2(work, dist*24, t01, t23, t02)
216}
217} else {
218if log_m02 == modulus {
219ifftDIT4_avx2_4(work, dist*24, t01, t23, t02)
220} else {
221ifftDIT4_avx2_0(work, dist*24, t01, t23, t02)
222}
223}
224}
225return
226}
227ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
228}
229
230// 4-way butterfly
231func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
232if len(work[0]) == 0 {
233return
234}
235
236if false && o.useGFNI {
237// Note that these currently require that length is multiple of 64.
238t01 := gf2p811dMulMatrices[log_m01]
239t23 := gf2p811dMulMatrices[log_m23]
240t02 := gf2p811dMulMatrices[log_m02]
241if log_m01 == modulus8 {
242if log_m23 == modulus8 {
243if log_m02 == modulus8 {
244ifftDIT48_gfni_7(work, dist*24, t01, t23, t02)
245} else {
246ifftDIT48_gfni_3(work, dist*24, t01, t23, t02)
247}
248} else {
249if log_m02 == modulus8 {
250ifftDIT48_gfni_5(work, dist*24, t01, t23, t02)
251} else {
252ifftDIT48_gfni_1(work, dist*24, t01, t23, t02)
253}
254}
255} else {
256if log_m23 == modulus8 {
257if log_m02 == modulus8 {
258ifftDIT48_gfni_6(work, dist*24, t01, t23, t02)
259} else {
260ifftDIT48_gfni_2(work, dist*24, t01, t23, t02)
261}
262} else {
263if log_m02 == modulus8 {
264ifftDIT48_gfni_4(work, dist*24, t01, t23, t02)
265} else {
266ifftDIT48_gfni_0(work, dist*24, t01, t23, t02)
267}
268}
269}
270return
271}
272if o.useAVX2 {
273// Note that these currently require that length is multiple of 64.
274t01 := &multiply256LUT8[log_m01]
275t23 := &multiply256LUT8[log_m23]
276t02 := &multiply256LUT8[log_m02]
277if log_m01 == modulus8 {
278if log_m23 == modulus8 {
279if log_m02 == modulus8 {
280ifftDIT48_avx2_7(work, dist*24, t01, t23, t02)
281} else {
282ifftDIT48_avx2_3(work, dist*24, t01, t23, t02)
283}
284} else {
285if log_m02 == modulus8 {
286ifftDIT48_avx2_5(work, dist*24, t01, t23, t02)
287} else {
288ifftDIT48_avx2_1(work, dist*24, t01, t23, t02)
289}
290}
291} else {
292if log_m23 == modulus8 {
293if log_m02 == modulus8 {
294ifftDIT48_avx2_6(work, dist*24, t01, t23, t02)
295} else {
296ifftDIT48_avx2_2(work, dist*24, t01, t23, t02)
297}
298} else {
299if log_m02 == modulus8 {
300ifftDIT48_avx2_4(work, dist*24, t01, t23, t02)
301} else {
302ifftDIT48_avx2_0(work, dist*24, t01, t23, t02)
303}
304}
305}
306return
307}
308ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
309}
310
311func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
312if len(work[0]) == 0 {
313return
314}
315
316t01 := &multiply256LUT[log_m01]
317t23 := &multiply256LUT[log_m23]
318t02 := &multiply256LUT[log_m02]
319if o.useAVX512 {
320if log_m02 == modulus {
321if log_m01 == modulus {
322if log_m23 == modulus {
323fftDIT4_avx512_7(work, dist*24, t01, t23, t02)
324} else {
325fftDIT4_avx512_3(work, dist*24, t01, t23, t02)
326}
327} else {
328if log_m23 == modulus {
329fftDIT4_avx512_5(work, dist*24, t01, t23, t02)
330} else {
331fftDIT4_avx512_1(work, dist*24, t01, t23, t02)
332}
333}
334} else {
335if log_m01 == modulus {
336if log_m23 == modulus {
337fftDIT4_avx512_6(work, dist*24, t01, t23, t02)
338} else {
339fftDIT4_avx512_2(work, dist*24, t01, t23, t02)
340}
341} else {
342if log_m23 == modulus {
343fftDIT4_avx512_4(work, dist*24, t01, t23, t02)
344} else {
345fftDIT4_avx512_0(work, dist*24, t01, t23, t02)
346}
347}
348}
349return
350} else if o.useAVX2 {
351if log_m02 == modulus {
352if log_m01 == modulus {
353if log_m23 == modulus {
354fftDIT4_avx2_7(work, dist*24, t01, t23, t02)
355} else {
356fftDIT4_avx2_3(work, dist*24, t01, t23, t02)
357}
358} else {
359if log_m23 == modulus {
360fftDIT4_avx2_5(work, dist*24, t01, t23, t02)
361} else {
362fftDIT4_avx2_1(work, dist*24, t01, t23, t02)
363}
364}
365} else {
366if log_m01 == modulus {
367if log_m23 == modulus {
368fftDIT4_avx2_6(work, dist*24, t01, t23, t02)
369} else {
370fftDIT4_avx2_2(work, dist*24, t01, t23, t02)
371}
372} else {
373if log_m23 == modulus {
374fftDIT4_avx2_4(work, dist*24, t01, t23, t02)
375} else {
376fftDIT4_avx2_0(work, dist*24, t01, t23, t02)
377}
378}
379}
380return
381}
382fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
383}
384
385// 4-way butterfly
386func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
387if len(work[0]) == 0 {
388return
389}
390
391if false && o.useGFNI {
392t01 := gf2p811dMulMatrices[log_m01]
393t23 := gf2p811dMulMatrices[log_m23]
394t02 := gf2p811dMulMatrices[log_m02]
395// Note that these currently require that length is multiple of 64.
396if log_m02 == modulus8 {
397if log_m01 == modulus8 {
398if log_m23 == modulus8 {
399fftDIT48_gfni_7(work, dist*24, t01, t23, t02)
400} else {
401fftDIT48_gfni_3(work, dist*24, t01, t23, t02)
402}
403} else {
404if log_m23 == modulus8 {
405fftDIT48_gfni_5(work, dist*24, t01, t23, t02)
406} else {
407fftDIT48_gfni_1(work, dist*24, t01, t23, t02)
408}
409}
410} else {
411if log_m01 == modulus8 {
412if log_m23 == modulus8 {
413fftDIT48_gfni_6(work, dist*24, t01, t23, t02)
414} else {
415fftDIT48_gfni_2(work, dist*24, t01, t23, t02)
416}
417} else {
418if log_m23 == modulus8 {
419fftDIT48_gfni_4(work, dist*24, t01, t23, t02)
420} else {
421fftDIT48_gfni_0(work, dist*24, t01, t23, t02)
422}
423}
424}
425return
426}
427if o.useAVX2 {
428t01 := &multiply256LUT8[log_m01]
429t23 := &multiply256LUT8[log_m23]
430t02 := &multiply256LUT8[log_m02]
431// Note that these currently require that length is multiple of 64.
432if log_m02 == modulus8 {
433if log_m01 == modulus8 {
434if log_m23 == modulus8 {
435fftDIT48_avx2_7(work, dist*24, t01, t23, t02)
436} else {
437fftDIT48_avx2_3(work, dist*24, t01, t23, t02)
438}
439} else {
440if log_m23 == modulus8 {
441fftDIT48_avx2_5(work, dist*24, t01, t23, t02)
442} else {
443fftDIT48_avx2_1(work, dist*24, t01, t23, t02)
444}
445}
446} else {
447if log_m01 == modulus8 {
448if log_m23 == modulus8 {
449fftDIT48_avx2_6(work, dist*24, t01, t23, t02)
450} else {
451fftDIT48_avx2_2(work, dist*24, t01, t23, t02)
452}
453} else {
454if log_m23 == modulus8 {
455fftDIT48_avx2_4(work, dist*24, t01, t23, t02)
456} else {
457fftDIT48_avx2_0(work, dist*24, t01, t23, t02)
458}
459}
460}
461return
462}
463fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
464}
465
466// 2-way butterfly forward
467func fftDIT2(x, y []byte, log_m ffe, o *options) {
468if len(x) == 0 {
469return
470}
471if o.useAVX2 {
472tmp := &multiply256LUT[log_m]
473fftDIT2_avx2(x, y, tmp)
474} else if o.useSSSE3 {
475tmp := &multiply256LUT[log_m]
476fftDIT2_ssse3(x, y, tmp)
477} else {
478// Reference version:
479refMulAdd(x, y, log_m)
480sliceXor(x, y, o)
481}
482}
483
484// 2-way butterfly forward
485func fftDIT28(x, y []byte, log_m ffe8, o *options) {
486if len(x) == 0 {
487return
488}
489
490if o.useAVX2 {
491fftDIT28_avx2(x, y, &multiply256LUT8[log_m])
492if len(x)&63 == 0 {
493return
494}
495done := (len(y) >> 6) << 6
496y = y[done:]
497x = x[done:]
498}
499mulAdd8(x, y, log_m, o)
500sliceXor(x, y, o)
501}
502
503// 2-way butterfly inverse
504func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
505if len(x) == 0 {
506return
507}
508
509if o.useAVX2 {
510ifftDIT28_avx2(x, y, &multiply256LUT8[log_m])
511if len(x)&63 == 0 {
512return
513}
514done := (len(y) >> 6) << 6
515y = y[done:]
516x = x[done:]
517}
518sliceXor(x, y, o)
519mulAdd8(x, y, log_m, o)
520}
521
522func mulAdd8(x, y []byte, log_m ffe8, o *options) {
523if o.useAVX2 {
524t := &multiply256LUT8[log_m]
525galMulAVX2Xor_64(t[:16], t[16:32], y, x)
526done := (len(y) >> 6) << 6
527y = y[done:]
528x = x[done:]
529} else if o.useSSSE3 {
530t := &multiply256LUT8[log_m]
531galMulSSSE3Xor(t[:16], t[16:32], y, x)
532done := (len(y) >> 4) << 4
533y = y[done:]
534x = x[done:]
535}
536refMulAdd8(x, y, log_m)
537}
538
539// 2-way butterfly
540func ifftDIT2(x, y []byte, log_m ffe, o *options) {
541if len(x) == 0 {
542return
543}
544if o.useAVX2 {
545tmp := &multiply256LUT[log_m]
546ifftDIT2_avx2(x, y, tmp)
547} else if o.useSSSE3 {
548tmp := &multiply256LUT[log_m]
549ifftDIT2_ssse3(x, y, tmp)
550} else {
551// Reference version:
552sliceXor(x, y, o)
553refMulAdd(x, y, log_m)
554}
555}
556
557func mulgf16(x, y []byte, log_m ffe, o *options) {
558if len(x) == 0 {
559return
560}
561if o.useAVX2 {
562tmp := &multiply256LUT[log_m]
563mulgf16_avx2(x, y, tmp)
564} else if o.useSSSE3 {
565tmp := &multiply256LUT[log_m]
566mulgf16_ssse3(x, y, tmp)
567} else {
568refMul(x, y, log_m)
569}
570}
571
572func mulgf8(out, in []byte, log_m ffe8, o *options) {
573if o.useAVX2 {
574t := &multiply256LUT8[log_m]
575galMulAVX2_64(t[:16], t[16:32], in, out)
576done := (len(in) >> 6) << 6
577in = in[done:]
578out = out[done:]
579} else if o.useSSSE3 {
580t := &multiply256LUT8[log_m]
581galMulSSSE3(t[:16], t[16:32], in, out)
582done := (len(in) >> 4) << 4
583in = in[done:]
584out = out[done:]
585}
586out = out[:len(in)]
587mt := mul8LUTs[log_m].Value[:]
588for i := range in {
589out[i] = byte(mt[in[i]])
590}
591}
592