cubefs

galois_amd64.go
591 строка · 13.1 Кб
Перенос по словам
1
//go:build !noasm && !appengine && !gccgo
2
// +build !noasm,!appengine,!gccgo
3

4
// Copyright 2015, Klaus Post, see LICENSE for details.
5

6
package reedsolomon
7

8
//go:noescape
9
// galMulSSSE3 is declared without a Go body; its implementation is expected
// to be supplied in assembly. Per the reference pseudo-code kept in this file,
// it sets out[n] = low[in[n]&0xf] ^ high[in[n]>>4], working in blocks of
// 16 bytes. Callers here finish the trailing len(in)%16 bytes in Go.
func galMulSSSE3(low, high, in, out []byte)
10

11
//go:noescape
12
// galMulSSSE3Xor is declared without a Go body; its implementation is
// expected to be supplied in assembly. Per the reference pseudo-code kept in
// this file, it XORs low[in[n]&0xf] ^ high[in[n]>>4] into out[n], working in
// blocks of 16 bytes. Callers here finish the trailing len(in)%16 bytes in Go.
func galMulSSSE3Xor(low, high, in, out []byte)
13

14
//go:noescape
15
// galMulAVX2Xor is declared without a Go body (assembly implementation
// expected). Callers in this file skip the largest multiple of 32 bytes of
// in/out after invoking it and handle the remainder in Go; presumably it is
// the 32-bytes-per-iteration counterpart of galMulSSSE3Xor.
func galMulAVX2Xor(low, high, in, out []byte)
16

17
//go:noescape
18
// galMulAVX2 is declared without a Go body (assembly implementation
// expected). Callers in this file skip the largest multiple of 32 bytes of
// in/out after invoking it and handle the remainder in Go; presumably it is
// the 32-bytes-per-iteration counterpart of galMulSSSE3.
func galMulAVX2(low, high, in, out []byte)
19

20
//go:noescape
21
// sSE2XorSlice is declared without a Go body (assembly implementation
// expected). sliceXor calls it for inputs of at least 16 bytes, then skips the
// largest multiple of 16 bytes and XORs the remaining bytes of in into out in
// Go, so it presumably performs that same XOR on whole 16-byte blocks.
func sSE2XorSlice(in, out []byte)
22

23
//go:noescape
24
// galMulAVX2Xor_64 is declared without a Go body (assembly implementation
// expected). Callers in this file skip the largest multiple of 64 bytes of
// in/out after invoking it; presumably it is the 64-bytes-per-iteration
// variant of galMulAVX2Xor.
func galMulAVX2Xor_64(low, high, in, out []byte)
25

26
//go:noescape
27
// galMulAVX2_64 is declared without a Go body (assembly implementation
// expected). Callers in this file skip the largest multiple of 64 bytes of
// in/out after invoking it; presumably it is the 64-bytes-per-iteration
// variant of galMulAVX2.
func galMulAVX2_64(low, high, in, out []byte)
28

29
//go:noescape
30
// sSE2XorSlice_64 is declared without a Go body (assembly implementation
// expected). sliceXor uses it for inputs of at least bigSwitchover bytes when
// AVX2 is not enabled, then skips the largest multiple of 64 bytes.
func sSE2XorSlice_64(in, out []byte)
31

32
//go:noescape
33
// avx2XorSlice_64 is declared without a Go body (assembly implementation
// expected). sliceXor uses it for inputs of at least bigSwitchover bytes when
// AVX2 is enabled, then skips the largest multiple of 64 bytes.
func avx2XorSlice_64(in, out []byte)
34

35
// This is what the assembler routines do in blocks of 16 bytes:
36
/*
37
func galMulSSSE3(low, high, in, out []byte) {
38
	for n, input := range in {
39
		l := input & 0xf
40
		h := input >> 4
41
		out[n] = low[l] ^ high[h]
42
	}
43
}
44

45
func galMulSSSE3Xor(low, high, in, out []byte) {
46
	for n, input := range in {
47
		l := input & 0xf
48
		h := input >> 4
49
		out[n] ^= low[l] ^ high[h]
50
	}
51
}
52
*/
53

54
// bigSwitchover is the minimum input length, in bytes, at which the routines that process 64 bytes per loop are used.
55
const bigSwitchover = 128
56

57
// galMulSlice multiplies every byte of in by the constant c using the
// package's Galois-field multiplication tables and stores the products in out.
//
// When c is 1 the input is simply copied. Otherwise the bulk of the data is
// handed to the widest assembly routine enabled in o (AVX2 in 64- and 32-byte
// blocks, or SSSE3 in 16-byte blocks), and whatever tail remains is finished
// with a table lookup per byte.
func galMulSlice(c byte, in, out []byte, o *options) {
	if c == 1 {
		copy(out, in)
		return
	}
	if o.useAVX2 {
		if len(in) >= bigSwitchover {
			galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
			done := (len(in) >> 6) << 6
			in = in[done:]
			out = out[done:]
		}
		// Use >= so that a remainder of exactly 32 bytes also takes the
		// vector path, matching the threshold used by galMulSliceXor.
		if len(in) >= 32 {
			galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
			done := (len(in) >> 5) << 5
			in = in[done:]
			out = out[done:]
		}
	} else if o.useSSSE3 {
		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
		done := (len(in) >> 4) << 4
		in = in[done:]
		out = out[done:]
	}
	// Scalar tail: trimming out to len(in) lets the compiler drop the
	// per-element bounds check on out[i].
	out = out[:len(in)]
	mt := mulTable[c][:256]
	for i := range in {
		out[i] = mt[in[i]]
	}
}
87

88
// galMulSliceXor multiplies every byte of in by the constant c using the
// package's multiplication tables and XORs the products into out.
//
// c == 1 degenerates to a plain XOR and is delegated to sliceXor. Otherwise
// the widest assembly routine enabled in o consumes as many whole blocks as
// it can, and the leftover bytes are processed with a table lookup per byte.
func galMulSliceXor(c byte, in, out []byte, o *options) {
	if c == 1 {
		sliceXor(in, out, o)
		return
	}

	switch {
	case o.useAVX2:
		if len(in) >= bigSwitchover {
			galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
			// Skip the whole 64-byte blocks.
			n := len(in) &^ 63
			in, out = in[n:], out[n:]
		}
		if len(in) >= 32 {
			galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
			// Skip the whole 32-byte blocks.
			n := len(in) &^ 31
			in, out = in[n:], out[n:]
		}
	case o.useSSSE3:
		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
		// Skip the whole 16-byte blocks.
		n := len(in) &^ 15
		in, out = in[n:], out[n:]
	}

	if len(in) == 0 {
		return
	}
	out = out[:len(in)]
	mt := mulTable[c][:256]
	for i, v := range in {
		out[i] ^= mt[v]
	}
}
122

123
// simple slice xor
124
func sliceXor(in, out []byte, o *options) {
125
	if o.useSSE2 {
126
		if len(in) >= bigSwitchover {
127
			if o.useAVX2 {
128
				avx2XorSlice_64(in, out)
129
				done := (len(in) >> 6) << 6
130
				in = in[done:]
131
				out = out[done:]
132
			} else {
133
				sSE2XorSlice_64(in, out)
134
				done := (len(in) >> 6) << 6
135
				in = in[done:]
136
				out = out[done:]
137
			}
138
		}
139
		if len(in) >= 16 {
140
			sSE2XorSlice(in, out)
141
			done := (len(in) >> 4) << 4
142
			in = in[done:]
143
			out = out[done:]
144
		}
145
	} else {
146
		sliceXorGo(in, out, o)
147
		return
148
	}
149
	out = out[:len(in)]
150
	for i := range in {
151
		out[i] ^= in[i]
152
	}
153
}
154

155
// 4-way butterfly
156
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
157
	if len(work[0]) == 0 {
158
		return
159
	}
160

161
	t01 := &multiply256LUT[log_m01]
162
	t23 := &multiply256LUT[log_m23]
163
	t02 := &multiply256LUT[log_m02]
164
	if o.useAVX512 {
165
		if log_m01 == modulus {
166
			if log_m23 == modulus {
167
				if log_m02 == modulus {
168
					ifftDIT4_avx512_7(work, dist*24, t01, t23, t02)
169
				} else {
170
					ifftDIT4_avx512_3(work, dist*24, t01, t23, t02)
171
				}
172
			} else {
173
				if log_m02 == modulus {
174
					ifftDIT4_avx512_5(work, dist*24, t01, t23, t02)
175
				} else {
176
					ifftDIT4_avx512_1(work, dist*24, t01, t23, t02)
177
				}
178
			}
179
		} else {
180
			if log_m23 == modulus {
181
				if log_m02 == modulus {
182
					ifftDIT4_avx512_6(work, dist*24, t01, t23, t02)
183
				} else {
184
					ifftDIT4_avx512_2(work, dist*24, t01, t23, t02)
185
				}
186
			} else {
187
				if log_m02 == modulus {
188
					ifftDIT4_avx512_4(work, dist*24, t01, t23, t02)
189
				} else {
190
					ifftDIT4_avx512_0(work, dist*24, t01, t23, t02)
191
				}
192
			}
193
		}
194
		return
195
	} else if o.useAVX2 {
196
		if log_m01 == modulus {
197
			if log_m23 == modulus {
198
				if log_m02 == modulus {
199
					ifftDIT4_avx2_7(work, dist*24, t01, t23, t02)
200
				} else {
201
					ifftDIT4_avx2_3(work, dist*24, t01, t23, t02)
202
				}
203
			} else {
204
				if log_m02 == modulus {
205
					ifftDIT4_avx2_5(work, dist*24, t01, t23, t02)
206
				} else {
207
					ifftDIT4_avx2_1(work, dist*24, t01, t23, t02)
208
				}
209
			}
210
		} else {
211
			if log_m23 == modulus {
212
				if log_m02 == modulus {
213
					ifftDIT4_avx2_6(work, dist*24, t01, t23, t02)
214
				} else {
215
					ifftDIT4_avx2_2(work, dist*24, t01, t23, t02)
216
				}
217
			} else {
218
				if log_m02 == modulus {
219
					ifftDIT4_avx2_4(work, dist*24, t01, t23, t02)
220
				} else {
221
					ifftDIT4_avx2_0(work, dist*24, t01, t23, t02)
222
				}
223
			}
224
		}
225
		return
226
	}
227
	ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
228
}
229

230
// 4-way butterfly
231
func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
232
	if len(work[0]) == 0 {
233
		return
234
	}
235

236
	if false && o.useGFNI {
237
		// Note that these currently require that length is multiple of 64.
238
		t01 := gf2p811dMulMatrices[log_m01]
239
		t23 := gf2p811dMulMatrices[log_m23]
240
		t02 := gf2p811dMulMatrices[log_m02]
241
		if log_m01 == modulus8 {
242
			if log_m23 == modulus8 {
243
				if log_m02 == modulus8 {
244
					ifftDIT48_gfni_7(work, dist*24, t01, t23, t02)
245
				} else {
246
					ifftDIT48_gfni_3(work, dist*24, t01, t23, t02)
247
				}
248
			} else {
249
				if log_m02 == modulus8 {
250
					ifftDIT48_gfni_5(work, dist*24, t01, t23, t02)
251
				} else {
252
					ifftDIT48_gfni_1(work, dist*24, t01, t23, t02)
253
				}
254
			}
255
		} else {
256
			if log_m23 == modulus8 {
257
				if log_m02 == modulus8 {
258
					ifftDIT48_gfni_6(work, dist*24, t01, t23, t02)
259
				} else {
260
					ifftDIT48_gfni_2(work, dist*24, t01, t23, t02)
261
				}
262
			} else {
263
				if log_m02 == modulus8 {
264
					ifftDIT48_gfni_4(work, dist*24, t01, t23, t02)
265
				} else {
266
					ifftDIT48_gfni_0(work, dist*24, t01, t23, t02)
267
				}
268
			}
269
		}
270
		return
271
	}
272
	if o.useAVX2 {
273
		// Note that these currently require that length is multiple of 64.
274
		t01 := &multiply256LUT8[log_m01]
275
		t23 := &multiply256LUT8[log_m23]
276
		t02 := &multiply256LUT8[log_m02]
277
		if log_m01 == modulus8 {
278
			if log_m23 == modulus8 {
279
				if log_m02 == modulus8 {
280
					ifftDIT48_avx2_7(work, dist*24, t01, t23, t02)
281
				} else {
282
					ifftDIT48_avx2_3(work, dist*24, t01, t23, t02)
283
				}
284
			} else {
285
				if log_m02 == modulus8 {
286
					ifftDIT48_avx2_5(work, dist*24, t01, t23, t02)
287
				} else {
288
					ifftDIT48_avx2_1(work, dist*24, t01, t23, t02)
289
				}
290
			}
291
		} else {
292
			if log_m23 == modulus8 {
293
				if log_m02 == modulus8 {
294
					ifftDIT48_avx2_6(work, dist*24, t01, t23, t02)
295
				} else {
296
					ifftDIT48_avx2_2(work, dist*24, t01, t23, t02)
297
				}
298
			} else {
299
				if log_m02 == modulus8 {
300
					ifftDIT48_avx2_4(work, dist*24, t01, t23, t02)
301
				} else {
302
					ifftDIT48_avx2_0(work, dist*24, t01, t23, t02)
303
				}
304
			}
305
		}
306
		return
307
	}
308
	ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
309
}
310

311
// fftDIT4 dispatches to one of eight specialised assembly kernels (AVX512
// preferred, then AVX2), or to fftDIT4Ref when neither instruction set is
// enabled in o. Presumably this is the forward 4-way butterfly; the naming
// mirrors ifftDIT4.
//
// The kernel suffix is a 3-bit number built from which multipliers equal
// modulus: bit 0 for log_m02, bit 1 for log_m01 and bit 2 for log_m23.
// Nothing is done when work[0] is empty.
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
	if len(work[0]) == 0 {
		return
	}

	t01 := &multiply256LUT[log_m01]
	t23 := &multiply256LUT[log_m23]
	t02 := &multiply256LUT[log_m02]

	if !o.useAVX512 && !o.useAVX2 {
		fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
		return
	}

	variant := 0
	if log_m02 == modulus {
		variant |= 1
	}
	if log_m01 == modulus {
		variant |= 2
	}
	if log_m23 == modulus {
		variant |= 4
	}
	// NOTE(review): 24 is presumably the byte size of a slice header on
	// amd64 — confirm against the assembly.
	stride := dist * 24

	if o.useAVX512 {
		switch variant {
		case 0:
			fftDIT4_avx512_0(work, stride, t01, t23, t02)
		case 1:
			fftDIT4_avx512_1(work, stride, t01, t23, t02)
		case 2:
			fftDIT4_avx512_2(work, stride, t01, t23, t02)
		case 3:
			fftDIT4_avx512_3(work, stride, t01, t23, t02)
		case 4:
			fftDIT4_avx512_4(work, stride, t01, t23, t02)
		case 5:
			fftDIT4_avx512_5(work, stride, t01, t23, t02)
		case 6:
			fftDIT4_avx512_6(work, stride, t01, t23, t02)
		case 7:
			fftDIT4_avx512_7(work, stride, t01, t23, t02)
		}
		return
	}

	switch variant {
	case 0:
		fftDIT4_avx2_0(work, stride, t01, t23, t02)
	case 1:
		fftDIT4_avx2_1(work, stride, t01, t23, t02)
	case 2:
		fftDIT4_avx2_2(work, stride, t01, t23, t02)
	case 3:
		fftDIT4_avx2_3(work, stride, t01, t23, t02)
	case 4:
		fftDIT4_avx2_4(work, stride, t01, t23, t02)
	case 5:
		fftDIT4_avx2_5(work, stride, t01, t23, t02)
	case 6:
		fftDIT4_avx2_6(work, stride, t01, t23, t02)
	case 7:
		fftDIT4_avx2_7(work, stride, t01, t23, t02)
	}
}
384

385
// 4-way butterfly
386
func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
387
	if len(work[0]) == 0 {
388
		return
389
	}
390

391
	if false && o.useGFNI {
392
		t01 := gf2p811dMulMatrices[log_m01]
393
		t23 := gf2p811dMulMatrices[log_m23]
394
		t02 := gf2p811dMulMatrices[log_m02]
395
		// Note that these currently require that length is multiple of 64.
396
		if log_m02 == modulus8 {
397
			if log_m01 == modulus8 {
398
				if log_m23 == modulus8 {
399
					fftDIT48_gfni_7(work, dist*24, t01, t23, t02)
400
				} else {
401
					fftDIT48_gfni_3(work, dist*24, t01, t23, t02)
402
				}
403
			} else {
404
				if log_m23 == modulus8 {
405
					fftDIT48_gfni_5(work, dist*24, t01, t23, t02)
406
				} else {
407
					fftDIT48_gfni_1(work, dist*24, t01, t23, t02)
408
				}
409
			}
410
		} else {
411
			if log_m01 == modulus8 {
412
				if log_m23 == modulus8 {
413
					fftDIT48_gfni_6(work, dist*24, t01, t23, t02)
414
				} else {
415
					fftDIT48_gfni_2(work, dist*24, t01, t23, t02)
416
				}
417
			} else {
418
				if log_m23 == modulus8 {
419
					fftDIT48_gfni_4(work, dist*24, t01, t23, t02)
420
				} else {
421
					fftDIT48_gfni_0(work, dist*24, t01, t23, t02)
422
				}
423
			}
424
		}
425
		return
426
	}
427
	if o.useAVX2 {
428
		t01 := &multiply256LUT8[log_m01]
429
		t23 := &multiply256LUT8[log_m23]
430
		t02 := &multiply256LUT8[log_m02]
431
		// Note that these currently require that length is multiple of 64.
432
		if log_m02 == modulus8 {
433
			if log_m01 == modulus8 {
434
				if log_m23 == modulus8 {
435
					fftDIT48_avx2_7(work, dist*24, t01, t23, t02)
436
				} else {
437
					fftDIT48_avx2_3(work, dist*24, t01, t23, t02)
438
				}
439
			} else {
440
				if log_m23 == modulus8 {
441
					fftDIT48_avx2_5(work, dist*24, t01, t23, t02)
442
				} else {
443
					fftDIT48_avx2_1(work, dist*24, t01, t23, t02)
444
				}
445
			}
446
		} else {
447
			if log_m01 == modulus8 {
448
				if log_m23 == modulus8 {
449
					fftDIT48_avx2_6(work, dist*24, t01, t23, t02)
450
				} else {
451
					fftDIT48_avx2_2(work, dist*24, t01, t23, t02)
452
				}
453
			} else {
454
				if log_m23 == modulus8 {
455
					fftDIT48_avx2_4(work, dist*24, t01, t23, t02)
456
				} else {
457
					fftDIT48_avx2_0(work, dist*24, t01, t23, t02)
458
				}
459
			}
460
		}
461
		return
462
	}
463
	fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
464
}
465

466
// 2-way butterfly forward
467
func fftDIT2(x, y []byte, log_m ffe, o *options) {
468
	if len(x) == 0 {
469
		return
470
	}
471
	if o.useAVX2 {
472
		tmp := &multiply256LUT[log_m]
473
		fftDIT2_avx2(x, y, tmp)
474
	} else if o.useSSSE3 {
475
		tmp := &multiply256LUT[log_m]
476
		fftDIT2_ssse3(x, y, tmp)
477
	} else {
478
		// Reference version:
479
		refMulAdd(x, y, log_m)
480
		sliceXor(x, y, o)
481
	}
482
}
483

484
// 2-way butterfly forward
485
func fftDIT28(x, y []byte, log_m ffe8, o *options) {
486
	if len(x) == 0 {
487
		return
488
	}
489

490
	if o.useAVX2 {
491
		fftDIT28_avx2(x, y, &multiply256LUT8[log_m])
492
		if len(x)&63 == 0 {
493
			return
494
		}
495
		done := (len(y) >> 6) << 6
496
		y = y[done:]
497
		x = x[done:]
498
	}
499
	mulAdd8(x, y, log_m, o)
500
	sliceXor(x, y, o)
501
}
502

503
// 2-way butterfly inverse
504
func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
505
	if len(x) == 0 {
506
		return
507
	}
508

509
	if o.useAVX2 {
510
		ifftDIT28_avx2(x, y, &multiply256LUT8[log_m])
511
		if len(x)&63 == 0 {
512
			return
513
		}
514
		done := (len(y) >> 6) << 6
515
		y = y[done:]
516
		x = x[done:]
517
	}
518
	sliceXor(x, y, o)
519
	mulAdd8(x, y, log_m, o)
520
}
521

522
// mulAdd8 multiplies y by the factor selected by log_m and XORs the result
// into x.
//
// The first 16 bytes of the multiply256LUT8 entry are passed as the low table
// and the next 16 as the high table to the assembly routine enabled in o
// (AVX2 in 64-byte blocks, or SSSE3 in 16-byte blocks). Whatever is left
// after the whole blocks, or the entire input when neither is enabled, is
// handled by refMulAdd8.
func mulAdd8(x, y []byte, log_m ffe8, o *options) {
	switch {
	case o.useAVX2:
		lut := &multiply256LUT8[log_m]
		galMulAVX2Xor_64(lut[:16], lut[16:32], y, x)
		// Skip the whole 64-byte blocks.
		n := len(y) &^ 63
		x, y = x[n:], y[n:]
	case o.useSSSE3:
		lut := &multiply256LUT8[log_m]
		galMulSSSE3Xor(lut[:16], lut[16:32], y, x)
		// Skip the whole 16-byte blocks.
		n := len(y) &^ 15
		x, y = x[n:], y[n:]
	}
	refMulAdd8(x, y, log_m)
}
538

539
// 2-way butterfly
540
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
541
	if len(x) == 0 {
542
		return
543
	}
544
	if o.useAVX2 {
545
		tmp := &multiply256LUT[log_m]
546
		ifftDIT2_avx2(x, y, tmp)
547
	} else if o.useSSSE3 {
548
		tmp := &multiply256LUT[log_m]
549
		ifftDIT2_ssse3(x, y, tmp)
550
	} else {
551
		// Reference version:
552
		sliceXor(x, y, o)
553
		refMulAdd(x, y, log_m)
554
	}
555
}
556

557
// mulgf16 applies the multiplication selected by log_m to x and y, using the
// AVX2 kernel when enabled, else the SSSE3 kernel, else refMul.
// NOTE(review): which of x and y is the destination is not visible from this
// function — confirm against refMul. Empty x is a no-op.
func mulgf16(x, y []byte, log_m ffe, o *options) {
	if len(x) == 0 {
		return
	}
	switch {
	case o.useAVX2:
		mulgf16_avx2(x, y, &multiply256LUT[log_m])
	case o.useSSSE3:
		mulgf16_ssse3(x, y, &multiply256LUT[log_m])
	default:
		refMul(x, y, log_m)
	}
}
571

572
// mulgf8 multiplies every byte of in by the factor selected by log_m and
// writes the products to out.
//
// The first 16 bytes of the multiply256LUT8 entry are passed as the low table
// and the next 16 as the high table to the assembly routine enabled in o
// (AVX2 in 64-byte blocks, or SSSE3 in 16-byte blocks). The bytes left after
// the whole blocks are looked up one at a time in mul8LUTs[log_m].
func mulgf8(out, in []byte, log_m ffe8, o *options) {
	switch {
	case o.useAVX2:
		lut := &multiply256LUT8[log_m]
		galMulAVX2_64(lut[:16], lut[16:32], in, out)
		// Skip the whole 64-byte blocks.
		n := len(in) &^ 63
		in, out = in[n:], out[n:]
	case o.useSSSE3:
		lut := &multiply256LUT8[log_m]
		galMulSSSE3(lut[:16], lut[16:32], in, out)
		// Skip the whole 16-byte blocks.
		n := len(in) &^ 15
		in, out = in[n:], out[n:]
	}
	out = out[:len(in)]
	mt := mul8LUTs[log_m].Value[:]
	for i, v := range in {
		out[i] = byte(mt[v])
	}
}
592
cubefs

Использование cookies