cubefs
1// +build !appengine
2// +build gc
3// +build !purego
4// +build !noasm
5
6#include "textflag.h"
7
8// Register allocation:
9// AX h
10// SI pointer to advance through b
11// DX n
12// BX loop end
13// R8 v1, k1
14// R9 v2
15// R10 v3
16// R11 v4
17// R12 tmp
18// R13 prime1v
19// R14 prime2v
20// DI prime4v
21
// round(r) folds the next 8 input bytes into the accumulator register r:
//
//	r = rotl64(r + load64(SI)*prime2v, 31) * prime1v
//
// and then leaves SI pointing 8 bytes further into the buffer.
//
// Preconditions: R13 holds prime1v and R14 holds prime2v.
// Clobbers:      R12 (scratch for the loaded word) and the flags.
// The argument r must be a register other than SI and R12.
#define round(r) \
	MOVQ  (SI), R12; \
	IMULQ R14, R12; \
	ADDQ  $8, SI; \
	ADDQ  R12, r; \
	ROLQ  $31, r; \
	IMULQ R13, r
31
// mergeRound(acc, val) mixes one lane register (val) into the running hash
// register (acc):
//
//	val = rotl64(val*prime2v, 31) * prime1v
//	acc = (acc ^ val)*prime1v + prime4v
//
// Preconditions: R13 holds prime1v, R14 holds prime2v, DI holds prime4v.
// Note that val is overwritten with its scrambled form, so the lane value
// is not usable afterwards. Flags are clobbered.
#define mergeRound(acc, val) \
	IMULQ R14, val; \
	ROLQ  $31, val; \
	IMULQ R13, val; \
	XORQ  val, acc; \
	IMULQ R13, acc; \
	ADDQ  DI, acc
41
// func Sum64(b []byte) uint64
//
// Hashes the whole of b in one call and stores the 64-bit result in ret.
// Frame layout ($0-32): no locals; b_base+0, b_len+8, (cap at +16, unused),
// ret+24.
//
// Phases:
//   1. If len(b) >= 32, run four parallel lanes (R8..R11) over 32-byte
//      stripes, then fold the lanes into AX.
//      Otherwise start AX from prime5v.
//   2. Add len(b) to AX.
//   3. Consume the tail: 8 bytes at a time, then at most one 4-byte chunk,
//      then single bytes.
//   4. Apply the final shift/xor/multiply scramble.
//
// BX is a moving limit: it starts at end-32 and is bumped to end-8, end-4 and
// finally end as the chunk size shrinks ("end" = b_base + b_len).
// All pointer-vs-limit compares are SIGNED on purpose of correctness: for a
// short input the limit lies below SI, and if b_base is 0 with length 0 the
// limit is a negative number, which only a signed compare orders before SI.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// SI = read cursor, DX = byte count.
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), DX

	// Keep the constants used in hot paths in registers.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// BX = end - 32: the last address at which a full stripe still fits.
	LEAQ -32(SI)(DX*1), BX

	// Inputs shorter than one stripe skip the lane phase entirely.
	CMPQ DX, $32
	JLT  shortInput

	// Lane start values:
	//   v1 (R8)  = prime1v + prime2v
	//   v2 (R9)  = prime2v
	//   v3 (R10) = 0
	//   v4 (R11) = -prime1v
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11
	MOVQ R13, R8
	ADDQ R14, R8

	// One iteration eats 32 bytes (8 per lane). Repeat while SI <= BX.
stripeLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  stripeLoop

	// AX = rotl(v1,1) + rotl(v2,7) + rotl(v3,12) + rotl(v4,18)
	// R12 is used as scratch so the lanes stay intact for mergeRound.
	MOVQ R8, AX
	ROLQ $1, AX

	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX

	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX

	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each lane into the hash (this destroys R8..R11).
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP addLength

shortInput:
	// No stripes were processed: the hash starts from prime5v.
	MOVQ ·prime5v(SB), AX

addLength:
	// Mix in the total input length.
	ADDQ DX, AX

	// Move the limit from end-32 to end-8 for the 8-byte tail loop.
	ADDQ $24, BX

	CMPQ SI, BX
	JG   tail4

tail8Loop:
	// k1 = rotl(load64(SI)*prime2v, 31) * prime1v   (R8 is free again here)
	MOVQ  (SI), R8
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rotl(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	// Advance and continue while another full 8-byte word remains.
	ADDQ $8, SI
	CMPQ SI, BX
	JLE  tail8Loop

tail4:
	// Limit becomes end-4; at most one 4-byte chunk can remain.
	ADDQ $4, BX
	CMPQ SI, BX
	JG   tail1

	// MOVL zero-extends the 32-bit load into the full R8.
	MOVL  (SI), R8
	ADDQ  $4, SI
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rotl(h, 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

tail1:
	// Limit becomes end; SI == BX means nothing is left.
	ADDQ $4, BX
	CMPQ SI, BX
	JGE  avalanche

tail1Loop:
	// h = rotl(h ^ (byte*prime5v), 11) * prime1v
	MOVBQZX (SI), R12
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	ADDQ $1, SI
	CMPQ SI, BX
	JL   tail1Loop

avalanche:
	// h ^= h >> 33; h *= prime2v
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX

	// h ^= h >> 29; h *= prime3v
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX

	// h ^= h >> 32
	MOVQ AX, R12
	SHRQ $32, R12
	XORQ R12, AX

	MOVQ AX, ret+24(FP)
	RET
172
// writeBlocks shares the register plan of Sum64, with one difference: AX
// holds the d pointer instead of the hash value.

// func writeBlocks(d *Digest, b []byte) int
//
// Runs the four-lane round over every complete 32-byte stripe of b, reading
// the lane state from the first four 8-byte words at d and writing it back
// there afterwards. The result is the number of bytes of b that were consumed.
// Frame layout ($0-40): no locals; d+0, b_base+8, b_len+16, (cap at +24,
// unused), ret+32.
//
// Precondition: len(b) >= 32. The loop body executes before its condition is
// tested, so a shorter slice would still have 32 bytes read from it.
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// AX = d; pull the four lanes out of the digest.
	MOVQ d+0(FP), AX
	MOVQ (AX), R8    // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// Constants required by the round macro.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// SI = read cursor, DX = byte count,
	// BX = (b_base + b_len) - 32, the last position where a stripe fits.
	MOVQ b_base+8(FP), SI
	MOVQ b_len+16(FP), DX
	LEAQ -32(SI)(DX*1), BX

	// No up-front length test: callers guarantee at least one stripe.
	// Each pass consumes 32 bytes; keep going while SI <= BX.
stripeLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  stripeLoop

	// Persist the updated lanes in d.
	MOVQ R8, (AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Bytes consumed = final cursor - original base pointer.
	SUBQ b_base+8(FP), SI
	MOVQ SI, ret+32(FP)

	RET
217