cubefs
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Register allocation:
// AX h
// SI pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// DI prime4v

// round(r) consumes one 8-byte word from the input and folds it into the
// accumulator register r:
//
//	R12 = load64(SI) * prime2v
//	SI += 8
//	r   = rotl64(r + R12, 31) * prime1v
//
// It reads from and advances the buffer pointer in SI, and clobbers R12.
// It assumes that R13 has prime1v and R14 has prime2v.
//
// Do not place comments inside the macro body: a line comment would swallow
// the trailing line-continuation backslash.
#define round(r) \
	MOVQ  (SI), R12 \
	ADDQ  $8, SI    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound(acc, val) applies a merge round on the two registers acc and val:
//
//	val = rotl64(val * prime2v, 31) * prime1v
//	acc = (acc ^ val) * prime1v + prime4v
//
// val is overwritten with the intermediate value, so it must not be needed
// afterwards. It assumes that R13 has prime1v, R14 has prime2v, and DI has
// prime4v.
//
// Do not place comments inside the macro body: a line comment would swallow
// the trailing line-continuation backslash.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Sum64 hashes the bytes of b and stores the 64-bit result in ret.
//
// Arguments (32 bytes total, no local frame):
//	b_base+0(FP)  pointer to the first byte of b
//	b_len+8(FP)   length of b in bytes
//	ret+24(FP)    result
//
// Phases:
//	1. If len(b) >= 32, consume 32-byte blocks into four accumulators
//	   (R8..R11) and merge them into AX; otherwise start AX at prime5v.
//	2. Add len(b) to AX.
//	3. Consume remaining 8-byte words, then at most one 4-byte word, then
//	   the remaining single bytes.
//	4. Run the final shift/xor/multiply mixing steps.
//
// NOTE(review): all pointer bound checks use signed conditional jumps
// (JLE/JG/JGE/JL). The bounds are derived by subtracting from base+len, so
// for a zero base pointer they become negative; presumably the signed
// comparison is intentional for that case. Confirm before changing them.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice: SI = base pointer, DX = length, BX = one past the last byte.
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), DX
	LEAQ (SI)(DX*1), BX

	// The first loop limit will be len(b)-32 (as an address: end of b minus 32).
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1v + prime2v
	//   v2 = prime2v
	//   v3 = 0
	//   v4 = -prime1v
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Each iteration consumes 32 bytes (four round() calls, 8 bytes each).
	// Loop until SI > BX.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  blockLoop

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each accumulator into h. mergeRound clobbers its second operand,
	// which is fine because v1..v4 are dead after this point.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes of input: h starts at prime5v.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
	ADDQ $24, BX

	CMPQ SI, BX
	JG   fourByte

wordLoop:
	// Calculate k1 = rotl(load64(SI) * prime2v, 31) * prime1v.
	MOVQ  (SI), R8
	ADDQ  $8, SI
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rotl(h ^ k1, 27) * prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ SI, BX
	JLE  wordLoop

fourByte:
	// BX now becomes len(b)-4; skip this step unless 4 more bytes remain.
	ADDQ $4, BX
	CMPQ SI, BX
	JG   singles

	// Load 4 bytes (the 32-bit load leaves the upper half of R8 zero), then
	// h = rotl(h ^ (word * prime1v), 23) * prime2v + prime3v.
	MOVL  (SI), R8
	ADDQ  $4, SI
	IMULQ R13, R8
	XORQ  R8, AX

	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// BX now becomes len(b), i.e. one past the last byte of b.
	ADDQ $4, BX
	CMPQ SI, BX
	JGE  finalize

singlesLoop:
	// h = rotl(h ^ (byte * prime5v), 11) * prime1v, one byte at a time.
	MOVBQZX (SI), R12
	ADDQ    $1, SI
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ SI, BX
	JL   singlesLoop

finalize:
	// h ^= h >> 33; h *= prime2v
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	// h ^= h >> 29; h *= prime3v
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	// h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET

// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks feeds 32-byte blocks of b through round() into the four 64-bit
// accumulators stored at offsets 0, 8, 16 and 24 of d, writes the updated
// accumulators back to d, and returns the number of bytes of b it consumed.
//
// Arguments (40 bytes total, no local frame):
//	d+0(FP)        pointer to the digest state
//	b_base+8(FP)   pointer to the first byte of b
//	b_len+16(FP)   length of b in bytes
//	ret+32(FP)     number of bytes consumed
//
// The loop body runs before the bound is checked, so the caller must pass at
// least 32 bytes; with less, the first iteration would read past the end of b.
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// Load fixed primes needed for round.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// Load slice: SI = base pointer, DX = length, BX = end of b minus 32,
	// i.e. the last address at which a full block can still start.
	MOVQ b_base+8(FP), SI
	MOVQ b_len+16(FP), DX
	LEAQ (SI)(DX*1), BX
	SUBQ $32, BX

	// Load vN from d.
	MOVQ d+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  blockLoop

	// Copy vN back to d.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// The number of bytes written is SI minus the old base pointer.
	SUBQ b_base+8(FP), SI
	MOVQ SI, ret+32(FP)

	RET
