podman
1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file provides the generic implementation of Sum and MAC. Other files
6// might provide optimized assembly implementations of some of this code.
7
8package poly13059
10import (11"encoding/binary"12"math/bits"13)
14
15// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
16// for a 64 bytes message is approximately
17//
18// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
19//
20// for some secret r and s. It can be computed sequentially like
21//
22// for len(msg) > 0:
23// h += read(msg, 16)
24// h *= r
25// h %= 2¹³⁰ - 5
26// return h + s
27//
28// All the complexity is about doing performant constant-time math on numbers
29// larger than any available numeric type.
30
31func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {32h := newMACGeneric(key)33h.Write(msg)34h.Sum(out)35}
36
37func newMACGeneric(key *[32]byte) macGeneric {38m := macGeneric{}39initialize(key, &m.macState)40return m41}
42
43// macState holds numbers in saturated 64-bit little-endian limbs. That is,
44// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
45type macState struct {46// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but47// can grow larger during and after rounds. It must, however, remain below48// 2 * (2¹³⁰ - 5).49h [3]uint6450// r and s are the private key components.51r [2]uint6452s [2]uint6453}
54
55type macGeneric struct {56macState
57
58buffer [TagSize]byte59offset int60}
61
62// Write splits the incoming message into TagSize chunks, and passes them to
63// update. It buffers incomplete chunks.
64func (h *macGeneric) Write(p []byte) (int, error) {65nn := len(p)66if h.offset > 0 {67n := copy(h.buffer[h.offset:], p)68if h.offset+n < TagSize {69h.offset += n70return nn, nil71}72p = p[n:]73h.offset = 074updateGeneric(&h.macState, h.buffer[:])75}76if n := len(p) - (len(p) % TagSize); n > 0 {77updateGeneric(&h.macState, p[:n])78p = p[n:]79}80if len(p) > 0 {81h.offset += copy(h.buffer[h.offset:], p)82}83return nn, nil84}
85
86// Sum flushes the last incomplete chunk from the buffer, if any, and generates
87// the MAC output. It does not modify its state, in order to allow for multiple
88// calls to Sum, even if no Write is allowed after Sum.
89func (h *macGeneric) Sum(out *[TagSize]byte) {90state := h.macState91if h.offset > 0 {92updateGeneric(&state, h.buffer[:h.offset])93}94finalize(out, &state.h, &state.s)95}
96
97// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
98// clears some bits of the secret coefficient to make it possible to implement
99// multiplication more efficiently.
100const (101rMask0 = 0x0FFFFFFC0FFFFFFF102rMask1 = 0x0FFFFFFC0FFFFFFC103)
104
105// initialize loads the 256-bit key into the two 128-bit secret values r and s.
106func initialize(key *[32]byte, m *macState) {107m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0108m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1109m.s[0] = binary.LittleEndian.Uint64(key[16:24])110m.s[1] = binary.LittleEndian.Uint64(key[24:32])111}
112
113// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
114// bits.Mul64 and bits.Add64 intrinsics.
115type uint128 struct {116lo, hi uint64117}
118
119func mul64(a, b uint64) uint128 {120hi, lo := bits.Mul64(a, b)121return uint128{lo, hi}122}
123
124func add128(a, b uint128) uint128 {125lo, c := bits.Add64(a.lo, b.lo, 0)126hi, c := bits.Add64(a.hi, b.hi, c)127if c != 0 {128panic("poly1305: unexpected overflow")129}130return uint128{lo, hi}131}
132
133func shiftRightBy2(a uint128) uint128 {134a.lo = a.lo>>2 | (a.hi&3)<<62135a.hi = a.hi >> 2136return a137}
138
139// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
140// 128 bits of message, it computes
141//
142// h₊ = (h + m) * r mod 2¹³⁰ - 5
143//
144// If the msg length is not a multiple of TagSize, it assumes the last
145// incomplete chunk is the final one.
146func updateGeneric(state *macState, msg []byte) {147h0, h1, h2 := state.h[0], state.h[1], state.h[2]148r0, r1 := state.r[0], state.r[1]149
150for len(msg) > 0 {151var c uint64152
153// For the first step, h + m, we use a chain of bits.Add64 intrinsics.154// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially155// reduced at the end of the multiplication below.156//157// The spec requires us to set a bit just above the message size, not to158// hide leading zeroes. For full chunks, that's 1 << 128, so we can just159// add 1 to the most significant (2¹²⁸) limb, h2.160if len(msg) >= TagSize {161h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)162h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)163h2 += c + 1164
165msg = msg[TagSize:]166} else {167var buf [TagSize]byte168copy(buf[:], msg)169buf[len(msg)] = 1170
171h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)172h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)173h2 += c174
175msg = nil176}177
178// Multiplication of big number limbs is similar to elementary school179// columnar multiplication. Instead of digits, there are 64-bit limbs.180//181// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.182//183// h2 h1 h0 x184// r1 r0 =185// ----------------186// h2r0 h1r0 h0r0 <-- individual 128-bit products187// + h2r1 h1r1 h0r1188// ------------------------189// m3 m2 m1 m0 <-- result in 128-bit overlapping limbs190// ------------------------191// m3.hi m2.hi m1.hi m0.hi <-- carry propagation192// + m3.lo m2.lo m1.lo m0.lo193// -------------------------------194// t4 t3 t2 t1 t0 <-- final result in 64-bit limbs195//196// The main difference from pen-and-paper multiplication is that we do197// carry propagation in a separate step, as if we wrote two digit sums198// at first (the 128-bit limbs), and then carried the tens all at once.199
200h0r0 := mul64(h0, r0)201h1r0 := mul64(h1, r0)202h2r0 := mul64(h2, r0)203h0r1 := mul64(h0, r1)204h1r1 := mul64(h1, r1)205h2r1 := mul64(h2, r1)206
207// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their208// top 4 bits cleared by rMask{0,1}, we know that their product is not going209// to overflow 64 bits, so we can ignore the high part of the products.210//211// This also means that the product doesn't have a fifth limb (t4).212if h2r0.hi != 0 {213panic("poly1305: unexpected overflow")214}215if h2r1.hi != 0 {216panic("poly1305: unexpected overflow")217}218
219m0 := h0r0220m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again221m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.222m3 := h2r1223
224t0 := m0.lo225t1, c := bits.Add64(m1.lo, m0.hi, 0)226t2, c := bits.Add64(m2.lo, m1.hi, c)227t3, _ := bits.Add64(m3.lo, m2.hi, c)228
229// Now we have the result as 4 64-bit limbs, and we need to reduce it230// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do231// a cheap partial reduction according to the reduction identity232//233// c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5234//235// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is236// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the237// assumptions we make about h in the rest of the code.238//239// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23240
241// We split the final result at the 2¹³⁰ mark into h and cc, the carry.242// Note that the carry bits are effectively shifted left by 2, in other243// words, cc = c * 4 for the c in the reduction identity.244h0, h1, h2 = t0, t1, t2&maskLow2Bits245cc := uint128{t2 & maskNotLow2Bits, t3}246
247// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.248
249h0, c = bits.Add64(h0, cc.lo, 0)250h1, c = bits.Add64(h1, cc.hi, c)251h2 += c252
253cc = shiftRightBy2(cc)254
255h0, c = bits.Add64(h0, cc.lo, 0)256h1, c = bits.Add64(h1, cc.hi, c)257h2 += c258
259// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most260//261// 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1262}263
264state.h[0], state.h[1], state.h[2] = h0, h1, h2265}
266
267const (268maskLow2Bits uint64 = 0x0000000000000003269maskNotLow2Bits uint64 = ^maskLow2Bits270)
271
272// select64 returns x if v == 1 and y if v == 0, in constant time.
273func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }274
275// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
276const (277p0 = 0xFFFFFFFFFFFFFFFB278p1 = 0xFFFFFFFFFFFFFFFF279p2 = 0x0000000000000003280)
281
282// finalize completes the modular reduction of h and computes
283//
284// out = h + s mod 2¹²⁸
285func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {286h0, h1, h2 := h[0], h[1], h[2]287
288// After the partial reduction in updateGeneric, h might be more than289// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction290// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the291// result if the subtraction underflows, and t otherwise.292
293hMinusP0, b := bits.Sub64(h0, p0, 0)294hMinusP1, b := bits.Sub64(h1, p1, b)295_, b = bits.Sub64(h2, p2, b)296
297// h = h if h < p else h - p298h0 = select64(b, h0, hMinusP0)299h1 = select64(b, h1, hMinusP1)300
301// Finally, we compute the last Poly1305 step302//303// tag = h + s mod 2¹²⁸304//305// by just doing a wide addition with the 128 low bits of h and discarding306// the overflow.307h0, c := bits.Add64(h0, s[0], 0)308h1, _ = bits.Add64(h1, s[1], c)309
310binary.LittleEndian.PutUint64(out[0:8], h0)311binary.LittleEndian.PutUint64(out[8:16], h1)312}
313