28 #include <emmintrin.h> 29 #if defined(HAVE_SSSE3) 30 #include <tmmintrin.h> 32 #if defined(HAVE_SSE41) 33 #include <smmintrin.h> 36 #include <immintrin.h> 39 #include <x86intrin.h> 44 ALIGN(64) static const uint64_t blake2b_IV[8] = {
45 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
46 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
47 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
48 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
55 const uint8_t * v = (
const uint8_t *)(blake2b_IV);
56 const uint8_t * p = (
const uint8_t *)(P);
57 uint8_t *
h = (uint8_t *)(S->h);
115 memcpy(block, key, keylen);
124 __m128i row1l, row1h;
125 __m128i row2l, row2h;
126 __m128i row3l, row3h;
127 __m128i row4l, row4h;
130 #if defined(HAVE_SSSE3) && !defined(HAVE_XOP) 131 const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
132 const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
134 #if defined(HAVE_SSE41) 135 const __m128i m0 =
LOADU(block + 00);
136 const __m128i m1 =
LOADU(block + 16);
137 const __m128i m2 =
LOADU(block + 32);
138 const __m128i m3 =
LOADU(block + 48);
139 const __m128i m4 =
LOADU(block + 64);
140 const __m128i m5 =
LOADU(block + 80);
141 const __m128i m6 =
LOADU(block + 96);
142 const __m128i m7 =
LOADU(block + 112);
144 const uint64_t m0 = ( ( uint64_t * )block )[ 0];
145 const uint64_t m1 = ( ( uint64_t * )block )[ 1];
146 const uint64_t m2 = ( ( uint64_t * )block )[ 2];
147 const uint64_t m3 = ( ( uint64_t * )block )[ 3];
148 const uint64_t m4 = ( ( uint64_t * )block )[ 4];
149 const uint64_t m5 = ( ( uint64_t * )block )[ 5];
150 const uint64_t m6 = ( ( uint64_t * )block )[ 6];
151 const uint64_t m7 = ( ( uint64_t * )block )[ 7];
152 const uint64_t m8 = ( ( uint64_t * )block )[ 8];
153 const uint64_t m9 = ( ( uint64_t * )block )[ 9];
154 const uint64_t m10 = ( ( uint64_t * )block )[10];
155 const uint64_t m11 = ( ( uint64_t * )block )[11];
156 const uint64_t m12 = ( ( uint64_t * )block )[12];
157 const uint64_t m13 = ( ( uint64_t * )block )[13];
158 const uint64_t m14 = ( ( uint64_t * )block )[14];
159 const uint64_t m15 = ( ( uint64_t * )block )[15];
165 row3l =
LOADU(&blake2b_IV[0]);
166 row3h =
LOADU(&blake2b_IV[2]);
167 row4l = _mm_xor_si128(
LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0,
S->counter));
168 row4h = _mm_xor_si128(
LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L -
S->lastblock, 0L -
S->lastblock));
181 row1l = _mm_xor_si128(row3l, row1l);
182 row1h = _mm_xor_si128(row3h, row1h);
185 row2l = _mm_xor_si128(row4l, row2l);
186 row2h = _mm_xor_si128(row4h, row2h);
196 size_t left = S->buflen;
201 memcpy(S->buf + left, in, fill);
205 blake2b_compress(S, S->buf);
210 memcpy(S->buf + left, in, inlen);
211 S->buflen += (uint8_t) inlen;
228 blake2b_compress(S, S->buf);
233 S->counter += S->buflen;
236 blake2b_compress(S, S->buf);
237 memcpy(out, &S->h[0], outlen);
242 int eq_blake2b(uint8_t *out,
const void *in,
const void *key,
const uint8_t outlen,
const uint64_t inlen, uint8_t keylen)
247 if (!in || !out)
return -1;
248 if (NULL == key) keylen = 0;
int eq_blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen)
int eq_blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen)
int eq_blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen)
#define ROUND(lh, ll, rh, rl, kh, kl)
int eq_blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen)
int eq_blake2b_init_param(blake2b_state *S, const blake2b_param *P)
ALIGN(64) static const uint64_t blake2b_IV[8]
void * memcpy(void *a, const void *b, size_t c)
int eq_blake2b_init(blake2b_state *S, const uint8_t outlen)