Fabcoin Core 0.16.2 - P2P Digital Currency
blake2.cpp
1 // blake2.cpp - written and placed in the public domain by Jeffrey Walton and Zooko
2 // Wilcox-O'Hearn. Copyright assigned to the Crypto++ project.
3 // Based on Aumasson, Neves, Wilcox-O'Hearn and Winnerlein's reference BLAKE2
4 // implementation at http://github.com/BLAKE2/BLAKE2.
5 
6 #include "pch.h"
7 #include "config.h"
8 #include "cryptlib.h"
9 #include "argnames.h"
10 #include "algparam.h"
11 #include "blake2.h"
12 #include "cpu.h"
13 
14 NAMESPACE_BEGIN(CryptoPP)
15 
16 // Uncomment for benchmarking C++ against SSE2 or NEON
17 // #undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
18 // #undef CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
19 
20 // Apple Clang 6.0/Clang 3.5 does not have SSSE3 intrinsics
21 // http://llvm.org/bugs/show_bug.cgi?id=20213
22 #if (defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION <= 60000)) || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION <= 30500))
23 # undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
24 #endif
25 
26 // Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x (Win64 supplies it except for VS2008).
27 // Also see http://stackoverflow.com/a/38547909/608639
28 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600))
29 inline __m128i _mm_set_epi64x(const word64 a, const word64 b)
30 {
31  const word64 t[2] = {b,a}; __m128i r;
32  memcpy(&r, &t, sizeof(r));
33  return r;
34 }
35 #endif
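
The shim above initializes the temporary as {b,a} because _mm_set_epi64x(a, b) must place its second argument in the low 64-bit lane. A minimal check of that lane order, assuming a little-endian x86 target with SSE2 headers available:

    #include <emmintrin.h>
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        // _mm_set_epi64x(high, low): the second argument lands in lane 0.
        const __m128i r = _mm_set_epi64x(0x1111111111111111ULL, 0x2222222222222222ULL);
        uint64_t lanes[2];
        std::memcpy(lanes, &r, sizeof(lanes));
        assert(lanes[0] == 0x2222222222222222ULL); // low lane = second argument
        assert(lanes[1] == 0x1111111111111111ULL); // high lane = first argument
        return 0;
    }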
36 
37 // C/C++ implementation
38 static void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
39 static void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
40 
41 // Also see http://github.com/weidai11/cryptopp/issues/247 for singling out SunCC 5.12
42 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
43 static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
44 # if (__SUNPRO_CC != 0x5120)
45 static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
46 # endif
47 #endif
48 
49 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
50 static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
51 static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
52 #endif
53 
54 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
55 static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
56 static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
57 #endif
58 
59 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
60 
61 // IV and Sigma are a better fit as part of BLAKE2_Base, but that places
62 // the constants out of reach for the NEON, SSE2 and SSE4 implementations.
63 template<bool T_64bit>
64 struct CRYPTOPP_NO_VTABLE BLAKE2_IV {};
65 
67 template<>
68 struct CRYPTOPP_NO_VTABLE BLAKE2_IV<false>
69 {
70  CRYPTOPP_CONSTANT(IVSIZE = 8)
71  // Always align for NEON and SSE
72  CRYPTOPP_ALIGN_DATA(16) static const word32 iv[8];
73 };
74 
75 CRYPTOPP_ALIGN_DATA(16)
76 const word32 BLAKE2_IV<false>::iv[8] = {
77  0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
78  0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
79 };
80 
81 #define BLAKE2S_IV(n) BLAKE2_IV<false>::iv[n]
82 
83 template<>
84 struct CRYPTOPP_NO_VTABLE BLAKE2_IV<true>
85 {
86  CRYPTOPP_CONSTANT(IVSIZE = 8)
87  // Always align for NEON and SSE
88  CRYPTOPP_ALIGN_DATA(16) static const word64 iv[8];
89 };
90 
91 CRYPTOPP_ALIGN_DATA(16)
92 const word64 BLAKE2_IV<true>::iv[8] = {
93  W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
94  W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
95  W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
96  W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)
97 };
98 
99 #define BLAKE2B_IV(n) BLAKE2_IV<true>::iv[n]
100 
101 // IV and Sigma are a better fit as part of BLAKE2_Base, but that places
102 // the constants out of reach for the NEON, SSE2 and SSE4 implementations.
103 template<bool T_64bit>
104 struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {};
105 
106 template<>
107 struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<false>
108 {
109  // Always align for NEON and SSE
110  CRYPTOPP_ALIGN_DATA(16) static const byte sigma[10][16];
111 };
112 
113 CRYPTOPP_ALIGN_DATA(16)
114 const byte BLAKE2_Sigma<false>::sigma[10][16] = {
115  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
116  { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
117  { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
118  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
119  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
120  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
121  { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
122  { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
123  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
124  { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
125 };
126 
128 template<>
129 struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<true>
130 {
131  // Always align for NEON and SSE
132  CRYPTOPP_ALIGN_DATA(16) static const byte sigma[12][16];
133 };
134 
135 CRYPTOPP_ALIGN_DATA(16)
136 const byte BLAKE2_Sigma<true>::sigma[12][16] = {
137  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
138  { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
139  { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
140  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
141  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
142  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
143  { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
144  { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
145  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
146  { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
147  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
148  { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
149 };
150 
151 typedef void (*pfnCompress32)(const byte*, BLAKE2_State<word32, false>&);
152 typedef void (*pfnCompress64)(const byte*, BLAKE2_State<word64, true>&);
153 
154 pfnCompress64 InitializeCompress64Fn()
155 {
156 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
157  if (HasSSE4())
158  return &BLAKE2_SSE4_Compress64;
159  else
160 #endif
161 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
162 # if (__SUNPRO_CC != 0x5120)
163  if (HasSSE2())
164  return &BLAKE2_SSE2_Compress64;
165  else
166 # endif
167 #endif
168 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
169  if (HasNEON())
170  return &BLAKE2_NEON_Compress64;
171  else
172 #endif
173  return &BLAKE2_CXX_Compress64;
174 }
175 
176 pfnCompress32 InitializeCompress32Fn()
177 {
178 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
179  if (HasSSE4())
180  return &BLAKE2_SSE4_Compress32;
181  else
182 #endif
183 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
184  if (HasSSE2())
185  return &BLAKE2_SSE2_Compress32;
186  else
187 #endif
188 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
189  if (HasNEON())
190  return &BLAKE2_NEON_Compress32;
191  else
192 #endif
193  return &BLAKE2_CXX_Compress32;
194 }
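
Both initializers select the most capable compression routine the CPU supports and fall back to the portable C++ code; the Compress() specializations further down cache the result in a function-local static, so the feature probes run only once. A minimal sketch of the same one-time dispatch idiom, with hypothetical names:

    // Hypothetical kernel names; CpuHasSSE4() stands in for HasSSE4().
    typedef void (*pfnKernel)(const unsigned char*);

    static void Kernel_CXX(const unsigned char*)  { /* portable fallback */ }
    static void Kernel_SSE4(const unsigned char*) { /* vectorized path */ }
    static bool CpuHasSSE4() { return false; }

    static pfnKernel InitializeKernelFn()
    {
        return CpuHasSSE4() ? &Kernel_SSE4 : &Kernel_CXX;
    }

    void RunKernel(const unsigned char* input)
    {
        // A function-local static is initialized exactly once, so the CPU
        // probe happens on the first call; later calls are one indirect call.
        static const pfnKernel s_pfn = InitializeKernelFn();
        s_pfn(input);
    }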
195 
196 #endif // CRYPTOPP_DOXYGEN_PROCESSING
197 
198 BLAKE2_ParameterBlock<false>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
199  const byte* saltStr, size_t saltLen,
200  const byte* personalizationStr, size_t personalizationLen)
201 {
202  // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
203  digestLength = (byte)digestLen;
204  keyLength = (byte)keyLen;
205  fanout = depth = 1;
206  nodeDepth = innerLength = 0;
207 
208  memset(leafLength, 0x00, COUNTOF(leafLength));
209  memset(nodeOffset, 0x00, COUNTOF(nodeOffset));
210 
211  if (saltStr && saltLen)
212  {
213  memcpy_s(salt, COUNTOF(salt), saltStr, saltLen);
214  const size_t rem = COUNTOF(salt) - saltLen;
215  const size_t off = COUNTOF(salt) - rem;
216  if (rem)
217  memset(salt+off, 0x00, rem);
218  }
219  else
220  {
221  memset(salt, 0x00, COUNTOF(salt));
222  }
223 
224  if (personalizationStr && personalizationLen)
225  {
226  memcpy_s(personalization, COUNTOF(personalization), personalizationStr, personalizationLen);
227  const size_t rem = COUNTOF(personalization) - personalizationLen;
228  const size_t off = COUNTOF(personalization) - rem;
229  if (rem)
230  memset(personalization+off, 0x00, rem);
231  }
232  else
233  {
234  memset(personalization, 0x00, COUNTOF(personalization));
235  }
236 }
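
In the constructor above, off always equals saltLen (and personalizationLen), so each branch reduces to "copy the supplied bytes, zero the rest of the field". A hypothetical helper with the same net effect, assuming the caller guarantees srcLen <= dstSize as these constructors do:

    #include <cstring>

    static void CopyAndZeroPad(unsigned char* dst, size_t dstSize,
                               const unsigned char* src, size_t srcLen)
    {
        if (src && srcLen)
            std::memcpy(dst, src, srcLen);                  // copy what was supplied
        std::memset(dst + srcLen, 0x00, dstSize - srcLen);  // zero-pad the tail
    }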
237 
238 BLAKE2_ParameterBlock<true>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
239  const byte* saltStr, size_t saltLen,
240  const byte* personalizationStr, size_t personalizationLen)
241 {
242  // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
243  digestLength = (byte)digestLen;
244  keyLength = (byte)keyLen;
245  fanout = depth = 1;
246  nodeDepth = innerLength = 0;
247 
248  memset(rfu, 0x00, COUNTOF(rfu));
249  memset(leafLength, 0x00, COUNTOF(leafLength));
250  memset(nodeOffset, 0x00, COUNTOF(nodeOffset));
251 
252  if (saltStr && saltLen)
253  {
254  memcpy_s(salt, COUNTOF(salt), saltStr, saltLen);
255  const size_t rem = COUNTOF(salt) - saltLen;
256  const size_t off = COUNTOF(salt) - rem;
257  if (rem)
258  memset(salt+off, 0x00, rem);
259  }
260  else
261  {
262  memset(salt, 0x00, COUNTOF(salt));
263  }
264 
265  if (personalizationStr && personalizationLen)
266  {
267  memcpy_s(personalization, COUNTOF(personalization), personalizationStr, personalizationLen);
268  const size_t rem = COUNTOF(personalization) - personalizationLen;
269  const size_t off = COUNTOF(personalization) - rem;
270  if (rem)
271  memset(personalization+off, 0x00, rem);
272  }
273  else
274  {
275  memset(personalization, 0x00, COUNTOF(personalization));
276  }
277 }
278 
279 template <class W, bool T_64bit>
280 void BLAKE2_Base<W, T_64bit>::UncheckedSetKey(const byte *key, unsigned int length, const CryptoPP::NameValuePairs& params)
281 {
282  if (key && length)
283  {
284  AlignedSecByteBlock temp(BLOCKSIZE);
285  memcpy_s(temp, BLOCKSIZE, key, length);
286 
287  const size_t rem = BLOCKSIZE - length;
288  if (rem)
289  memset(temp+length, 0x00, rem);
290 
291  m_key.swap(temp);
292  }
293  else
294  {
295  m_key.resize(0);
296  }
297 
298 #if defined(__COVERITY__)
299  // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
300  ParameterBlock& block = *m_block.data();
301  memset(m_block.data(), 0x00, sizeof(ParameterBlock));
302 #else
303  // Set Head bytes; Tail bytes are set below
304  ParameterBlock& block = *m_block.data();
305  memset(m_block.data(), 0x00, T_64bit ? 32 : 16);
306 #endif
307 
308  block.keyLength = (byte)length;
309  block.digestLength = (byte)params.GetIntValueWithDefault(Name::DigestSize(), DIGESTSIZE);
310  block.fanout = block.depth = 1;
311 
312  ConstByteArrayParameter t;
313  if (params.GetValue(Name::Salt(), t) && t.begin() && t.size())
314  {
315  memcpy_s(block.salt, COUNTOF(block.salt), t.begin(), t.size());
316  const size_t rem = COUNTOF(block.salt) - t.size();
317  const size_t off = COUNTOF(block.salt) - rem;
318  if (rem)
319  memset(block.salt+off, 0x00, rem);
320  }
321  else
322  {
323  memset(block.salt, 0x00, COUNTOF(block.salt));
324  }
325 
326  if (params.GetValue(Name::Personalization(), t) && t.begin() && t.size())
327  {
328  memcpy_s(block.personalization, COUNTOF(block.personalization), t.begin(), t.size());
329  const size_t rem = COUNTOF(block.personalization) - t.size();
330  const size_t off = COUNTOF(block.personalization) - rem;
331  if (rem)
332  memset(block.personalization+off, 0x00, rem);
333  }
334  else
335  {
336  memset(block.personalization, 0x00, COUNTOF(block.personalization));
337  }
338 }
339 
340 template <class W, bool T_64bit>
341 BLAKE2_Base<W, T_64bit>::BLAKE2_Base() : m_state(1), m_block(1), m_digestSize(DIGESTSIZE), m_treeMode(false)
342 {
343  UncheckedSetKey(NULL, 0, g_nullNameValuePairs);
344  Restart();
345 }
346 
347 template <class W, bool T_64bit>
348 BLAKE2_Base<W, T_64bit>::BLAKE2_Base(bool treeMode, unsigned int digestSize) : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
349 {
350  CRYPTOPP_ASSERT(digestSize <= DIGESTSIZE);
351 
352  UncheckedSetKey(NULL, 0, MakeParameters(Name::DigestSize(), (int)digestSize)(Name::TreeMode(), treeMode, false));
353  Restart();
354 }
355 
356 template <class W, bool T_64bit>
357 BLAKE2_Base<W, T_64bit>::BLAKE2_Base(const byte *key, size_t keyLength, const byte* salt, size_t saltLength,
358  const byte* personalization, size_t personalizationLength, bool treeMode, unsigned int digestSize)
359  : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
360 {
361  CRYPTOPP_ASSERT(keyLength <= MAX_KEYLENGTH);
362  CRYPTOPP_ASSERT(digestSize <= DIGESTSIZE);
363  CRYPTOPP_ASSERT(saltLength <= SALTSIZE);
364  CRYPTOPP_ASSERT(personalizationLength <= PERSONALIZATIONSIZE);
365 
366  UncheckedSetKey(key, static_cast<unsigned int>(keyLength), MakeParameters(Name::DigestSize(),(int)digestSize)(Name::TreeMode(),treeMode, false)
367  (Name::Salt(), ConstByteArrayParameter(salt, saltLength))(Name::Personalization(), ConstByteArrayParameter(personalization, personalizationLength)));
368  Restart();
369 }
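
A short usage sketch for the keyed constructor, assuming the BLAKE2b wrapper declared in blake2.h (a BLAKE2_Base<word64, true> with this argument order). The key is absorbed during Restart(), so Update() only ever sees message bytes:

    #include "blake2.h"
    #include <string>

    using namespace CryptoPP;

    std::string KeyedDigest(const byte* key, size_t keyLen,
                            const byte* msg, size_t msgLen)
    {
        BLAKE2b mac(key, keyLen);        // keyed instance, default 64-byte digest
        mac.Update(msg, msgLen);         // message bytes only
        std::string digest(mac.DigestSize(), '\0');
        mac.Final(reinterpret_cast<byte*>(&digest[0]));
        return digest;
    }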
370 
371 template <class W, bool T_64bit>
372 void BLAKE2_Base<W, T_64bit>::Restart()
373 {
374  static const W zero[2] = {0,0};
375  Restart(*m_block.data(), zero);
376 }
377 
378 template <class W, bool T_64bit>
379 void BLAKE2_Base<W, T_64bit>::Restart(const BLAKE2_ParameterBlock<T_64bit>& block, const W counter[2])
380 {
381  // We take a parameter block as a parameter to allow customized state.
382  // Avoid the copy of the parameter block when we are passing our own block.
383  if (&block != m_block.data())
384  {
385  memcpy_s(m_block.data(), sizeof(ParameterBlock), &block, sizeof(ParameterBlock));
386  m_block.data()->digestLength = (byte)m_digestSize;
387  m_block.data()->keyLength = (byte)m_key.size();
388  }
389 
390  State& state = *m_state.data();
391  state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0;
392 
393  if (counter != NULL)
394  {
395  state.t[0] = counter[0];
396  state.t[1] = counter[1];
397  }
398 
399  PutBlock<W, LittleEndian, true> put(m_block.data(), &state.h[0]);
400  put(BLAKE2_IV<T_64bit>::iv[0])(BLAKE2_IV<T_64bit>::iv[1])(BLAKE2_IV<T_64bit>::iv[2])(BLAKE2_IV<T_64bit>::iv[3]);
401  put(BLAKE2_IV<T_64bit>::iv[4])(BLAKE2_IV<T_64bit>::iv[5])(BLAKE2_IV<T_64bit>::iv[6])(BLAKE2_IV<T_64bit>::iv[7]);
402 
403  // When BLAKE2 is keyed, the input stream is simply {key||message}. Key it
404  // during Restart to avoid FirstPut and friends. Key size == 0 means no key.
405  if (m_key.size())
406  Update(m_key, m_key.size());
407 }
408 
409 template <class W, bool T_64bit>
410 void BLAKE2_Base<W, T_64bit>::Update(const byte *input, size_t length)
411 {
412  State& state = *m_state.data();
413  if (state.length + length > BLOCKSIZE)
414  {
415  // Complete current block
416  const size_t fill = BLOCKSIZE - state.length;
417  memcpy_s(&state.buffer[state.length], fill, input, fill);
418 
419  IncrementCounter();
420  Compress(state.buffer);
421  state.length = 0;
422 
423  length -= fill, input += fill;
424 
425  // Compress in-place to avoid copies
426  while (length > BLOCKSIZE)
427  {
428  IncrementCounter();
429  Compress(input);
430  length -= BLOCKSIZE, input += BLOCKSIZE;
431  }
432  }
433 
434  // Copy tail bytes
435  if (input && length)
436  {
437  CRYPTOPP_ASSERT(length <= BLOCKSIZE - state.length);
438  memcpy_s(&state.buffer[state.length], length, input, length);
439  state.length += static_cast<unsigned int>(length);
440  }
441 }
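
Note the strict '>' in the comparison above: a buffer holding exactly BLOCKSIZE bytes is not compressed by Update(), because the final block must be compressed with the last-block flag set in TruncatedFinal(). A sketch, assuming the BLAKE2b wrapper from blake2.h, of the invariant that splitting input across Update() calls cannot change the digest:

    #include "blake2.h"
    #include <cassert>
    #include <cstring>

    using namespace CryptoPP;

    void IncrementalMatchesOneShot(const byte* data, size_t len)
    {
        byte d1[BLAKE2b::DIGESTSIZE], d2[BLAKE2b::DIGESTSIZE];

        BLAKE2b oneShot;
        oneShot.Update(data, len);
        oneShot.Final(d1);

        BLAKE2b incremental;                                // same parameters, split input
        incremental.Update(data, len / 2);
        incremental.Update(data + len / 2, len - len / 2);
        incremental.Final(d2);

        assert(std::memcmp(d1, d2, sizeof(d1)) == 0);
    }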
442 
443 template <class W, bool T_64bit>
444 void BLAKE2_Base<W, T_64bit>::TruncatedFinal(byte *hash, size_t size)
445 {
446  this->ThrowIfInvalidTruncatedSize(size);
447 
448  // Set last block unconditionally
449  State& state = *m_state.data();
450  state.f[0] = static_cast<W>(-1);
451 
452  // Set last node if tree mode
453  if (m_treeMode)
454  state.f[1] = static_cast<W>(-1);
455 
456  // Increment counter for tail bytes only
457  IncrementCounter(state.length);
458 
459  memset(state.buffer + state.length, 0x00, BLOCKSIZE - state.length);
460  Compress(state.buffer);
461 
462  // Copy to caller buffer
463  memcpy_s(hash, size, &state.h[0], size);
464 
465  Restart();
466 }
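
TruncatedFinal() simply copies the leading size bytes of the state, so truncating a default 64-byte BLAKE2b digest to 32 bytes is not the same value as constructing the hash with digestSize = 32: the requested digest length sits in the parameter block and changes the initial state. A sketch of the distinction, assuming the BLAKE2b wrapper from blake2.h exposes the constructors shown in this file:

    #include "blake2.h"

    using namespace CryptoPP;

    void TwoDifferentShortDigests(const byte* msg, size_t len)
    {
        byte trunc[32], native[32];

        BLAKE2b full;                   // digestLength = 64 in the parameter block
        full.Update(msg, len);
        full.TruncatedFinal(trunc, 32); // first 32 bytes of the 64-byte digest

        BLAKE2b short32(false, 32);     // digestLength = 32 in the parameter block
        short32.Update(msg, len);
        short32.Final(native);          // a different 32-byte value

        // trunc and native are not expected to match.
    }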
467 
468 template <class W, bool T_64bit>
469 void BLAKE2_Base<W, T_64bit>::IncrementCounter(size_t count)
470 {
471  State& state = *m_state.data();
472  state.t[0] += static_cast<W>(count);
473  state.t[1] += !!(state.t[0] < count);
474 }
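
t[0] and t[1] form a single counter of bytes hashed (128-bit for BLAKE2b, 64-bit for BLAKE2s). Since count never exceeds one block, 't[0] < count' after the addition can only mean the low word wrapped, so !!(...) adds exactly the carry. A worked example of the carry detection with 32-bit words:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        uint32_t t0 = 0xFFFFFFC0u, t1 = 0; // low word 64 bytes from wrapping
        const uint32_t count = 0x80u;      // 128 more bytes hashed
        t0 += count;                       // wraps around to 0x40
        t1 += !!(t0 < count);              // 0x40 < 0x80 detects the wrap
        assert(t0 == 0x40u && t1 == 1u);
        return 0;
    }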
475 
476 template <>
477 void BLAKE2_Base<word64, true>::Compress(const byte *input)
478 {
479  // Selects the most advanced implementation at runtime
480  static const pfnCompress64 s_pfn = InitializeCompress64Fn();
481  s_pfn(input, *m_state.data());
482 }
483 
484 template <>
485 void BLAKE2_Base<word32, false>::Compress(const byte *input)
486 {
487  // Selects the most advanced implementation at runtime
488  static const pfnCompress32 s_pfn = InitializeCompress32Fn();
489  s_pfn(input, *m_state.data());
490 }
491 
492 void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
493 {
494  #undef BLAKE2_G
495  #undef BLAKE2_ROUND
496 
497  #define BLAKE2_G(r,i,a,b,c,d) \
498  do { \
499  a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+0]]; \
500  d = rotrVariable<word64>(d ^ a, 32); \
501  c = c + d; \
502  b = rotrVariable<word64>(b ^ c, 24); \
503  a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+1]]; \
504  d = rotrVariable<word64>(d ^ a, 16); \
505  c = c + d; \
506  b = rotrVariable<word64>(b ^ c, 63); \
507  } while(0)
508 
509  #define BLAKE2_ROUND(r) \
510  do { \
511  BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
512  BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
513  BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \
514  BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \
515  BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \
516  BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \
517  BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
518  BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
519  } while(0)
520 
521  word64 m[16], v[16];
522 
523  GetBlock<word64, LittleEndian, true> get1(input);
524  get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
525 
526  GetBlock<word64, LittleEndian, true> get2(&state.h[0]);
527  get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
528 
529  v[ 8] = BLAKE2B_IV(0);
530  v[ 9] = BLAKE2B_IV(1);
531  v[10] = BLAKE2B_IV(2);
532  v[11] = BLAKE2B_IV(3);
533  v[12] = state.t[0] ^ BLAKE2B_IV(4);
534  v[13] = state.t[1] ^ BLAKE2B_IV(5);
535  v[14] = state.f[0] ^ BLAKE2B_IV(6);
536  v[15] = state.f[1] ^ BLAKE2B_IV(7);
537 
538  BLAKE2_ROUND( 0 );
539  BLAKE2_ROUND( 1 );
540  BLAKE2_ROUND( 2 );
541  BLAKE2_ROUND( 3 );
542  BLAKE2_ROUND( 4 );
543  BLAKE2_ROUND( 5 );
544  BLAKE2_ROUND( 6 );
545  BLAKE2_ROUND( 7 );
546  BLAKE2_ROUND( 8 );
547  BLAKE2_ROUND( 9 );
548  BLAKE2_ROUND( 10 );
549  BLAKE2_ROUND( 11 );
550 
551  for(unsigned int i = 0; i < 8; ++i)
552  state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]);
553 }
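
The BLAKE2_G macro is the BLAKE2b mixing function G from RFC 7693, with rotation constants 32, 24, 16 and 63; each round applies it to the four columns and then the four diagonals of the 4x4 state v. Written out as a plain function for readability (a sketch, not the dispatched implementation):

    #include <cstdint>

    static inline uint64_t rotr64(uint64_t x, unsigned n) // 0 < n < 64 here
    {
        return (x >> n) | (x << (64 - n));
    }

    static void G64(const uint64_t m[16], const unsigned char s[16], unsigned i,
                    uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d)
    {
        a = a + b + m[s[2*i+0]];  // first message word for this G position
        d = rotr64(d ^ a, 32);
        c = c + d;
        b = rotr64(b ^ c, 24);
        a = a + b + m[s[2*i+1]];  // second message word
        d = rotr64(d ^ a, 16);
        c = c + d;
        b = rotr64(b ^ c, 63);
    }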
554 
555 void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
556 {
557  #undef BLAKE2_G
558  #undef BLAKE2_ROUND
559 
560  #define BLAKE2_G(r,i,a,b,c,d) \
561  do { \
562  a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+0]]; \
563  d = rotrVariable<word32>(d ^ a, 16); \
564  c = c + d; \
565  b = rotrVariable<word32>(b ^ c, 12); \
566  a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+1]]; \
567  d = rotrVariable<word32>(d ^ a, 8); \
568  c = c + d; \
569  b = rotrVariable<word32>(b ^ c, 7); \
570  } while(0)
571 
572  #define BLAKE2_ROUND(r) \
573  do { \
574  BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
575  BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
576  BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \
577  BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \
578  BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \
579  BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \
580  BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
581  BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
582  } while(0)
583 
584  word32 m[16], v[16];
585 
586  GetBlock<word32, LittleEndian, true> get1(input);
587  get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
588 
589  GetBlock<word32, LittleEndian, true> get2(&state.h[0]);
590  get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
591 
592  v[ 8] = BLAKE2S_IV(0);
593  v[ 9] = BLAKE2S_IV(1);
594  v[10] = BLAKE2S_IV(2);
595  v[11] = BLAKE2S_IV(3);
596  v[12] = state.t[0] ^ BLAKE2S_IV(4);
597  v[13] = state.t[1] ^ BLAKE2S_IV(5);
598  v[14] = state.f[0] ^ BLAKE2S_IV(6);
599  v[15] = state.f[1] ^ BLAKE2S_IV(7);
600 
601  BLAKE2_ROUND( 0 );
602  BLAKE2_ROUND( 1 );
603  BLAKE2_ROUND( 2 );
604  BLAKE2_ROUND( 3 );
605  BLAKE2_ROUND( 4 );
606  BLAKE2_ROUND( 5 );
607  BLAKE2_ROUND( 6 );
608  BLAKE2_ROUND( 7 );
609  BLAKE2_ROUND( 8 );
610  BLAKE2_ROUND( 9 );
611 
612  for(unsigned int i = 0; i < 8; ++i)
613  state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]);
614 }
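
SSE2 provides shifts but no rotate instruction, so the vector code below emulates every rotation with a shift pair combined by XOR, the vector form of the scalar identity rotr(x, n) == (x >> n) ^ (x << (w - n)); XOR and OR agree because the two shifted halves occupy disjoint bits. The scalar equivalent:

    #include <cstdint>

    static inline uint32_t rotr32_via_xor(uint32_t x, unsigned n) // 0 < n < 32
    {
        // Matches e.g. _mm_xor_si128(_mm_srli_epi32(v,12), _mm_slli_epi32(v,20))
        // applied lane-wise in the SSE2 routines below.
        return (x >> n) ^ (x << (32 - n));
    }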
615 
616 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
617 static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
618 {
619  word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
620  GetBlock<word32, LittleEndian, true> get(input);
621  get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
622 
623  __m128i row1,row2,row3,row4;
624  __m128i buf1,buf2,buf3,buf4;
625  __m128i ff0,ff1;
626 
627  row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
628  row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
629  row3 = _mm_setr_epi32(BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3));
630  row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
631  buf1 = _mm_set_epi32(m6,m4,m2,m0);
632  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
633  row4 = _mm_xor_si128(row4,row1);
634  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
635  row3 = _mm_add_epi32(row3,row4);
636  row2 = _mm_xor_si128(row2,row3);
637  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
638 
639  buf2 = _mm_set_epi32(m7,m5,m3,m1);
640  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
641  row4 = _mm_xor_si128(row4,row1);
642  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
643  row3 = _mm_add_epi32(row3,row4);
644  row2 = _mm_xor_si128(row2,row3);
645  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
646 
647  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
648  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
649  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
650 
651  buf3 = _mm_set_epi32(m14,m12,m10,m8);
652  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
653  row4 = _mm_xor_si128(row4,row1);
654  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
655  row3 = _mm_add_epi32(row3,row4);
656  row2 = _mm_xor_si128(row2,row3);
657  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
658 
659  buf4 = _mm_set_epi32(m15,m13,m11,m9);
660  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
661  row4 = _mm_xor_si128(row4,row1);
662  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
663  row3 = _mm_add_epi32(row3,row4);
664  row2 = _mm_xor_si128(row2,row3);
665  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
666 
667  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
668  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
669  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
670 
671  buf1 = _mm_set_epi32(m13,m9,m4,m14);
672  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
673  row4 = _mm_xor_si128(row4,row1);
674  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
675  row3 = _mm_add_epi32(row3,row4);
676  row2 = _mm_xor_si128(row2,row3);
677  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
678 
679  buf2 = _mm_set_epi32(m6,m15,m8,m10);
680  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
681  row4 = _mm_xor_si128(row4,row1);
682  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
683  row3 = _mm_add_epi32(row3,row4);
684  row2 = _mm_xor_si128(row2,row3);
685  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
686 
687  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
688  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
689  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
690 
691  buf3 = _mm_set_epi32(m5,m11,m0,m1);
692  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
693  row4 = _mm_xor_si128(row4,row1);
694  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
695  row3 = _mm_add_epi32(row3,row4);
696  row2 = _mm_xor_si128(row2,row3);
697  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
698 
699  buf4 = _mm_set_epi32(m3,m7,m2,m12);
700  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
701  row4 = _mm_xor_si128(row4,row1);
702  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
703  row3 = _mm_add_epi32(row3,row4);
704  row2 = _mm_xor_si128(row2,row3);
705  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
706 
707  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
708  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
709  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
710 
711  buf1 = _mm_set_epi32(m15,m5,m12,m11);
712  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
713  row4 = _mm_xor_si128(row4,row1);
714  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
715  row3 = _mm_add_epi32(row3,row4);
716  row2 = _mm_xor_si128(row2,row3);
717  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
718 
719  buf2 = _mm_set_epi32(m13,m2,m0,m8);
720  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
721  row4 = _mm_xor_si128(row4,row1);
722  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
723  row3 = _mm_add_epi32(row3,row4);
724  row2 = _mm_xor_si128(row2,row3);
725  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
726 
727  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
728  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
729  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
730 
731  buf3 = _mm_set_epi32(m9,m7,m3,m10);
732  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
733  row4 = _mm_xor_si128(row4,row1);
734  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
735  row3 = _mm_add_epi32(row3,row4);
736  row2 = _mm_xor_si128(row2,row3);
737  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
738 
739  buf4 = _mm_set_epi32(m4,m1,m6,m14);
740  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
741  row4 = _mm_xor_si128(row4,row1);
742  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
743  row3 = _mm_add_epi32(row3,row4);
744  row2 = _mm_xor_si128(row2,row3);
745  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
746 
747  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
748  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
749  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
750 
751  buf1 = _mm_set_epi32(m11,m13,m3,m7);
752  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
753  row4 = _mm_xor_si128(row4,row1);
754  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
755  row3 = _mm_add_epi32(row3,row4);
756  row2 = _mm_xor_si128(row2,row3);
757  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
758 
759  buf2 = _mm_set_epi32(m14,m12,m1,m9);
760  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
761  row4 = _mm_xor_si128(row4,row1);
762  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
763  row3 = _mm_add_epi32(row3,row4);
764  row2 = _mm_xor_si128(row2,row3);
765  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
766 
767  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
768  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
769  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
770 
771  buf3 = _mm_set_epi32(m15,m4,m5,m2);
772  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
773  row4 = _mm_xor_si128(row4,row1);
774  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
775  row3 = _mm_add_epi32(row3,row4);
776  row2 = _mm_xor_si128(row2,row3);
777  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
778 
779  buf4 = _mm_set_epi32(m8,m0,m10,m6);
780  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
781  row4 = _mm_xor_si128(row4,row1);
782  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
783  row3 = _mm_add_epi32(row3,row4);
784  row2 = _mm_xor_si128(row2,row3);
785  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
786 
787  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
788  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
789  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
790 
791  buf1 = _mm_set_epi32(m10,m2,m5,m9);
792  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
793  row4 = _mm_xor_si128(row4,row1);
794  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
795  row3 = _mm_add_epi32(row3,row4);
796  row2 = _mm_xor_si128(row2,row3);
797  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
798 
799  buf2 = _mm_set_epi32(m15,m4,m7,m0);
800  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
801  row4 = _mm_xor_si128(row4,row1);
802  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
803  row3 = _mm_add_epi32(row3,row4);
804  row2 = _mm_xor_si128(row2,row3);
805  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
806 
807  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
808  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
809  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
810 
811  buf3 = _mm_set_epi32(m3,m6,m11,m14);
812  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
813  row4 = _mm_xor_si128(row4,row1);
814  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
815  row3 = _mm_add_epi32(row3,row4);
816  row2 = _mm_xor_si128(row2,row3);
817  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
818 
819  buf4 = _mm_set_epi32(m13,m8,m12,m1);
820  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
821  row4 = _mm_xor_si128(row4,row1);
822  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
823  row3 = _mm_add_epi32(row3,row4);
824  row2 = _mm_xor_si128(row2,row3);
825  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
826 
827  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
828  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
829  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
830 
831  buf1 = _mm_set_epi32(m8,m0,m6,m2);
832  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
833  row4 = _mm_xor_si128(row4,row1);
834  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
835  row3 = _mm_add_epi32(row3,row4);
836  row2 = _mm_xor_si128(row2,row3);
837  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
838 
839  buf2 = _mm_set_epi32(m3,m11,m10,m12);
840  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
841  row4 = _mm_xor_si128(row4,row1);
842  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
843  row3 = _mm_add_epi32(row3,row4);
844  row2 = _mm_xor_si128(row2,row3);
845  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
846 
847  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
848  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
849  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
850 
851  buf3 = _mm_set_epi32(m1,m15,m7,m4);
852  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
853  row4 = _mm_xor_si128(row4,row1);
854  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
855  row3 = _mm_add_epi32(row3,row4);
856  row2 = _mm_xor_si128(row2,row3);
857  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
858 
859  buf4 = _mm_set_epi32(m9,m14,m5,m13);
860  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
861  row4 = _mm_xor_si128(row4,row1);
862  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
863  row3 = _mm_add_epi32(row3,row4);
864  row2 = _mm_xor_si128(row2,row3);
865  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
866 
867  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
868  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
869  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
870 
871  buf1 = _mm_set_epi32(m4,m14,m1,m12);
872  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
873  row4 = _mm_xor_si128(row4,row1);
874  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
875  row3 = _mm_add_epi32(row3,row4);
876  row2 = _mm_xor_si128(row2,row3);
877  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
878 
879  buf2 = _mm_set_epi32(m10,m13,m15,m5);
880  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
881  row4 = _mm_xor_si128(row4,row1);
882  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
883  row3 = _mm_add_epi32(row3,row4);
884  row2 = _mm_xor_si128(row2,row3);
885  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
886 
887  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
888  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
889  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
890 
891  buf3 = _mm_set_epi32(m8,m9,m6,m0);
892  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
893  row4 = _mm_xor_si128(row4,row1);
894  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
895  row3 = _mm_add_epi32(row3,row4);
896  row2 = _mm_xor_si128(row2,row3);
897  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
898 
899  buf4 = _mm_set_epi32(m11,m2,m3,m7);
900  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
901  row4 = _mm_xor_si128(row4,row1);
902  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
903  row3 = _mm_add_epi32(row3,row4);
904  row2 = _mm_xor_si128(row2,row3);
905  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
906 
907  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
908  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
909  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
910 
911  buf1 = _mm_set_epi32(m3,m12,m7,m13);
912  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
913  row4 = _mm_xor_si128(row4,row1);
914  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
915  row3 = _mm_add_epi32(row3,row4);
916  row2 = _mm_xor_si128(row2,row3);
917  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
918 
919  buf2 = _mm_set_epi32(m9,m1,m14,m11);
920  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
921  row4 = _mm_xor_si128(row4,row1);
922  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
923  row3 = _mm_add_epi32(row3,row4);
924  row2 = _mm_xor_si128(row2,row3);
925  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
926 
927  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
928  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
929  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
930 
931  buf3 = _mm_set_epi32(m2,m8,m15,m5);
932  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
933  row4 = _mm_xor_si128(row4,row1);
934  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
935  row3 = _mm_add_epi32(row3,row4);
936  row2 = _mm_xor_si128(row2,row3);
937  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
938 
939  buf4 = _mm_set_epi32(m10,m6,m4,m0);
940  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
941  row4 = _mm_xor_si128(row4,row1);
942  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
943  row3 = _mm_add_epi32(row3,row4);
944  row2 = _mm_xor_si128(row2,row3);
945  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
946 
947  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
948  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
949  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
950 
951  buf1 = _mm_set_epi32(m0,m11,m14,m6);
952  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
953  row4 = _mm_xor_si128(row4,row1);
954  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
955  row3 = _mm_add_epi32(row3,row4);
956  row2 = _mm_xor_si128(row2,row3);
957  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
958 
959  buf2 = _mm_set_epi32(m8,m3,m9,m15);
960  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
961  row4 = _mm_xor_si128(row4,row1);
962  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
963  row3 = _mm_add_epi32(row3,row4);
964  row2 = _mm_xor_si128(row2,row3);
965  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
966 
967  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
968  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
969  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
970 
971  buf3 = _mm_set_epi32(m10,m1,m13,m12);
972  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
973  row4 = _mm_xor_si128(row4,row1);
974  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
975  row3 = _mm_add_epi32(row3,row4);
976  row2 = _mm_xor_si128(row2,row3);
977  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
978 
979  buf4 = _mm_set_epi32(m5,m4,m7,m2);
980  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
981  row4 = _mm_xor_si128(row4,row1);
982  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
983  row3 = _mm_add_epi32(row3,row4);
984  row2 = _mm_xor_si128(row2,row3);
985  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
986 
987  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
988  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
989  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
990 
991  buf1 = _mm_set_epi32(m1,m7,m8,m10);
992  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
993  row4 = _mm_xor_si128(row4,row1);
994  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
995  row3 = _mm_add_epi32(row3,row4);
996  row2 = _mm_xor_si128(row2,row3);
997  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
998 
999  buf2 = _mm_set_epi32(m5,m6,m4,m2);
1000  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
1001  row4 = _mm_xor_si128(row4,row1);
1002  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1003  row3 = _mm_add_epi32(row3,row4);
1004  row2 = _mm_xor_si128(row2,row3);
1005  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1006 
1007  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
1008  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1009  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
1010 
1011  buf3 = _mm_set_epi32(m13,m3,m9,m15);
1012  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
1013  row4 = _mm_xor_si128(row4,row1);
1014  row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
1015  row3 = _mm_add_epi32(row3,row4);
1016  row2 = _mm_xor_si128(row2,row3);
1017  row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
1018 
1019  buf4 = _mm_set_epi32(m0,m12,m14,m11);
1020  row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
1021  row4 = _mm_xor_si128(row4,row1);
1022  row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1023  row3 = _mm_add_epi32(row3,row4);
1024  row2 = _mm_xor_si128(row2,row3);
1025  row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1026 
1027  row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
1028  row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1029  row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
1030 
1031  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
1032  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
1033 }
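
The _mm_shuffle_epi32 triples between the message injections implement BLAKE2's diagonalization: after G has mixed the four columns, the rows are rotated so the same column-wise code mixes the diagonals, then the inverse rotations restore column order. With lanes written low to high, _MM_SHUFFLE(0,3,2,1) maps {a,b,c,d} to {b,c,d,a}, (1,0,3,2) to {c,d,a,b}, and (2,1,0,3) to {d,a,b,c}. A minimal check of the first mapping:

    #include <emmintrin.h>
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        const __m128i v = _mm_setr_epi32(0, 1, 2, 3); // lanes {0,1,2,3}
        const __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0,3,2,1));
        uint32_t out[4];
        std::memcpy(out, &r, sizeof(out));
        assert(out[0] == 1 && out[1] == 2 && out[2] == 3 && out[3] == 0);
        return 0;
    }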
1034 
1035 # if (__SUNPRO_CC != 0x5120)
1036 static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
1037 {
1038  word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
1039  GetBlock<word64, LittleEndian, true> get(input);
1040  get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
1041 
1042  __m128i row1l, row1h, row2l, row2h;
1043  __m128i row3l, row3h, row4l, row4h;
1044  __m128i b0, b1, t0, t1;
1045 
1046  row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
1047  row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
1048  row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
1049  row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
1050  row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
1051  row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
1052  row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
1053  row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
1054 
1055  b0 = _mm_set_epi64x(m2, m0);
1056  b1 = _mm_set_epi64x(m6, m4);
1057  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1058  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1059  row4l = _mm_xor_si128(row4l, row1l);
1060  row4h = _mm_xor_si128(row4h, row1h);
1061  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1062  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1063  row3l = _mm_add_epi64(row3l, row4l);
1064  row3h = _mm_add_epi64(row3h, row4h);
1065  row2l = _mm_xor_si128(row2l, row3l);
1066  row2h = _mm_xor_si128(row2h, row3h);
1067  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40));
1068  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40));
1069 
1070  b0 = _mm_set_epi64x(m3, m1);
1071  b1 = _mm_set_epi64x(m7, m5);
1072  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1073  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1074  row4l = _mm_xor_si128(row4l, row1l);
1075  row4h = _mm_xor_si128(row4h, row1h);
1076  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1077  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1078  row3l = _mm_add_epi64(row3l, row4l);
1079  row3h = _mm_add_epi64(row3h, row4h);
1080  row2l = _mm_xor_si128(row2l, row3l);
1081  row2h = _mm_xor_si128(row2h, row3h);
1082  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1083  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1084 
1085  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1086  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1087  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1088  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1089  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1090 
1091  b0 = _mm_set_epi64x(m10, m8);
1092  b1 = _mm_set_epi64x(m14, m12);
1093  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1094  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1095  row4l = _mm_xor_si128(row4l, row1l);
1096  row4h = _mm_xor_si128(row4h, row1h);
1097  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1098  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1099  row3l = _mm_add_epi64(row3l, row4l);
1100  row3h = _mm_add_epi64(row3h, row4h);
1101  row2l = _mm_xor_si128(row2l, row3l);
1102  row2h = _mm_xor_si128(row2h, row3h);
1103  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1104  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1105 
1106  b0 = _mm_set_epi64x(m11, m9);
1107  b1 = _mm_set_epi64x(m15, m13);
1108  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1109  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1110  row4l = _mm_xor_si128(row4l, row1l);
1111  row4h = _mm_xor_si128(row4h, row1h);
1112  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1113  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1114  row3l = _mm_add_epi64(row3l, row4l);
1115  row3h = _mm_add_epi64(row3h, row4h);
1116  row2l = _mm_xor_si128(row2l, row3l);
1117  row2h = _mm_xor_si128(row2h, row3h);
1118  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1119  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1120 
1121  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1122  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1123  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1124  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1125  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1126 
1127  b0 = _mm_set_epi64x(m4, m14);
1128  b1 = _mm_set_epi64x(m13, m9);
1129  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1130  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1131  row4l = _mm_xor_si128(row4l, row1l);
1132  row4h = _mm_xor_si128(row4h, row1h);
1133  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1134  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1135  row3l = _mm_add_epi64(row3l, row4l);
1136  row3h = _mm_add_epi64(row3h, row4h);
1137  row2l = _mm_xor_si128(row2l, row3l);
1138  row2h = _mm_xor_si128(row2h, row3h);
1139  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1140  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1141 
1142  b0 = _mm_set_epi64x(m8, m10);
1143  b1 = _mm_set_epi64x(m6, m15);
1144  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1145  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1146  row4l = _mm_xor_si128(row4l, row1l);
1147  row4h = _mm_xor_si128(row4h, row1h);
1148  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1149  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1150  row3l = _mm_add_epi64(row3l, row4l);
1151  row3h = _mm_add_epi64(row3h, row4h);
1152  row2l = _mm_xor_si128(row2l, row3l);
1153  row2h = _mm_xor_si128(row2h, row3h);
1154  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1155  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1156 
1157  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1158  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1159  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1160  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1161  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1162  b0 = _mm_set_epi64x(m0, m1);
1163  b1 = _mm_set_epi64x(m5, m11);
1164  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1165  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1166  row4l = _mm_xor_si128(row4l, row1l);
1167  row4h = _mm_xor_si128(row4h, row1h);
1168  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1169  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1170  row3l = _mm_add_epi64(row3l, row4l);
1171  row3h = _mm_add_epi64(row3h, row4h);
1172  row2l = _mm_xor_si128(row2l, row3l);
1173  row2h = _mm_xor_si128(row2h, row3h);
1174  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1175  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1176 
1177  b0 = _mm_set_epi64x(m2, m12);
1178  b1 = _mm_set_epi64x(m3, m7);
1179  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1180  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1181  row4l = _mm_xor_si128(row4l, row1l);
1182  row4h = _mm_xor_si128(row4h, row1h);
1183  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1184  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1185  row3l = _mm_add_epi64(row3l, row4l);
1186  row3h = _mm_add_epi64(row3h, row4h);
1187  row2l = _mm_xor_si128(row2l, row3l);
1188  row2h = _mm_xor_si128(row2h, row3h);
1189  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1190  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1191 
1192  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1193  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1194  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1195  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1196  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1197 
1198  b0 = _mm_set_epi64x(m12, m11);
1199  b1 = _mm_set_epi64x(m15, m5);
1200  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1201  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1202  row4l = _mm_xor_si128(row4l, row1l);
1203  row4h = _mm_xor_si128(row4h, row1h);
1204  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1205  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1206  row3l = _mm_add_epi64(row3l, row4l);
1207  row3h = _mm_add_epi64(row3h, row4h);
1208  row2l = _mm_xor_si128(row2l, row3l);
1209  row2h = _mm_xor_si128(row2h, row3h);
1210  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1211  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1212 
1213  b0 = _mm_set_epi64x(m0, m8);
1214  b1 = _mm_set_epi64x(m13, m2);
1215  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1216  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1217  row4l = _mm_xor_si128(row4l, row1l);
1218  row4h = _mm_xor_si128(row4h, row1h);
1219  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1220  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1221  row3l = _mm_add_epi64(row3l, row4l);
1222  row3h = _mm_add_epi64(row3h, row4h);
1223  row2l = _mm_xor_si128(row2l, row3l);
1224  row2h = _mm_xor_si128(row2h, row3h);
1225  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1226  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1227 
1228  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1229  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1230  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1231  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1232  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1233  b0 = _mm_set_epi64x(m3, m10);
1234  b1 = _mm_set_epi64x(m9, m7);
1235  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1236  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1237  row4l = _mm_xor_si128(row4l, row1l);
1238  row4h = _mm_xor_si128(row4h, row1h);
1239  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1240  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1241  row3l = _mm_add_epi64(row3l, row4l);
1242  row3h = _mm_add_epi64(row3h, row4h);
1243  row2l = _mm_xor_si128(row2l, row3l);
1244  row2h = _mm_xor_si128(row2h, row3h);
1245  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1246  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1247 
1248  b0 = _mm_set_epi64x(m6, m14);
1249  b1 = _mm_set_epi64x(m4, m1);
1250  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1251  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1252  row4l = _mm_xor_si128(row4l, row1l);
1253  row4h = _mm_xor_si128(row4h, row1h);
1254  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1255  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1256  row3l = _mm_add_epi64(row3l, row4l);
1257  row3h = _mm_add_epi64(row3h, row4h);
1258  row2l = _mm_xor_si128(row2l, row3l);
1259  row2h = _mm_xor_si128(row2h, row3h);
1260  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1261  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1262 
1263  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1264  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1265  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1266  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1267  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1268 
1269  b0 = _mm_set_epi64x(m3, m7);
1270  b1 = _mm_set_epi64x(m11, m13);
1271  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1272  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1273  row4l = _mm_xor_si128(row4l, row1l);
1274  row4h = _mm_xor_si128(row4h, row1h);
1275  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1276  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1277  row3l = _mm_add_epi64(row3l, row4l);
1278  row3h = _mm_add_epi64(row3h, row4h);
1279  row2l = _mm_xor_si128(row2l, row3l);
1280  row2h = _mm_xor_si128(row2h, row3h);
1281  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1282  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1283 
1284  b0 = _mm_set_epi64x(m1, m9);
1285  b1 = _mm_set_epi64x(m14, m12);
1286  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1287  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1288  row4l = _mm_xor_si128(row4l, row1l);
1289  row4h = _mm_xor_si128(row4h, row1h);
1290  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1291  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1292  row3l = _mm_add_epi64(row3l, row4l);
1293  row3h = _mm_add_epi64(row3h, row4h);
1294  row2l = _mm_xor_si128(row2l, row3l);
1295  row2h = _mm_xor_si128(row2h, row3h);
1296  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1297  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1298 
1299  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1300  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1301  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1302  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1303  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1304  b0 = _mm_set_epi64x(m5, m2);
1305  b1 = _mm_set_epi64x(m15, m4);
1306  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1307  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1308  row4l = _mm_xor_si128(row4l, row1l);
1309  row4h = _mm_xor_si128(row4h, row1h);
1310  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1311  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1312  row3l = _mm_add_epi64(row3l, row4l);
1313  row3h = _mm_add_epi64(row3h, row4h);
1314  row2l = _mm_xor_si128(row2l, row3l);
1315  row2h = _mm_xor_si128(row2h, row3h);
1316  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1317  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1318 
1319  b0 = _mm_set_epi64x(m10, m6);
1320  b1 = _mm_set_epi64x(m8, m0);
1321  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1322  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1323  row4l = _mm_xor_si128(row4l, row1l);
1324  row4h = _mm_xor_si128(row4h, row1h);
1325  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1326  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1327  row3l = _mm_add_epi64(row3l, row4l);
1328  row3h = _mm_add_epi64(row3h, row4h);
1329  row2l = _mm_xor_si128(row2l, row3l);
1330  row2h = _mm_xor_si128(row2h, row3h);
1331  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1332  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1333 
1334  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1335  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1336  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1337  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1338  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1339 
1340  b0 = _mm_set_epi64x(m5, m9);
1341  b1 = _mm_set_epi64x(m10, m2);
1342  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1343  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1344  row4l = _mm_xor_si128(row4l, row1l);
1345  row4h = _mm_xor_si128(row4h, row1h);
1346  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1347  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1348  row3l = _mm_add_epi64(row3l, row4l);
1349  row3h = _mm_add_epi64(row3h, row4h);
1350  row2l = _mm_xor_si128(row2l, row3l);
1351  row2h = _mm_xor_si128(row2h, row3h);
1352  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1353  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1354 
1355  b0 = _mm_set_epi64x(m7, m0);
1356  b1 = _mm_set_epi64x(m15, m4);
1357  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1358  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1359  row4l = _mm_xor_si128(row4l, row1l);
1360  row4h = _mm_xor_si128(row4h, row1h);
1361  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1362  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1363  row3l = _mm_add_epi64(row3l, row4l);
1364  row3h = _mm_add_epi64(row3h, row4h);
1365  row2l = _mm_xor_si128(row2l, row3l);
1366  row2h = _mm_xor_si128(row2h, row3h);
1367  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1368  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1369 
1370  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1371  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1372  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1373  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1374  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1375  b0 = _mm_set_epi64x(m11, m14);
1376  b1 = _mm_set_epi64x(m3, m6);
1377  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1378  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1379  row4l = _mm_xor_si128(row4l, row1l);
1380  row4h = _mm_xor_si128(row4h, row1h);
1381  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1382  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1383  row3l = _mm_add_epi64(row3l, row4l);
1384  row3h = _mm_add_epi64(row3h, row4h);
1385  row2l = _mm_xor_si128(row2l, row3l);
1386  row2h = _mm_xor_si128(row2h, row3h);
1387  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1388  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1389 
1390 
1391  b0 = _mm_set_epi64x(m12, m1);
1392  b1 = _mm_set_epi64x(m13, m8);
1393  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1394  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1395  row4l = _mm_xor_si128(row4l, row1l);
1396  row4h = _mm_xor_si128(row4h, row1h);
1397  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1398  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1399  row3l = _mm_add_epi64(row3l, row4l);
1400  row3h = _mm_add_epi64(row3h, row4h);
1401  row2l = _mm_xor_si128(row2l, row3l);
1402  row2h = _mm_xor_si128(row2h, row3h);
1403  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1404  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1405 
1406  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1407  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1408  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1409  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1410  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1411 
1412  b0 = _mm_set_epi64x(m6, m2);
1413  b1 = _mm_set_epi64x(m8, m0);
1414  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1415  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1416  row4l = _mm_xor_si128(row4l, row1l);
1417  row4h = _mm_xor_si128(row4h, row1h);
1418  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1419  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1420  row3l = _mm_add_epi64(row3l, row4l);
1421  row3h = _mm_add_epi64(row3h, row4h);
1422  row2l = _mm_xor_si128(row2l, row3l);
1423  row2h = _mm_xor_si128(row2h, row3h);
1424  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1425  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1426 
1427  b0 = _mm_set_epi64x(m10, m12);
1428  b1 = _mm_set_epi64x(m3, m11);
1429  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1430  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1431  row4l = _mm_xor_si128(row4l, row1l);
1432  row4h = _mm_xor_si128(row4h, row1h);
1433  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1434  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1435  row3l = _mm_add_epi64(row3l, row4l);
1436  row3h = _mm_add_epi64(row3h, row4h);
1437  row2l = _mm_xor_si128(row2l, row3l);
1438  row2h = _mm_xor_si128(row2h, row3h);
1439  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1440  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1441 
1442  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1443  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1444  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1445  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1446  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1447  b0 = _mm_set_epi64x(m7, m4);
1448  b1 = _mm_set_epi64x(m1, m15);
1449  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1450  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1451  row4l = _mm_xor_si128(row4l, row1l);
1452  row4h = _mm_xor_si128(row4h, row1h);
1453  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1454  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1455  row3l = _mm_add_epi64(row3l, row4l);
1456  row3h = _mm_add_epi64(row3h, row4h);
1457  row2l = _mm_xor_si128(row2l, row3l);
1458  row2h = _mm_xor_si128(row2h, row3h);
1459  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1460  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1461 
1462  b0 = _mm_set_epi64x(m5, m13);
1463  b1 = _mm_set_epi64x(m9, m14);
1464  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1465  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1466  row4l = _mm_xor_si128(row4l, row1l);
1467  row4h = _mm_xor_si128(row4h, row1h);
1468  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1469  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1470  row3l = _mm_add_epi64(row3l, row4l);
1471  row3h = _mm_add_epi64(row3h, row4h);
1472  row2l = _mm_xor_si128(row2l, row3l);
1473  row2h = _mm_xor_si128(row2h, row3h);
1474  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1475  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1476 
1477  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1478  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1479  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1480  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1481  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1482 
1483  b0 = _mm_set_epi64x(m1, m12);
1484  b1 = _mm_set_epi64x(m4, m14);
1485  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1486  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1487  row4l = _mm_xor_si128(row4l, row1l);
1488  row4h = _mm_xor_si128(row4h, row1h);
1489  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1490  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1491  row3l = _mm_add_epi64(row3l, row4l);
1492  row3h = _mm_add_epi64(row3h, row4h);
1493  row2l = _mm_xor_si128(row2l, row3l);
1494  row2h = _mm_xor_si128(row2h, row3h);
1495  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1496  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1497 
1498  b0 = _mm_set_epi64x(m15, m5);
1499  b1 = _mm_set_epi64x(m10, m13);
1500  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1501  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1502  row4l = _mm_xor_si128(row4l, row1l);
1503  row4h = _mm_xor_si128(row4h, row1h);
1504  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1505  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1506  row3l = _mm_add_epi64(row3l, row4l);
1507  row3h = _mm_add_epi64(row3h, row4h);
1508  row2l = _mm_xor_si128(row2l, row3l);
1509  row2h = _mm_xor_si128(row2h, row3h);
1510  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1511  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1512 
1513  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1514  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1515  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1516  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1517  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1518  b0 = _mm_set_epi64x(m6, m0);
1519  b1 = _mm_set_epi64x(m8, m9);
1520  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1521  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1522  row4l = _mm_xor_si128(row4l, row1l);
1523  row4h = _mm_xor_si128(row4h, row1h);
1524  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1525  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1526  row3l = _mm_add_epi64(row3l, row4l);
1527  row3h = _mm_add_epi64(row3h, row4h);
1528  row2l = _mm_xor_si128(row2l, row3l);
1529  row2h = _mm_xor_si128(row2h, row3h);
1530  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1531  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1532 
1533  b0 = _mm_set_epi64x(m3, m7);
1534  b1 = _mm_set_epi64x(m11, m2);
1535  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1536  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1537  row4l = _mm_xor_si128(row4l, row1l);
1538  row4h = _mm_xor_si128(row4h, row1h);
1539  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1540  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1541  row3l = _mm_add_epi64(row3l, row4l);
1542  row3h = _mm_add_epi64(row3h, row4h);
1543  row2l = _mm_xor_si128(row2l, row3l);
1544  row2h = _mm_xor_si128(row2h, row3h);
1545  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1546  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1547 
1548  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1549  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1550  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1551  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1552  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1553 
1554  b0 = _mm_set_epi64x(m7, m13);
1555  b1 = _mm_set_epi64x(m3, m12);
1556  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1557  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1558  row4l = _mm_xor_si128(row4l, row1l);
1559  row4h = _mm_xor_si128(row4h, row1h);
1560  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1561  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1562  row3l = _mm_add_epi64(row3l, row4l);
1563  row3h = _mm_add_epi64(row3h, row4h);
1564  row2l = _mm_xor_si128(row2l, row3l);
1565  row2h = _mm_xor_si128(row2h, row3h);
1566  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1567  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1568 
1569  b0 = _mm_set_epi64x(m14, m11);
1570  b1 = _mm_set_epi64x(m9, m1);
1571  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1572  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1573  row4l = _mm_xor_si128(row4l, row1l);
1574  row4h = _mm_xor_si128(row4h, row1h);
1575  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1576  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1577  row3l = _mm_add_epi64(row3l, row4l);
1578  row3h = _mm_add_epi64(row3h, row4h);
1579  row2l = _mm_xor_si128(row2l, row3l);
1580  row2h = _mm_xor_si128(row2h, row3h);
1581  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1582  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1583 
1584  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1585  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1586  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1587  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1588  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1589  b0 = _mm_set_epi64x(m15, m5);
1590  b1 = _mm_set_epi64x(m2, m8);
1591  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1592  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1593  row4l = _mm_xor_si128(row4l, row1l);
1594  row4h = _mm_xor_si128(row4h, row1h);
1595  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1596  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1597  row3l = _mm_add_epi64(row3l, row4l);
1598  row3h = _mm_add_epi64(row3h, row4h);
1599  row2l = _mm_xor_si128(row2l, row3l);
1600  row2h = _mm_xor_si128(row2h, row3h);
1601  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1602  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1603 
1604  b0 = _mm_set_epi64x(m4, m0);
1605  b1 = _mm_set_epi64x(m10, m6);
1606  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1607  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1608  row4l = _mm_xor_si128(row4l, row1l);
1609  row4h = _mm_xor_si128(row4h, row1h);
1610  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1611  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1612  row3l = _mm_add_epi64(row3l, row4l);
1613  row3h = _mm_add_epi64(row3h, row4h);
1614  row2l = _mm_xor_si128(row2l, row3l);
1615  row2h = _mm_xor_si128(row2h, row3h);
1616  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1617  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1618 
1619  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1620  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1621  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1622  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1623  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1624 
1625  b0 = _mm_set_epi64x(m14, m6);
1626  b1 = _mm_set_epi64x(m0, m11);
1627  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1628  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1629  row4l = _mm_xor_si128(row4l, row1l);
1630  row4h = _mm_xor_si128(row4h, row1h);
1631  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1632  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1633  row3l = _mm_add_epi64(row3l, row4l);
1634  row3h = _mm_add_epi64(row3h, row4h);
1635  row2l = _mm_xor_si128(row2l, row3l);
1636  row2h = _mm_xor_si128(row2h, row3h);
1637  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1638  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1639 
1640  b0 = _mm_set_epi64x(m9, m15);
1641  b1 = _mm_set_epi64x(m8, m3);
1642  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1643  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1644  row4l = _mm_xor_si128(row4l, row1l);
1645  row4h = _mm_xor_si128(row4h, row1h);
1646  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1647  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1648  row3l = _mm_add_epi64(row3l, row4l);
1649  row3h = _mm_add_epi64(row3h, row4h);
1650  row2l = _mm_xor_si128(row2l, row3l);
1651  row2h = _mm_xor_si128(row2h, row3h);
1652  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1653  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1654 
1655  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1656  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1657  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1658  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1659  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1660  b0 = _mm_set_epi64x(m13, m12);
1661  b1 = _mm_set_epi64x(m10, m1);
1662  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1663  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1664  row4l = _mm_xor_si128(row4l, row1l);
1665  row4h = _mm_xor_si128(row4h, row1h);
1666  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1667  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1668  row3l = _mm_add_epi64(row3l, row4l);
1669  row3h = _mm_add_epi64(row3h, row4h);
1670  row2l = _mm_xor_si128(row2l, row3l);
1671  row2h = _mm_xor_si128(row2h, row3h);
1672  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1673  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1674 
1675  b0 = _mm_set_epi64x(m7, m2);
1676  b1 = _mm_set_epi64x(m5, m4);
1677  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1678  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1679  row4l = _mm_xor_si128(row4l, row1l);
1680  row4h = _mm_xor_si128(row4h, row1h);
1681  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1682  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1683  row3l = _mm_add_epi64(row3l, row4l);
1684  row3h = _mm_add_epi64(row3h, row4h);
1685  row2l = _mm_xor_si128(row2l, row3l);
1686  row2h = _mm_xor_si128(row2h, row3h);
1687  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1688  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1689 
1690  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1691  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1692  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1693  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1694  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1695 
1696  b0 = _mm_set_epi64x(m8, m10);
1697  b1 = _mm_set_epi64x(m1, m7);
1698  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1699  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1700  row4l = _mm_xor_si128(row4l, row1l);
1701  row4h = _mm_xor_si128(row4h, row1h);
1702  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1703  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1704  row3l = _mm_add_epi64(row3l, row4l);
1705  row3h = _mm_add_epi64(row3h, row4h);
1706  row2l = _mm_xor_si128(row2l, row3l);
1707  row2h = _mm_xor_si128(row2h, row3h);
1708  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1709  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1710 
1711  b0 = _mm_set_epi64x(m4, m2);
1712  b1 = _mm_set_epi64x(m5, m6);
1713  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1714  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1715  row4l = _mm_xor_si128(row4l, row1l);
1716  row4h = _mm_xor_si128(row4h, row1h);
1717  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1718  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1719  row3l = _mm_add_epi64(row3l, row4l);
1720  row3h = _mm_add_epi64(row3h, row4h);
1721  row2l = _mm_xor_si128(row2l, row3l);
1722  row2h = _mm_xor_si128(row2h, row3h);
1723  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1724  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1725 
1726  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1727  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1728  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1729  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1730  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1731  b0 = _mm_set_epi64x(m9, m15);
1732  b1 = _mm_set_epi64x(m13, m3);
1733  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1734  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1735  row4l = _mm_xor_si128(row4l, row1l);
1736  row4h = _mm_xor_si128(row4h, row1h);
1737  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1738  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1739  row3l = _mm_add_epi64(row3l, row4l);
1740  row3h = _mm_add_epi64(row3h, row4h);
1741  row2l = _mm_xor_si128(row2l, row3l);
1742  row2h = _mm_xor_si128(row2h, row3h);
1743  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1744  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1745 
1746  b0 = _mm_set_epi64x(m14, m11);
1747  b1 = _mm_set_epi64x(m0, m12);
1748  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1749  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1750  row4l = _mm_xor_si128(row4l, row1l);
1751  row4h = _mm_xor_si128(row4h, row1h);
1752  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1753  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1754  row3l = _mm_add_epi64(row3l, row4l);
1755  row3h = _mm_add_epi64(row3h, row4h);
1756  row2l = _mm_xor_si128(row2l, row3l);
1757  row2h = _mm_xor_si128(row2h, row3h);
1758  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1759  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1760 
1761  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1762  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1763  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1764  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1765  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1766 
1767  b0 = _mm_set_epi64x(m2, m0);
1768  b1 = _mm_set_epi64x(m6, m4);
1769  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1770  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1771  row4l = _mm_xor_si128(row4l, row1l);
1772  row4h = _mm_xor_si128(row4h, row1h);
1773  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1774  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1775  row3l = _mm_add_epi64(row3l, row4l);
1776  row3h = _mm_add_epi64(row3h, row4h);
1777  row2l = _mm_xor_si128(row2l, row3l);
1778  row2h = _mm_xor_si128(row2h, row3h);
1779  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1780  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1781 
1782  b0 = _mm_set_epi64x(m3, m1);
1783  b1 = _mm_set_epi64x(m7, m5);
1784  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1785  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1786  row4l = _mm_xor_si128(row4l, row1l);
1787  row4h = _mm_xor_si128(row4h, row1h);
1788  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1789  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1790  row3l = _mm_add_epi64(row3l, row4l);
1791  row3h = _mm_add_epi64(row3h, row4h);
1792  row2l = _mm_xor_si128(row2l, row3l);
1793  row2h = _mm_xor_si128(row2h, row3h);
1794  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1795  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1796 
1797  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1798  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1799  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1800  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1801  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1802 
1803  b0 = _mm_set_epi64x(m10, m8);
1804  b1 = _mm_set_epi64x(m14, m12);
1805  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1806  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1807  row4l = _mm_xor_si128(row4l, row1l);
1808  row4h = _mm_xor_si128(row4h, row1h);
1809  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1810  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1811  row3l = _mm_add_epi64(row3l, row4l);
1812  row3h = _mm_add_epi64(row3h, row4h);
1813  row2l = _mm_xor_si128(row2l, row3l);
1814  row2h = _mm_xor_si128(row2h, row3h);
1815  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1816  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1817 
1818  b0 = _mm_set_epi64x(m11, m9);
1819  b1 = _mm_set_epi64x(m15, m13);
1820  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1821  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1822  row4l = _mm_xor_si128(row4l, row1l);
1823  row4h = _mm_xor_si128(row4h, row1h);
1824  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1825  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1826  row3l = _mm_add_epi64(row3l, row4l);
1827  row3h = _mm_add_epi64(row3h, row4h);
1828  row2l = _mm_xor_si128(row2l, row3l);
1829  row2h = _mm_xor_si128(row2h, row3h);
1830  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1831  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1832 
1833  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1834  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1835  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1836  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1837  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1838 
1839  b0 = _mm_set_epi64x(m4, m14);
1840  b1 = _mm_set_epi64x(m13, m9);
1841  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1842  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1843  row4l = _mm_xor_si128(row4l, row1l);
1844  row4h = _mm_xor_si128(row4h, row1h);
1845  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1846  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1847  row3l = _mm_add_epi64(row3l, row4l);
1848  row3h = _mm_add_epi64(row3h, row4h);
1849  row2l = _mm_xor_si128(row2l, row3l);
1850  row2h = _mm_xor_si128(row2h, row3h);
1851  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1852  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1853 
1854  b0 = _mm_set_epi64x(m8, m10);
1855  b1 = _mm_set_epi64x(m6, m15);
1856  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1857  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1858  row4l = _mm_xor_si128(row4l, row1l);
1859  row4h = _mm_xor_si128(row4h, row1h);
1860  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1861  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1862  row3l = _mm_add_epi64(row3l, row4l);
1863  row3h = _mm_add_epi64(row3h, row4h);
1864  row2l = _mm_xor_si128(row2l, row3l);
1865  row2h = _mm_xor_si128(row2h, row3h);
1866  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1867  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1868 
1869  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1870  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1871  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1872  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1873  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1874  b0 = _mm_set_epi64x(m0, m1);
1875  b1 = _mm_set_epi64x(m5, m11);
1876  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1877  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1878  row4l = _mm_xor_si128(row4l, row1l);
1879  row4h = _mm_xor_si128(row4h, row1h);
1880  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1881  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1882  row3l = _mm_add_epi64(row3l, row4l);
1883  row3h = _mm_add_epi64(row3h, row4h);
1884  row2l = _mm_xor_si128(row2l, row3l);
1885  row2h = _mm_xor_si128(row2h, row3h);
1886  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1887  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1888 
1889  b0 = _mm_set_epi64x(m2, m12);
1890  b1 = _mm_set_epi64x(m3, m7);
1891  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1892  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1893  row4l = _mm_xor_si128(row4l, row1l);
1894  row4h = _mm_xor_si128(row4h, row1h);
1895  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1896  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1897  row3l = _mm_add_epi64(row3l, row4l);
1898  row3h = _mm_add_epi64(row3h, row4h);
1899  row2l = _mm_xor_si128(row2l, row3l);
1900  row2h = _mm_xor_si128(row2h, row3h);
1901  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1902  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1903 
1904  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1905  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1906  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1907  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1908  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1909 
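// Feed-forward: fold the two working-state halves back into the chaining
// value, i.e. h[i] ^= v[i] ^ v[i+8] for i = 0..7, two words per XOR/store.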
1910  row1l = _mm_xor_si128(row3l, row1l);
1911  row1h = _mm_xor_si128(row3h, row1h);
1912  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
1913  _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
1914 
1915  row2l = _mm_xor_si128(row4l, row2l);
1916  row2h = _mm_xor_si128(row4h, row2h);
1917  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
1918  _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
1919 }
1920 # endif // (__SUNPRO_CC != 0x5120)
1921 #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
1922 
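// ----------------------------------------------------------------------------
// Editorial sketch (not part of the original source): the SSE2 routine above
// and the SSE4 routine below both vectorize the scalar BLAKE2b G function,
// evaluating two G's per 128-bit register pair. A minimal scalar reference is
// shown here for orientation; it assumes Crypto++'s word64 typedef, and the
// names rotr64_sketch and BLAKE2B_G_sketch are illustrative only.

static inline word64 rotr64_sketch(word64 w, unsigned int c)
{
    // Rotate right; the SIMD code expresses the same operation as an
    // _mm_srli_epi64 / _mm_slli_epi64 pair combined with XOR.
    return (w >> c) | (w << (64 - c));
}

static inline void BLAKE2B_G_sketch(word64& a, word64& b, word64& c, word64& d,
                                    word64 x, word64 y)
{
    a = a + b + x;  d = rotr64_sketch(d ^ a, 32);
    c = c + d;      b = rotr64_sketch(b ^ c, 24);
    a = a + b + y;  d = rotr64_sketch(d ^ a, 16);
    c = c + d;      b = rotr64_sketch(b ^ c, 63);
}

// The srli/slli pairs in the vector code above (32/32, 24/40, 16/48, 63/1)
// are exactly these four rotations; x and y are the two message words selected
// by the sigma permutation for the current round and column.
// ----------------------------------------------------------------------------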
1923 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
1924 static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
1925 {
1926  __m128i row1, row2, row3, row4;
1927  __m128i buf1, buf2, buf3, buf4;
1928 
1929  __m128i t0, t1, t2;
1930  __m128i ff0, ff1;
1931 
1932  const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
1933  const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
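// r8 and r16 are byte-shuffle masks for _mm_shuffle_epi8: each rotates every
// 32-bit lane right by 8 or 16 bits in a single instruction. The remaining
// BLAKE2s rotations (12 and 7) are not byte-aligned, so they use the
// srli/slli XOR pairs seen below.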
1934 
1935  const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
1936  const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
1937  const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
1938  const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
1939 
1940  row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
1941  row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
1942  row3 = _mm_setr_epi32(BLAKE2S_IV(0), BLAKE2S_IV(1), BLAKE2S_IV(2), BLAKE2S_IV(3));
1943  row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
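// Rows 1-2 hold the chaining value h, row 3 the IV, and row 4 the IV XORed
// with the counter t (and the finalization flags that follow it in the
// state layout).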
1944  buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
1945 
1946  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1947  row4 = _mm_xor_si128(row4, row1);
1948  row4 = _mm_shuffle_epi8(row4,r16);
1949  row3 = _mm_add_epi32(row3, row4);
1950  row2 = _mm_xor_si128(row2, row3);
1951  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1952 
1953  buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
1954 
1955  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
1956  row4 = _mm_xor_si128(row4, row1);
1957  row4 = _mm_shuffle_epi8(row4,r8);
1958  row3 = _mm_add_epi32(row3, row4);
1959  row2 = _mm_xor_si128(row2, row3);
1960  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1961 
1962  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
1963  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1964  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
1965 
1966  buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
1967 
1968  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
1969  row4 = _mm_xor_si128(row4, row1);
1970  row4 = _mm_shuffle_epi8(row4,r16);
1971  row3 = _mm_add_epi32(row3, row4);
1972  row2 = _mm_xor_si128(row2, row3);
1973  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1974 
1975  buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
1976 
1977  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
1978  row4 = _mm_xor_si128(row4, row1);
1979  row4 = _mm_shuffle_epi8(row4,r8);
1980  row3 = _mm_add_epi32(row3, row4);
1981  row2 = _mm_xor_si128(row2, row3);
1982  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1983 
1984  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
1985  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1986  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
1987 
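// The permuted message words for each round are assembled in-register with
// blend/shift/unpack/shuffle combinations per the sigma schedule; the block
// itself is loaded only once, into m0..m3 above.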
1988  t0 = _mm_blend_epi16(m1, m2, 0x0C);
1989  t1 = _mm_slli_si128(m3, 4);
1990  t2 = _mm_blend_epi16(t0, t1, 0xF0);
1991  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
1992 
1993  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1994  row4 = _mm_xor_si128(row4, row1);
1995  row4 = _mm_shuffle_epi8(row4,r16);
1996  row3 = _mm_add_epi32(row3, row4);
1997  row2 = _mm_xor_si128(row2, row3);
1998  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1999 
2000  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
2001  t1 = _mm_blend_epi16(m1,m3,0xC0);
2002  t2 = _mm_blend_epi16(t0, t1, 0xF0);
2003  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2004 
2005  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2006  row4 = _mm_xor_si128(row4, row1);
2007  row4 = _mm_shuffle_epi8(row4,r8);
2008  row3 = _mm_add_epi32(row3, row4);
2009  row2 = _mm_xor_si128(row2, row3);
2010  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2011 
2012  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2013  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2014  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2015 
2016  t0 = _mm_slli_si128(m1, 4);
2017  t1 = _mm_blend_epi16(m2, t0, 0x30);
2018  t2 = _mm_blend_epi16(m0, t1, 0xF0);
2019  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2020 
2021  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2022  row4 = _mm_xor_si128(row4, row1);
2023  row4 = _mm_shuffle_epi8(row4,r16);
2024  row3 = _mm_add_epi32(row3, row4);
2025  row2 = _mm_xor_si128(row2, row3);
2026  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2027 
2028  t0 = _mm_unpackhi_epi32(m0,m1);
2029  t1 = _mm_slli_si128(m3, 4);
2030  t2 = _mm_blend_epi16(t0, t1, 0x0C);
2031  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2032 
2033  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2034  row4 = _mm_xor_si128(row4, row1);
2035  row4 = _mm_shuffle_epi8(row4,r8);
2036  row3 = _mm_add_epi32(row3, row4);
2037  row2 = _mm_xor_si128(row2, row3);
2038  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2039 
2040  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2041  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2042  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2043 
2044  t0 = _mm_unpackhi_epi32(m2,m3);
2045  t1 = _mm_blend_epi16(m3,m1,0x0C);
2046  t2 = _mm_blend_epi16(t0, t1, 0x0F);
2047  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2048 
2049  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2050  row4 = _mm_xor_si128(row4, row1);
2051  row4 = _mm_shuffle_epi8(row4,r16);
2052  row3 = _mm_add_epi32(row3, row4);
2053  row2 = _mm_xor_si128(row2, row3);
2054  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2055 
2056  t0 = _mm_unpacklo_epi32(m2,m0);
2057  t1 = _mm_blend_epi16(t0, m0, 0xF0);
2058  t2 = _mm_slli_si128(m3, 8);
2059  buf2 = _mm_blend_epi16(t1, t2, 0xC0);
2060 
2061  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2062  row4 = _mm_xor_si128(row4, row1);
2063  row4 = _mm_shuffle_epi8(row4,r8);
2064  row3 = _mm_add_epi32(row3, row4);
2065  row2 = _mm_xor_si128(row2, row3);
2066  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2067 
2068  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2069  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2070  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2071 
2072  t0 = _mm_blend_epi16(m0, m2, 0x3C);
2073  t1 = _mm_srli_si128(m1, 12);
2074  t2 = _mm_blend_epi16(t0,t1,0x03);
2075  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
2076 
2077  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2078  row4 = _mm_xor_si128(row4, row1);
2079  row4 = _mm_shuffle_epi8(row4,r16);
2080  row3 = _mm_add_epi32(row3, row4);
2081  row2 = _mm_xor_si128(row2, row3);
2082  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2083 
2084  t0 = _mm_slli_si128(m3, 4);
2085  t1 = _mm_blend_epi16(m0, m1, 0x33);
2086  t2 = _mm_blend_epi16(t1, t0, 0xC0);
2087  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
2088 
2089  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2090  row4 = _mm_xor_si128(row4, row1);
2091  row4 = _mm_shuffle_epi8(row4,r8);
2092  row3 = _mm_add_epi32(row3, row4);
2093  row2 = _mm_xor_si128(row2, row3);
2094  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2095 
2096  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2097  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2098  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2099 
2100  t0 = _mm_unpackhi_epi32(m0,m1);
2101  t1 = _mm_unpackhi_epi32(t0, m2);
2102  t2 = _mm_blend_epi16(t1, m3, 0x0C);
2103  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2104 
2105  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2106  row4 = _mm_xor_si128(row4, row1);
2107  row4 = _mm_shuffle_epi8(row4,r16);
2108  row3 = _mm_add_epi32(row3, row4);
2109  row2 = _mm_xor_si128(row2, row3);
2110  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2111 
2112  t0 = _mm_slli_si128(m2, 8);
2113  t1 = _mm_blend_epi16(m3,m0,0x0C);
2114  t2 = _mm_blend_epi16(t1, t0, 0xC0);
2115  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2116 
2117  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2118  row4 = _mm_xor_si128(row4, row1);
2119  row4 = _mm_shuffle_epi8(row4,r8);
2120  row3 = _mm_add_epi32(row3, row4);
2121  row2 = _mm_xor_si128(row2, row3);
2122  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2123 
2124  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2125  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2126  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2127 
2128  t0 = _mm_blend_epi16(m0,m1,0x0F);
2129  t1 = _mm_blend_epi16(t0, m3, 0xC0);
2130  buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2131 
2132  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2133  row4 = _mm_xor_si128(row4, row1);
2134  row4 = _mm_shuffle_epi8(row4,r16);
2135  row3 = _mm_add_epi32(row3, row4);
2136  row2 = _mm_xor_si128(row2, row3);
2137  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2138 
2139  t0 = _mm_unpacklo_epi32(m0,m2);
2140  t1 = _mm_unpackhi_epi32(m1,m2);
2141  buf4 = _mm_unpacklo_epi64(t1,t0);
2142 
2143  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2144  row4 = _mm_xor_si128(row4, row1);
2145  row4 = _mm_shuffle_epi8(row4,r8);
2146  row3 = _mm_add_epi32(row3, row4);
2147  row2 = _mm_xor_si128(row2, row3);
2148  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2149 
2150  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2151  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2152  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2153 
2154  t0 = _mm_unpacklo_epi64(m1,m2);
2155  t1 = _mm_unpackhi_epi64(m0,m2);
2156  t2 = _mm_blend_epi16(t0,t1,0x33);
2157  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2158 
2159  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2160  row4 = _mm_xor_si128(row4, row1);
2161  row4 = _mm_shuffle_epi8(row4,r16);
2162  row3 = _mm_add_epi32(row3, row4);
2163  row2 = _mm_xor_si128(row2, row3);
2164  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2165 
2166  t0 = _mm_unpackhi_epi64(m1,m3);
2167  t1 = _mm_unpacklo_epi64(m0,m1);
2168  buf2 = _mm_blend_epi16(t0,t1,0x33);
2169 
2170  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2171  row4 = _mm_xor_si128(row4, row1);
2172  row4 = _mm_shuffle_epi8(row4,r8);
2173  row3 = _mm_add_epi32(row3, row4);
2174  row2 = _mm_xor_si128(row2, row3);
2175  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2176 
2177  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2178  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2179  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2180 
2181  t0 = _mm_unpackhi_epi64(m3,m1);
2182  t1 = _mm_unpackhi_epi64(m2,m0);
2183  buf3 = _mm_blend_epi16(t1,t0,0x33);
2184 
2185  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2186  row4 = _mm_xor_si128(row4, row1);
2187  row4 = _mm_shuffle_epi8(row4,r16);
2188  row3 = _mm_add_epi32(row3, row4);
2189  row2 = _mm_xor_si128(row2, row3);
2190  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2191 
2192  t0 = _mm_blend_epi16(m0,m2,0x03);
2193  t1 = _mm_slli_si128(t0, 8);
2194  t2 = _mm_blend_epi16(t1,m3,0x0F);
2195  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
2196 
2197  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2198  row4 = _mm_xor_si128(row4, row1);
2199  row4 = _mm_shuffle_epi8(row4,r8);
2200  row3 = _mm_add_epi32(row3, row4);
2201  row2 = _mm_xor_si128(row2, row3);
2202  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2203 
2204  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2205  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2206  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2207 
2208  t0 = _mm_unpackhi_epi32(m0,m1);
2209  t1 = _mm_unpacklo_epi32(m0,m2);
2210  buf1 = _mm_unpacklo_epi64(t0,t1);
2211 
2212  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2213  row4 = _mm_xor_si128(row4, row1);
2214  row4 = _mm_shuffle_epi8(row4,r16);
2215  row3 = _mm_add_epi32(row3, row4);
2216  row2 = _mm_xor_si128(row2, row3);
2217  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2218 
2219  t0 = _mm_srli_si128(m2, 4);
2220  t1 = _mm_blend_epi16(m0,m3,0x03);
2221  buf2 = _mm_blend_epi16(t1,t0,0x3C);
2222 
2223  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2224  row4 = _mm_xor_si128(row4, row1);
2225  row4 = _mm_shuffle_epi8(row4,r8);
2226  row3 = _mm_add_epi32(row3, row4);
2227  row2 = _mm_xor_si128(row2, row3);
2228  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2229 
2230  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2231  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2232  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2233 
2234  t0 = _mm_blend_epi16(m1,m0,0x0C);
2235  t1 = _mm_srli_si128(m3, 4);
2236  t2 = _mm_blend_epi16(t0,t1,0x30);
2237  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
2238 
2239  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2240  row4 = _mm_xor_si128(row4, row1);
2241  row4 = _mm_shuffle_epi8(row4,r16);
2242  row3 = _mm_add_epi32(row3, row4);
2243  row2 = _mm_xor_si128(row2, row3);
2244  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2245 
2246  t0 = _mm_unpacklo_epi64(m1,m2);
2247  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
2248  buf4 = _mm_blend_epi16(t0,t1,0x33);
2249 
2250  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2251  row4 = _mm_xor_si128(row4, row1);
2252  row4 = _mm_shuffle_epi8(row4,r8);
2253  row3 = _mm_add_epi32(row3, row4);
2254  row2 = _mm_xor_si128(row2, row3);
2255  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2256 
2257  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2258  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2259  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2260 
2261  t0 = _mm_slli_si128(m1, 12);
2262  t1 = _mm_blend_epi16(m0,m3,0x33);
2263  buf1 = _mm_blend_epi16(t1,t0,0xC0);
2264 
2265  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2266  row4 = _mm_xor_si128(row4, row1);
2267  row4 = _mm_shuffle_epi8(row4,r16);
2268  row3 = _mm_add_epi32(row3, row4);
2269  row2 = _mm_xor_si128(row2, row3);
2270  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2271 
2272  t0 = _mm_blend_epi16(m3,m2,0x30);
2273  t1 = _mm_srli_si128(m1, 4);
2274  t2 = _mm_blend_epi16(t0,t1,0x03);
2275  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
2276 
2277  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2278  row4 = _mm_xor_si128(row4, row1);
2279  row4 = _mm_shuffle_epi8(row4,r8);
2280  row3 = _mm_add_epi32(row3, row4);
2281  row2 = _mm_xor_si128(row2, row3);
2282  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2283 
2284  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2285  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2286  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2287 
2288  t0 = _mm_unpacklo_epi64(m0,m2);
2289  t1 = _mm_srli_si128(m1, 4);
2290  buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
2291 
2292  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2293  row4 = _mm_xor_si128(row4, row1);
2294  row4 = _mm_shuffle_epi8(row4,r16);
2295  row3 = _mm_add_epi32(row3, row4);
2296  row2 = _mm_xor_si128(row2, row3);
2297  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2298 
2299  t0 = _mm_unpackhi_epi32(m1,m2);
2300  t1 = _mm_unpackhi_epi64(m0,t0);
2301  buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2302 
2303  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2304  row4 = _mm_xor_si128(row4, row1);
2305  row4 = _mm_shuffle_epi8(row4,r8);
2306  row3 = _mm_add_epi32(row3, row4);
2307  row2 = _mm_xor_si128(row2, row3);
2308  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2309 
2310  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2311  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2312  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2313 
2314  t0 = _mm_unpackhi_epi32(m0,m1);
2315  t1 = _mm_blend_epi16(t0,m3,0x0F);
2316  buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
2317 
2318  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2319  row4 = _mm_xor_si128(row4, row1);
2320  row4 = _mm_shuffle_epi8(row4,r16);
2321  row3 = _mm_add_epi32(row3, row4);
2322  row2 = _mm_xor_si128(row2, row3);
2323  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2324 
2325  t0 = _mm_blend_epi16(m2,m3,0x30);
2326  t1 = _mm_srli_si128(m0,4);
2327  t2 = _mm_blend_epi16(t0,t1,0x03);
2328  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
2329 
2330  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2331  row4 = _mm_xor_si128(row4, row1);
2332  row4 = _mm_shuffle_epi8(row4,r8);
2333  row3 = _mm_add_epi32(row3, row4);
2334  row2 = _mm_xor_si128(row2, row3);
2335  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2336 
2337  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2338  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2339  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2340 
2341  t0 = _mm_unpackhi_epi64(m0,m3);
2342  t1 = _mm_unpacklo_epi64(m1,m2);
2343  t2 = _mm_blend_epi16(t0,t1,0x3C);
2344  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
2345 
2346  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2347  row4 = _mm_xor_si128(row4, row1);
2348  row4 = _mm_shuffle_epi8(row4,r16);
2349  row3 = _mm_add_epi32(row3, row4);
2350  row2 = _mm_xor_si128(row2, row3);
2351  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2352 
2353  t0 = _mm_unpacklo_epi32(m0,m1);
2354  t1 = _mm_unpackhi_epi32(m1,m2);
2355  buf4 = _mm_unpacklo_epi64(t0,t1);
2356 
2357  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2358  row4 = _mm_xor_si128(row4, row1);
2359  row4 = _mm_shuffle_epi8(row4,r8);
2360  row3 = _mm_add_epi32(row3, row4);
2361  row2 = _mm_xor_si128(row2, row3);
2362  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2363 
2364  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2365  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2366  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2367 
2368  t0 = _mm_unpackhi_epi32(m1,m3);
2369  t1 = _mm_unpacklo_epi64(t0,m0);
2370  t2 = _mm_blend_epi16(t1,m2,0xC0);
2371  buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
2372 
2373  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2374  row4 = _mm_xor_si128(row4, row1);
2375  row4 = _mm_shuffle_epi8(row4,r16);
2376  row3 = _mm_add_epi32(row3, row4);
2377  row2 = _mm_xor_si128(row2, row3);
2378  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2379 
2380  t0 = _mm_unpackhi_epi32(m0,m3);
2381  t1 = _mm_blend_epi16(m2,t0,0xF0);
2382  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
2383 
2384  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2385  row4 = _mm_xor_si128(row4, row1);
2386  row4 = _mm_shuffle_epi8(row4,r8);
2387  row3 = _mm_add_epi32(row3, row4);
2388  row2 = _mm_xor_si128(row2, row3);
2389  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2390 
2391  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2392  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2393  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2394 
2395  t0 = _mm_blend_epi16(m2,m0,0x0C);
2396  t1 = _mm_slli_si128(t0,4);
2397  buf3 = _mm_blend_epi16(t1,m3,0x0F);
2398 
2399  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2400  row4 = _mm_xor_si128(row4, row1);
2401  row4 = _mm_shuffle_epi8(row4,r16);
2402  row3 = _mm_add_epi32(row3, row4);
2403  row2 = _mm_xor_si128(row2, row3);
2404  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2405 
2406  t0 = _mm_blend_epi16(m1,m0,0x30);
2407  buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
2408 
2409  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2410  row4 = _mm_xor_si128(row4, row1);
2411  row4 = _mm_shuffle_epi8(row4,r8);
2412  row3 = _mm_add_epi32(row3, row4);
2413  row2 = _mm_xor_si128(row2, row3);
2414  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2415 
2416  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2417  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2418  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2419 
2420  t0 = _mm_blend_epi16(m0,m2,0x03);
2421  t1 = _mm_blend_epi16(m1,m2,0x30);
2422  t2 = _mm_blend_epi16(t1,t0,0x0F);
2423  buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
2424 
2425  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2426  row4 = _mm_xor_si128(row4, row1);
2427  row4 = _mm_shuffle_epi8(row4,r16);
2428  row3 = _mm_add_epi32(row3, row4);
2429  row2 = _mm_xor_si128(row2, row3);
2430  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2431 
2432  t0 = _mm_slli_si128(m0,4);
2433  t1 = _mm_blend_epi16(m1,t0,0xC0);
2434  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
2435 
2436  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2437  row4 = _mm_xor_si128(row4, row1);
2438  row4 = _mm_shuffle_epi8(row4,r8);
2439  row3 = _mm_add_epi32(row3, row4);
2440  row2 = _mm_xor_si128(row2, row3);
2441  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2442 
2443  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2444  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2445  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2446 
2447  t0 = _mm_unpackhi_epi32(m0,m3);
2448  t1 = _mm_unpacklo_epi32(m2,m3);
2449  t2 = _mm_unpackhi_epi64(t0,t1);
2450  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
2451 
2452  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2453  row4 = _mm_xor_si128(row4, row1);
2454  row4 = _mm_shuffle_epi8(row4,r16);
2455  row3 = _mm_add_epi32(row3, row4);
2456  row2 = _mm_xor_si128(row2, row3);
2457  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2458 
2459  t0 = _mm_blend_epi16(m3,m2,0xC0);
2460  t1 = _mm_unpacklo_epi32(m0,m3);
2461  t2 = _mm_blend_epi16(t0,t1,0x0F);
2462  buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
2463 
2464  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2465  row4 = _mm_xor_si128(row4, row1);
2466  row4 = _mm_shuffle_epi8(row4,r8);
2467  row3 = _mm_add_epi32(row3, row4);
2468  row2 = _mm_xor_si128(row2, row3);
2469  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2470 
2471  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2472  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2473  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2474 
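// Feed-forward: h[0..3] ^= v[0..3] ^ v[8..11] and h[4..7] ^= v[4..7] ^ v[12..15],
// using the row copies saved in ff0/ff1 at entry.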
2475  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
2476  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
2477 }
2478 
2479 static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
2480 {
2481  __m128i row1l, row1h;
2482  __m128i row2l, row2h;
2483  __m128i row3l, row3h;
2484  __m128i row4l, row4h;
2485  __m128i b0, b1, t0, t1;
2486 
2487  const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
2488  const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
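// r16 and r24 rotate each 64-bit lane right by 16 and 24 bits via a single
// byte shuffle; the rotation by 32 is a cheap _mm_shuffle_epi32, and the
// rotation by 63 uses a shift/add pair (see below).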
2489 
2490  const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
2491  const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
2492  const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
2493  const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
2494  const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64));
2495  const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80));
2496  const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96));
2497  const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112));
2498 
2499  row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
2500  row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
2501  row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
2502  row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
2503  row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
2504  row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
2505  row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
2506  row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
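// Rows 1-2 load the chaining value h; row 3 starts from the IV; row 4 is the
// IV XORed with the counter t and the finalization flags f.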
2507 
2508  b0 = _mm_unpacklo_epi64(m0, m1);
2509  b1 = _mm_unpacklo_epi64(m2, m3);
2510  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2511  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2512  row4l = _mm_xor_si128(row4l, row1l);
2513  row4h = _mm_xor_si128(row4h, row1h);
2514  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2515  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2516  row3l = _mm_add_epi64(row3l, row4l);
2517  row3h = _mm_add_epi64(row3h, row4h);
2518  row2l = _mm_xor_si128(row2l, row3l);
2519  row2h = _mm_xor_si128(row2h, row3h);
2520  row2l = _mm_shuffle_epi8(row2l, r24);
2521  row2h = _mm_shuffle_epi8(row2h, r24);
2522 
2523  b0 = _mm_unpackhi_epi64(m0, m1);
2524  b1 = _mm_unpackhi_epi64(m2, m3);
2525 
2526  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2527  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2528  row4l = _mm_xor_si128(row4l, row1l);
2529  row4h = _mm_xor_si128(row4h, row1h);
2530  row4l = _mm_shuffle_epi8(row4l, r16);
2531  row4h = _mm_shuffle_epi8(row4h, r16);
2532  row3l = _mm_add_epi64(row3l, row4l);
2533  row3h = _mm_add_epi64(row3h, row4h);
2534  row2l = _mm_xor_si128(row2l, row3l);
2535  row2h = _mm_xor_si128(row2h, row3h);
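 // Rotate right by 63, i.e. left by 1: (x >> 63) XOR (x + x).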
2536  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2537  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2538 
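 // Diagonalize: shift rows 2, 3 and 4 by one, two and three 64-bit lanes so
 // the next G application works on the diagonals of the state.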
2539  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2540  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2541  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2542  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2543  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2544  row4l = t1, row4h = t0;
2545 
2546  b0 = _mm_unpacklo_epi64(m4, m5);
2547  b1 = _mm_unpacklo_epi64(m6, m7);
2548 
2549  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2550  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2551  row4l = _mm_xor_si128(row4l, row1l);
2552  row4h = _mm_xor_si128(row4h, row1h);
2553  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2554  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2555  row3l = _mm_add_epi64(row3l, row4l);
2556  row3h = _mm_add_epi64(row3h, row4h);
2557  row2l = _mm_xor_si128(row2l, row3l);
2558  row2h = _mm_xor_si128(row2h, row3h);
2559  row2l = _mm_shuffle_epi8(row2l, r24);
2560  row2h = _mm_shuffle_epi8(row2h, r24);
2561 
2562  b0 = _mm_unpackhi_epi64(m4, m5);
2563  b1 = _mm_unpackhi_epi64(m6, m7);
2564 
2565  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2566  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2567  row4l = _mm_xor_si128(row4l, row1l);
2568  row4h = _mm_xor_si128(row4h, row1h);
2569  row4l = _mm_shuffle_epi8(row4l, r16);
2570  row4h = _mm_shuffle_epi8(row4h, r16);
2571  row3l = _mm_add_epi64(row3l, row4l);
2572  row3h = _mm_add_epi64(row3h, row4h);
2573  row2l = _mm_xor_si128(row2l, row3l);
2574  row2h = _mm_xor_si128(row2h, row3h);
2575  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2576  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2577 
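 // Un-diagonalize: shift the rows back into column order.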
2578  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2579  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2580  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2581  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2582  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2583  row4l = t1, row4h = t0;
2584 
2585  b0 = _mm_unpacklo_epi64(m7, m2);
2586  b1 = _mm_unpackhi_epi64(m4, m6);
2587 
2588  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2589  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2590  row4l = _mm_xor_si128(row4l, row1l);
2591  row4h = _mm_xor_si128(row4h, row1h);
2592  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2593  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2594  row3l = _mm_add_epi64(row3l, row4l);
2595  row3h = _mm_add_epi64(row3h, row4h);
2596  row2l = _mm_xor_si128(row2l, row3l);
2597  row2h = _mm_xor_si128(row2h, row3h);
2598  row2l = _mm_shuffle_epi8(row2l, r24);
2599  row2h = _mm_shuffle_epi8(row2h, r24);
2600 
2601  b0 = _mm_unpacklo_epi64(m5, m4);
2602  b1 = _mm_alignr_epi8(m3, m7, 8);
2603 
2604  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2605  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2606  row4l = _mm_xor_si128(row4l, row1l);
2607  row4h = _mm_xor_si128(row4h, row1h);
2608  row4l = _mm_shuffle_epi8(row4l, r16);
2609  row4h = _mm_shuffle_epi8(row4h, r16);
2610  row3l = _mm_add_epi64(row3l, row4l);
2611  row3h = _mm_add_epi64(row3h, row4h);
2612  row2l = _mm_xor_si128(row2l, row3l);
2613  row2h = _mm_xor_si128(row2h, row3h);
2614  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2615  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2616 
2617  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2618  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2619  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2620  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2621  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2622  row4l = t1, row4h = t0;
2623 
2624  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
2625  b1 = _mm_unpackhi_epi64(m5, m2);
2626 
2627  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2628  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2629  row4l = _mm_xor_si128(row4l, row1l);
2630  row4h = _mm_xor_si128(row4h, row1h);
2631  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2632  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2633  row3l = _mm_add_epi64(row3l, row4l);
2634  row3h = _mm_add_epi64(row3h, row4h);
2635  row2l = _mm_xor_si128(row2l, row3l);
2636  row2h = _mm_xor_si128(row2h, row3h);
2637  row2l = _mm_shuffle_epi8(row2l, r24);
2638  row2h = _mm_shuffle_epi8(row2h, r24);
2639 
2640  b0 = _mm_unpacklo_epi64(m6, m1);
2641  b1 = _mm_unpackhi_epi64(m3, m1);
2642 
2643  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2644  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2645  row4l = _mm_xor_si128(row4l, row1l);
2646  row4h = _mm_xor_si128(row4h, row1h);
2647  row4l = _mm_shuffle_epi8(row4l, r16);
2648  row4h = _mm_shuffle_epi8(row4h, r16);
2649  row3l = _mm_add_epi64(row3l, row4l);
2650  row3h = _mm_add_epi64(row3h, row4h);
2651  row2l = _mm_xor_si128(row2l, row3l);
2652  row2h = _mm_xor_si128(row2h, row3h);
2653  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2654  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2655 
2656  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2657  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2658  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2659  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2660  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2661  row4l = t1, row4h = t0;
2662 
2663  b0 = _mm_alignr_epi8(m6, m5, 8);
2664  b1 = _mm_unpackhi_epi64(m2, m7);
2665 
2666  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2667  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2668  row4l = _mm_xor_si128(row4l, row1l);
2669  row4h = _mm_xor_si128(row4h, row1h);
2670  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2671  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2672  row3l = _mm_add_epi64(row3l, row4l);
2673  row3h = _mm_add_epi64(row3h, row4h);
2674  row2l = _mm_xor_si128(row2l, row3l);
2675  row2h = _mm_xor_si128(row2h, row3h);
2676  row2l = _mm_shuffle_epi8(row2l, r24);
2677  row2h = _mm_shuffle_epi8(row2h, r24);
2678 
2679  b0 = _mm_unpacklo_epi64(m4, m0);
2680  b1 = _mm_blend_epi16(m1, m6, 0xF0);
2681 
2682  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2683  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2684  row4l = _mm_xor_si128(row4l, row1l);
2685  row4h = _mm_xor_si128(row4h, row1h);
2686  row4l = _mm_shuffle_epi8(row4l, r16);
2687  row4h = _mm_shuffle_epi8(row4h, r16);
2688  row3l = _mm_add_epi64(row3l, row4l);
2689  row3h = _mm_add_epi64(row3h, row4h);
2690  row2l = _mm_xor_si128(row2l, row3l);
2691  row2h = _mm_xor_si128(row2h, row3h);
2692  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2693  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2694 
2695  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2696  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2697  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2698  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2699  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2700  row4l = t1, row4h = t0;
2701 
2702  b0 = _mm_blend_epi16(m5, m1, 0xF0);
2703  b1 = _mm_unpackhi_epi64(m3, m4);
2704 
2705  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2706  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2707  row4l = _mm_xor_si128(row4l, row1l);
2708  row4h = _mm_xor_si128(row4h, row1h);
2709  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2710  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2711  row3l = _mm_add_epi64(row3l, row4l);
2712  row3h = _mm_add_epi64(row3h, row4h);
2713  row2l = _mm_xor_si128(row2l, row3l);
2714  row2h = _mm_xor_si128(row2h, row3h);
2715  row2l = _mm_shuffle_epi8(row2l, r24);
2716  row2h = _mm_shuffle_epi8(row2h, r24);
2717 
2718  b0 = _mm_unpacklo_epi64(m7, m3);
2719  b1 = _mm_alignr_epi8(m2, m0, 8);
2720 
2721  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2722  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2723  row4l = _mm_xor_si128(row4l, row1l);
2724  row4h = _mm_xor_si128(row4h, row1h);
2725  row4l = _mm_shuffle_epi8(row4l, r16);
2726  row4h = _mm_shuffle_epi8(row4h, r16);
2727  row3l = _mm_add_epi64(row3l, row4l);
2728  row3h = _mm_add_epi64(row3h, row4h);
2729  row2l = _mm_xor_si128(row2l, row3l);
2730  row2h = _mm_xor_si128(row2h, row3h);
2731  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2732  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2733 
2734  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2735  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2736  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2737  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2738  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2739  row4l = t1, row4h = t0;
2740 
2741  b0 = _mm_unpackhi_epi64(m3, m1);
2742  b1 = _mm_unpackhi_epi64(m6, m5);
2743 
2744  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2745  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2746  row4l = _mm_xor_si128(row4l, row1l);
2747  row4h = _mm_xor_si128(row4h, row1h);
2748  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2749  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2750  row3l = _mm_add_epi64(row3l, row4l);
2751  row3h = _mm_add_epi64(row3h, row4h);
2752  row2l = _mm_xor_si128(row2l, row3l);
2753  row2h = _mm_xor_si128(row2h, row3h);
2754  row2l = _mm_shuffle_epi8(row2l, r24);
2755  row2h = _mm_shuffle_epi8(row2h, r24);
2756 
2757  b0 = _mm_unpackhi_epi64(m4, m0);
2758  b1 = _mm_unpacklo_epi64(m6, m7);
2759 
2760  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2761  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2762  row4l = _mm_xor_si128(row4l, row1l);
2763  row4h = _mm_xor_si128(row4h, row1h);
2764  row4l = _mm_shuffle_epi8(row4l, r16);
2765  row4h = _mm_shuffle_epi8(row4h, r16);
2766  row3l = _mm_add_epi64(row3l, row4l);
2767  row3h = _mm_add_epi64(row3h, row4h);
2768  row2l = _mm_xor_si128(row2l, row3l);
2769  row2h = _mm_xor_si128(row2h, row3h);
2770  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2771  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2772 
2773  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2774  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2775  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2776  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2777  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2778  row4l = t1, row4h = t0;
2779 
2780  b0 = _mm_blend_epi16(m1, m2, 0xF0);
2781  b1 = _mm_blend_epi16(m2, m7, 0xF0);
2782 
2783  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2784  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2785  row4l = _mm_xor_si128(row4l, row1l);
2786  row4h = _mm_xor_si128(row4h, row1h);
2787  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2788  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2789  row3l = _mm_add_epi64(row3l, row4l);
2790  row3h = _mm_add_epi64(row3h, row4h);
2791  row2l = _mm_xor_si128(row2l, row3l);
2792  row2h = _mm_xor_si128(row2h, row3h);
2793  row2l = _mm_shuffle_epi8(row2l, r24);
2794  row2h = _mm_shuffle_epi8(row2h, r24);
2795 
2796  b0 = _mm_unpacklo_epi64(m3, m5);
2797  b1 = _mm_unpacklo_epi64(m0, m4);
2798 
2799  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2800  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2801  row4l = _mm_xor_si128(row4l, row1l);
2802  row4h = _mm_xor_si128(row4h, row1h);
2803  row4l = _mm_shuffle_epi8(row4l, r16);
2804  row4h = _mm_shuffle_epi8(row4h, r16);
2805  row3l = _mm_add_epi64(row3l, row4l);
2806  row3h = _mm_add_epi64(row3h, row4h);
2807  row2l = _mm_xor_si128(row2l, row3l);
2808  row2h = _mm_xor_si128(row2h, row3h);
2809  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2810  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2811 
2812  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2813  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2814  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2815  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2816  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2817  row4l = t1, row4h = t0;
2818 
2819  b0 = _mm_unpackhi_epi64(m4, m2);
2820  b1 = _mm_unpacklo_epi64(m1, m5);
2821 
2822  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2823  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2824  row4l = _mm_xor_si128(row4l, row1l);
2825  row4h = _mm_xor_si128(row4h, row1h);
2826  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2827  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2828  row3l = _mm_add_epi64(row3l, row4l);
2829  row3h = _mm_add_epi64(row3h, row4h);
2830  row2l = _mm_xor_si128(row2l, row3l);
2831  row2h = _mm_xor_si128(row2h, row3h);
2832  row2l = _mm_shuffle_epi8(row2l, r24);
2833  row2h = _mm_shuffle_epi8(row2h, r24);
2834 
2835  b0 = _mm_blend_epi16(m0, m3, 0xF0);
2836  b1 = _mm_blend_epi16(m2, m7, 0xF0);
2837 
2838  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2839  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2840  row4l = _mm_xor_si128(row4l, row1l);
2841  row4h = _mm_xor_si128(row4h, row1h);
2842  row4l = _mm_shuffle_epi8(row4l, r16);
2843  row4h = _mm_shuffle_epi8(row4h, r16);
2844  row3l = _mm_add_epi64(row3l, row4l);
2845  row3h = _mm_add_epi64(row3h, row4h);
2846  row2l = _mm_xor_si128(row2l, row3l);
2847  row2h = _mm_xor_si128(row2h, row3h);
2848  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2849  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2850 
2851  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2852  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2853  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2854  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2855  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2856  row4l = t1, row4h = t0;
2857 
2858  b0 = _mm_blend_epi16(m7, m5, 0xF0);
2859  b1 = _mm_blend_epi16(m3, m1, 0xF0);
2860 
2861  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2862  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2863  row4l = _mm_xor_si128(row4l, row1l);
2864  row4h = _mm_xor_si128(row4h, row1h);
2865  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2866  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2867  row3l = _mm_add_epi64(row3l, row4l);
2868  row3h = _mm_add_epi64(row3h, row4h);
2869  row2l = _mm_xor_si128(row2l, row3l);
2870  row2h = _mm_xor_si128(row2h, row3h);
2871  row2l = _mm_shuffle_epi8(row2l, r24);
2872  row2h = _mm_shuffle_epi8(row2h, r24);
2873 
2874  b0 = _mm_alignr_epi8(m6, m0, 8);
2875  b1 = _mm_blend_epi16(m4, m6, 0xF0);
2876 
2877  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2878  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2879  row4l = _mm_xor_si128(row4l, row1l);
2880  row4h = _mm_xor_si128(row4h, row1h);
2881  row4l = _mm_shuffle_epi8(row4l, r16);
2882  row4h = _mm_shuffle_epi8(row4h, r16);
2883  row3l = _mm_add_epi64(row3l, row4l);
2884  row3h = _mm_add_epi64(row3h, row4h);
2885  row2l = _mm_xor_si128(row2l, row3l);
2886  row2h = _mm_xor_si128(row2h, row3h);
2887  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2888  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2889 
2890  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2891  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2892  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2893  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2894  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2895  row4l = t1, row4h = t0;
2896 
2897  b0 = _mm_unpacklo_epi64(m1, m3);
2898  b1 = _mm_unpacklo_epi64(m0, m4);
2899 
2900  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2901  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2902  row4l = _mm_xor_si128(row4l, row1l);
2903  row4h = _mm_xor_si128(row4h, row1h);
2904  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2905  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2906  row3l = _mm_add_epi64(row3l, row4l);
2907  row3h = _mm_add_epi64(row3h, row4h);
2908  row2l = _mm_xor_si128(row2l, row3l);
2909  row2h = _mm_xor_si128(row2h, row3h);
2910  row2l = _mm_shuffle_epi8(row2l, r24);
2911  row2h = _mm_shuffle_epi8(row2h, r24);
2912 
2913  b0 = _mm_unpacklo_epi64(m6, m5);
2914  b1 = _mm_unpackhi_epi64(m5, m1);
2915 
2916  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2917  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2918  row4l = _mm_xor_si128(row4l, row1l);
2919  row4h = _mm_xor_si128(row4h, row1h);
2920  row4l = _mm_shuffle_epi8(row4l, r16);
2921  row4h = _mm_shuffle_epi8(row4h, r16);
2922  row3l = _mm_add_epi64(row3l, row4l);
2923  row3h = _mm_add_epi64(row3h, row4h);
2924  row2l = _mm_xor_si128(row2l, row3l);
2925  row2h = _mm_xor_si128(row2h, row3h);
2926  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2927  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2928 
2929  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2930  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2931  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2932  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2933  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2934  row4l = t1, row4h = t0;
2935 
2936  b0 = _mm_blend_epi16(m2, m3, 0xF0);
2937  b1 = _mm_unpackhi_epi64(m7, m0);
2938 
2939  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2940  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2941  row4l = _mm_xor_si128(row4l, row1l);
2942  row4h = _mm_xor_si128(row4h, row1h);
2943  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2944  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2945  row3l = _mm_add_epi64(row3l, row4l);
2946  row3h = _mm_add_epi64(row3h, row4h);
2947  row2l = _mm_xor_si128(row2l, row3l);
2948  row2h = _mm_xor_si128(row2h, row3h);
2949  row2l = _mm_shuffle_epi8(row2l, r24);
2950  row2h = _mm_shuffle_epi8(row2h, r24);
2951 
2952  b0 = _mm_unpackhi_epi64(m6, m2);
2953  b1 = _mm_blend_epi16(m7, m4, 0xF0);
2954 
2955  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2956  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2957  row4l = _mm_xor_si128(row4l, row1l);
2958  row4h = _mm_xor_si128(row4h, row1h);
2959  row4l = _mm_shuffle_epi8(row4l, r16);
2960  row4h = _mm_shuffle_epi8(row4h, r16);
2961  row3l = _mm_add_epi64(row3l, row4l);
2962  row3h = _mm_add_epi64(row3h, row4h);
2963  row2l = _mm_xor_si128(row2l, row3l);
2964  row2h = _mm_xor_si128(row2h, row3h);
2965  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2966  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2967 
2968  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2969  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2970  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2971  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2972  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2973  row4l = t1, row4h = t0;
2974 
2975  b0 = _mm_blend_epi16(m6, m0, 0xF0);
2976  b1 = _mm_unpacklo_epi64(m7, m2);
2977 
2978  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2979  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2980  row4l = _mm_xor_si128(row4l, row1l);
2981  row4h = _mm_xor_si128(row4h, row1h);
2982  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2983  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2984  row3l = _mm_add_epi64(row3l, row4l);
2985  row3h = _mm_add_epi64(row3h, row4h);
2986  row2l = _mm_xor_si128(row2l, row3l);
2987  row2h = _mm_xor_si128(row2h, row3h);
2988  row2l = _mm_shuffle_epi8(row2l, r24);
2989  row2h = _mm_shuffle_epi8(row2h, r24);
2990 
2991  b0 = _mm_unpackhi_epi64(m2, m7);
2992  b1 = _mm_alignr_epi8(m5, m6, 8);
2993 
2994  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2995  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2996  row4l = _mm_xor_si128(row4l, row1l);
2997  row4h = _mm_xor_si128(row4h, row1h);
2998  row4l = _mm_shuffle_epi8(row4l, r16);
2999  row4h = _mm_shuffle_epi8(row4h, r16);
3000  row3l = _mm_add_epi64(row3l, row4l);
3001  row3h = _mm_add_epi64(row3h, row4h);
3002  row2l = _mm_xor_si128(row2l, row3l);
3003  row2h = _mm_xor_si128(row2h, row3h);
3004  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3005  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3006 
3007  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3008  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3009  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3010  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3011  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3012  row4l = t1, row4h = t0;
3013 
3014  b0 = _mm_unpacklo_epi64(m0, m3);
3015  b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
3016 
3017  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3018  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3019  row4l = _mm_xor_si128(row4l, row1l);
3020  row4h = _mm_xor_si128(row4h, row1h);
3021  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3022  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3023  row3l = _mm_add_epi64(row3l, row4l);
3024  row3h = _mm_add_epi64(row3h, row4h);
3025  row2l = _mm_xor_si128(row2l, row3l);
3026  row2h = _mm_xor_si128(row2h, row3h);
3027  row2l = _mm_shuffle_epi8(row2l, r24);
3028  row2h = _mm_shuffle_epi8(row2h, r24);
3029 
3030  b0 = _mm_unpackhi_epi64(m3, m1);
3031  b1 = _mm_blend_epi16(m1, m5, 0xF0);
3032 
3033  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3034  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3035  row4l = _mm_xor_si128(row4l, row1l);
3036  row4h = _mm_xor_si128(row4h, row1h);
3037  row4l = _mm_shuffle_epi8(row4l, r16);
3038  row4h = _mm_shuffle_epi8(row4h, r16);
3039  row3l = _mm_add_epi64(row3l, row4l);
3040  row3h = _mm_add_epi64(row3h, row4h);
3041  row2l = _mm_xor_si128(row2l, row3l);
3042  row2h = _mm_xor_si128(row2h, row3h);
3043  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3044  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3045 
3046  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3047  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3048  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3049  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3050  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3051  row4l = t1, row4h = t0;
3052 
3053  b0 = _mm_unpackhi_epi64(m6, m3);
3054  b1 = _mm_blend_epi16(m6, m1, 0xF0);
3055 
3056  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3057  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3058  row4l = _mm_xor_si128(row4l, row1l);
3059  row4h = _mm_xor_si128(row4h, row1h);
3060  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3061  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3062  row3l = _mm_add_epi64(row3l, row4l);
3063  row3h = _mm_add_epi64(row3h, row4h);
3064  row2l = _mm_xor_si128(row2l, row3l);
3065  row2h = _mm_xor_si128(row2h, row3h);
3066  row2l = _mm_shuffle_epi8(row2l, r24);
3067  row2h = _mm_shuffle_epi8(row2h, r24);
3068 
3069  b0 = _mm_alignr_epi8(m7, m5, 8);
3070  b1 = _mm_unpackhi_epi64(m0, m4);
3071 
3072  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3073  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3074  row4l = _mm_xor_si128(row4l, row1l);
3075  row4h = _mm_xor_si128(row4h, row1h);
3076  row4l = _mm_shuffle_epi8(row4l, r16);
3077  row4h = _mm_shuffle_epi8(row4h, r16);
3078  row3l = _mm_add_epi64(row3l, row4l);
3079  row3h = _mm_add_epi64(row3h, row4h);
3080  row2l = _mm_xor_si128(row2l, row3l);
3081  row2h = _mm_xor_si128(row2h, row3h);
3082  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3083  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3084 
3085  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3086  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3087  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3088  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3089  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3090  row4l = t1, row4h = t0;
3091 
3092  b0 = _mm_unpackhi_epi64(m2, m7);
3093  b1 = _mm_unpacklo_epi64(m4, m1);
3094 
3095  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3096  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3097  row4l = _mm_xor_si128(row4l, row1l);
3098  row4h = _mm_xor_si128(row4h, row1h);
3099  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3100  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3101  row3l = _mm_add_epi64(row3l, row4l);
3102  row3h = _mm_add_epi64(row3h, row4h);
3103  row2l = _mm_xor_si128(row2l, row3l);
3104  row2h = _mm_xor_si128(row2h, row3h);
3105  row2l = _mm_shuffle_epi8(row2l, r24);
3106  row2h = _mm_shuffle_epi8(row2h, r24);
3107 
3108  b0 = _mm_unpacklo_epi64(m0, m2);
3109  b1 = _mm_unpacklo_epi64(m3, m5);
3110 
3111  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3112  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3113  row4l = _mm_xor_si128(row4l, row1l);
3114  row4h = _mm_xor_si128(row4h, row1h);
3115  row4l = _mm_shuffle_epi8(row4l, r16);
3116  row4h = _mm_shuffle_epi8(row4h, r16);
3117  row3l = _mm_add_epi64(row3l, row4l);
3118  row3h = _mm_add_epi64(row3h, row4h);
3119  row2l = _mm_xor_si128(row2l, row3l);
3120  row2h = _mm_xor_si128(row2h, row3h);
3121  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3122  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3123 
3124  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3125  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3126  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3127  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3128  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3129  row4l = t1, row4h = t0;
3130 
3131  b0 = _mm_unpacklo_epi64(m3, m7);
3132  b1 = _mm_alignr_epi8(m0, m5, 8);
3133 
3134  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3135  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3136  row4l = _mm_xor_si128(row4l, row1l);
3137  row4h = _mm_xor_si128(row4h, row1h);
3138  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3139  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3140  row3l = _mm_add_epi64(row3l, row4l);
3141  row3h = _mm_add_epi64(row3h, row4h);
3142  row2l = _mm_xor_si128(row2l, row3l);
3143  row2h = _mm_xor_si128(row2h, row3h);
3144  row2l = _mm_shuffle_epi8(row2l, r24);
3145  row2h = _mm_shuffle_epi8(row2h, r24);
3146 
3147  b0 = _mm_unpackhi_epi64(m7, m4);
3148  b1 = _mm_alignr_epi8(m4, m1, 8);
3149 
3150  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3151  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3152  row4l = _mm_xor_si128(row4l, row1l);
3153  row4h = _mm_xor_si128(row4h, row1h);
3154  row4l = _mm_shuffle_epi8(row4l, r16);
3155  row4h = _mm_shuffle_epi8(row4h, r16);
3156  row3l = _mm_add_epi64(row3l, row4l);
3157  row3h = _mm_add_epi64(row3h, row4h);
3158  row2l = _mm_xor_si128(row2l, row3l);
3159  row2h = _mm_xor_si128(row2h, row3h);
3160  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3161  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3162 
3163  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3164  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3165  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3166  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3167  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3168  row4l = t1, row4h = t0;
3169 
3170  b0 = m6;
3171  b1 = _mm_alignr_epi8(m5, m0, 8);
3172 
3173  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3174  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3175  row4l = _mm_xor_si128(row4l, row1l);
3176  row4h = _mm_xor_si128(row4h, row1h);
3177  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3178  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3179  row3l = _mm_add_epi64(row3l, row4l);
3180  row3h = _mm_add_epi64(row3h, row4h);
3181  row2l = _mm_xor_si128(row2l, row3l);
3182  row2h = _mm_xor_si128(row2h, row3h);
3183  row2l = _mm_shuffle_epi8(row2l, r24);
3184  row2h = _mm_shuffle_epi8(row2h, r24);
3185 
3186  b0 = _mm_blend_epi16(m1, m3, 0xF0);
3187  b1 = m2;
3188 
3189  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3190  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3191  row4l = _mm_xor_si128(row4l, row1l);
3192  row4h = _mm_xor_si128(row4h, row1h);
3193  row4l = _mm_shuffle_epi8(row4l, r16);
3194  row4h = _mm_shuffle_epi8(row4h, r16);
3195  row3l = _mm_add_epi64(row3l, row4l);
3196  row3h = _mm_add_epi64(row3h, row4h);
3197  row2l = _mm_xor_si128(row2l, row3l);
3198  row2h = _mm_xor_si128(row2h, row3h);
3199  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3200  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3201 
3202  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3203  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3204  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3205  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3206  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3207  row4l = t1, row4h = t0;
3208 
3209  b0 = _mm_unpacklo_epi64(m5, m4);
3210  b1 = _mm_unpackhi_epi64(m3, m0);
3211 
3212  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3213  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3214  row4l = _mm_xor_si128(row4l, row1l);
3215  row4h = _mm_xor_si128(row4h, row1h);
3216  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3217  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3218  row3l = _mm_add_epi64(row3l, row4l);
3219  row3h = _mm_add_epi64(row3h, row4h);
3220  row2l = _mm_xor_si128(row2l, row3l);
3221  row2h = _mm_xor_si128(row2h, row3h);
3222  row2l = _mm_shuffle_epi8(row2l, r24);
3223  row2h = _mm_shuffle_epi8(row2h, r24);
3224 
3225  b0 = _mm_unpacklo_epi64(m1, m2);
3226  b1 = _mm_blend_epi16(m3, m2, 0xF0);
3227 
3228  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3229  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3230  row4l = _mm_xor_si128(row4l, row1l);
3231  row4h = _mm_xor_si128(row4h, row1h);
3232  row4l = _mm_shuffle_epi8(row4l, r16);
3233  row4h = _mm_shuffle_epi8(row4h, r16);
3234  row3l = _mm_add_epi64(row3l, row4l);
3235  row3h = _mm_add_epi64(row3h, row4h);
3236  row2l = _mm_xor_si128(row2l, row3l);
3237  row2h = _mm_xor_si128(row2h, row3h);
3238  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3239  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3240 
3241  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3242  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3243  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3244  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3245  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3246  row4l = t1, row4h = t0;
3247 
3248  b0 = _mm_unpackhi_epi64(m7, m4);
3249  b1 = _mm_unpackhi_epi64(m1, m6);
3250 
3251  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3252  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3253  row4l = _mm_xor_si128(row4l, row1l);
3254  row4h = _mm_xor_si128(row4h, row1h);
3255  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3256  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3257  row3l = _mm_add_epi64(row3l, row4l);
3258  row3h = _mm_add_epi64(row3h, row4h);
3259  row2l = _mm_xor_si128(row2l, row3l);
3260  row2h = _mm_xor_si128(row2h, row3h);
3261  row2l = _mm_shuffle_epi8(row2l, r24);
3262  row2h = _mm_shuffle_epi8(row2h, r24);
3263 
3264  b0 = _mm_alignr_epi8(m7, m5, 8);
3265  b1 = _mm_unpacklo_epi64(m6, m0);
3266 
3267  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3268  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3269  row4l = _mm_xor_si128(row4l, row1l);
3270  row4h = _mm_xor_si128(row4h, row1h);
3271  row4l = _mm_shuffle_epi8(row4l, r16);
3272  row4h = _mm_shuffle_epi8(row4h, r16);
3273  row3l = _mm_add_epi64(row3l, row4l);
3274  row3h = _mm_add_epi64(row3h, row4h);
3275  row2l = _mm_xor_si128(row2l, row3l);
3276  row2h = _mm_xor_si128(row2h, row3h);
3277  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3278  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3279 
3280  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3281  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3282  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3283  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3284  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3285  row4l = t1, row4h = t0;
3286 
3287  b0 = _mm_unpacklo_epi64(m0, m1);
3288  b1 = _mm_unpacklo_epi64(m2, m3);
3289 
3290  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3291  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3292  row4l = _mm_xor_si128(row4l, row1l);
3293  row4h = _mm_xor_si128(row4h, row1h);
3294  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3295  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3296  row3l = _mm_add_epi64(row3l, row4l);
3297  row3h = _mm_add_epi64(row3h, row4h);
3298  row2l = _mm_xor_si128(row2l, row3l);
3299  row2h = _mm_xor_si128(row2h, row3h);
3300  row2l = _mm_shuffle_epi8(row2l, r24);
3301  row2h = _mm_shuffle_epi8(row2h, r24);
3302 
3303  b0 = _mm_unpackhi_epi64(m0, m1);
3304  b1 = _mm_unpackhi_epi64(m2, m3);
3305 
3306  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3307  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3308  row4l = _mm_xor_si128(row4l, row1l);
3309  row4h = _mm_xor_si128(row4h, row1h);
3310  row4l = _mm_shuffle_epi8(row4l, r16);
3311  row4h = _mm_shuffle_epi8(row4h, r16);
3312  row3l = _mm_add_epi64(row3l, row4l);
3313  row3h = _mm_add_epi64(row3h, row4h);
3314  row2l = _mm_xor_si128(row2l, row3l);
3315  row2h = _mm_xor_si128(row2h, row3h);
3316  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3317  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3318 
3319  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3320  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3321  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3322  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3323  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3324  row4l = t1, row4h = t0;
3325 
3326  b0 = _mm_unpacklo_epi64(m4, m5);
3327  b1 = _mm_unpacklo_epi64(m6, m7);
3328 
3329  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3330  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3331  row4l = _mm_xor_si128(row4l, row1l);
3332  row4h = _mm_xor_si128(row4h, row1h);
3333  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3334  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3335  row3l = _mm_add_epi64(row3l, row4l);
3336  row3h = _mm_add_epi64(row3h, row4h);
3337  row2l = _mm_xor_si128(row2l, row3l);
3338  row2h = _mm_xor_si128(row2h, row3h);
3339  row2l = _mm_shuffle_epi8(row2l, r24);
3340  row2h = _mm_shuffle_epi8(row2h, r24);
3341 
3342  b0 = _mm_unpackhi_epi64(m4, m5);
3343  b1 = _mm_unpackhi_epi64(m6, m7);
3344 
3345  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3346  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3347  row4l = _mm_xor_si128(row4l, row1l);
3348  row4h = _mm_xor_si128(row4h, row1h);
3349  row4l = _mm_shuffle_epi8(row4l, r16);
3350  row4h = _mm_shuffle_epi8(row4h, r16);
3351  row3l = _mm_add_epi64(row3l, row4l);
3352  row3h = _mm_add_epi64(row3h, row4h);
3353  row2l = _mm_xor_si128(row2l, row3l);
3354  row2h = _mm_xor_si128(row2h, row3h);
3355  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3356  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3357 
3358  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3359  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3360  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3361  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3362  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3363  row4l = t1, row4h = t0;
3364 
3365  b0 = _mm_unpacklo_epi64(m7, m2);
3366  b1 = _mm_unpackhi_epi64(m4, m6);
3367 
3368  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3369  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3370  row4l = _mm_xor_si128(row4l, row1l);
3371  row4h = _mm_xor_si128(row4h, row1h);
3372  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3373  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3374  row3l = _mm_add_epi64(row3l, row4l);
3375  row3h = _mm_add_epi64(row3h, row4h);
3376  row2l = _mm_xor_si128(row2l, row3l);
3377  row2h = _mm_xor_si128(row2h, row3h);
3378  row2l = _mm_shuffle_epi8(row2l, r24);
3379  row2h = _mm_shuffle_epi8(row2h, r24);
3380 
3381  b0 = _mm_unpacklo_epi64(m5, m4);
3382  b1 = _mm_alignr_epi8(m3, m7, 8);
3383 
3384  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3385  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3386  row4l = _mm_xor_si128(row4l, row1l);
3387  row4h = _mm_xor_si128(row4h, row1h);
3388  row4l = _mm_shuffle_epi8(row4l, r16);
3389  row4h = _mm_shuffle_epi8(row4h, r16);
3390  row3l = _mm_add_epi64(row3l, row4l);
3391  row3h = _mm_add_epi64(row3h, row4h);
3392  row2l = _mm_xor_si128(row2l, row3l);
3393  row2h = _mm_xor_si128(row2h, row3h);
3394  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3395  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3396 
3397  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3398  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3399  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3400  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3401  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3402  row4l = t1, row4h = t0;
3403 
3404  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
3405  b1 = _mm_unpackhi_epi64(m5, m2);
3406 
3407  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3408  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3409  row4l = _mm_xor_si128(row4l, row1l);
3410  row4h = _mm_xor_si128(row4h, row1h);
3411  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3412  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3413  row3l = _mm_add_epi64(row3l, row4l);
3414  row3h = _mm_add_epi64(row3h, row4h);
3415  row2l = _mm_xor_si128(row2l, row3l);
3416  row2h = _mm_xor_si128(row2h, row3h);
3417  row2l = _mm_shuffle_epi8(row2l, r24);
3418  row2h = _mm_shuffle_epi8(row2h, r24);
3419 
3420  b0 = _mm_unpacklo_epi64(m6, m1);
3421  b1 = _mm_unpackhi_epi64(m3, m1);
3422 
3423  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3424  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3425  row4l = _mm_xor_si128(row4l, row1l);
3426  row4h = _mm_xor_si128(row4h, row1h);
3427  row4l = _mm_shuffle_epi8(row4l, r16);
3428  row4h = _mm_shuffle_epi8(row4h, r16);
3429  row3l = _mm_add_epi64(row3l, row4l);
3430  row3h = _mm_add_epi64(row3h, row4h);
3431  row2l = _mm_xor_si128(row2l, row3l);
3432  row2h = _mm_xor_si128(row2h, row3h);
3433  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3434  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3435 
3436  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3437  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3438  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3439  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3440  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3441  row4l = t1, row4h = t0;
3442 
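 // Finalization: fold the two state halves into the chaining value,
 // h[i] ^= v[i] ^ v[i+8].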
3443  row1l = _mm_xor_si128(row3l, row1l);
3444  row1h = _mm_xor_si128(row3h, row1h);
3445  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
3446  _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
3447 
3448  row2l = _mm_xor_si128(row4l, row2l);
3449  row2h = _mm_xor_si128(row4h, row2h);
3450  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
3451  _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
3452 }
3453 #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
3454 
3455 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
3456 
3457 // Reverse words for ARM (use arguments to _mm_set_epi32 without reversing them).
3458 #define vld1q_u32_rev(x, a,b,c,d) d[1]=c[0],d[2]=b[0],d[3]=a[0]; x = vld1q_u32(d);
3459 
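 // For example, vld1q_u32_rev(buf1, m6,m4,m2,m0) expands to
 //   m0[1]=m2[0], m0[2]=m4[0], m0[3]=m6[0]; buf1 = vld1q_u32(m0);
 // so buf1 carries {m0[0], m2[0], m4[0], m6[0]} -- the lane order of
 // _mm_set_epi32(m6[0],m4[0],m2[0],m0[0]). Only element [0] of each scratch
 // array is read elsewhere, so overwriting elements [1..3] is harmless.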
3460 // Keep things straight due to swapping. For a 128-bit vector, H64 denotes
3461 // the high 64-bit vector, and L64 denotes the low 64-bit vector. The
3462 // vectors are the same as returned by vget_high_u64 and vget_low_u64.
3463 static const int LANE_H64 = 1;
3464 static const int LANE_L64 = 0;
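 // For example, lane LANE_L64 of a uint64x2_t is the element returned by
 // vget_low_u64, and lane LANE_H64 the element returned by vget_high_u64;
 // vgetq_lane_u64(v, LANE_H64) therefore reads the high 64-bit half of v.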
3465 
3466 static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
3467 {
3468  //CRYPTOPP_ASSERT(IsAlignedOn(input,GetAlignmentOf<uint8_t*>()));
3469  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint32x4_t>()));
3470  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[4],GetAlignmentOf<uint32x4_t>()));
3471  CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint32x4_t>()));
3472 
3473  CRYPTOPP_ALIGN_DATA(16) uint32_t m0[4], m1[4], m2[4], m3[4], m4[4], m5[4], m6[4], m7[4];
3474  CRYPTOPP_ALIGN_DATA(16) uint32_t m8[4], m9[4], m10[4], m11[4], m12[4], m13[4], m14[4], m15[4];
3475 
3476  GetBlock<word32, LittleEndian, true> get(input);
3477  get(m0[0])(m1[0])(m2[0])(m3[0])(m4[0])(m5[0])(m6[0])(m7[0])(m8[0])(m9[0])(m10[0])(m11[0])(m12[0])(m13[0])(m14[0])(m15[0]);
3478 
3479  uint32x4_t row1,row2,row3,row4;
3480  uint32x4_t buf1,buf2,buf3,buf4;
3481  uint32x4_t ff0,ff1;
3482 
3483  row1 = ff0 = vld1q_u32((const uint32_t*)&state.h[0]);
3484  row2 = ff1 = vld1q_u32((const uint32_t*)&state.h[4]);
3485  row3 = vld1q_u32((const uint32_t*)&BLAKE2S_IV(0));
3486  row4 = veorq_u32(vld1q_u32((const uint32_t*)&BLAKE2S_IV(4)), vld1q_u32((const uint32_t*)&state.t[0]));
3487 
3488  // buf1 = vld1q_u32(m6,m4,m2,m0);
3489  vld1q_u32_rev(buf1, m6,m4,m2,m0);
3490 
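 // NEON has no vector rotate for 32-bit lanes, so each rotation is a
 // shift-right/shift-left pair: 16 and 12 in the first half of G, 8 and 7
 // in the second.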
3491  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3492  row4 = veorq_u32(row4,row1);
3493  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3494  row3 = vaddq_u32(row3,row4);
3495  row2 = veorq_u32(row2,row3);
3496  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3497 
3498  // buf2 = vld1q_u32(m7,m5,m3,m1);
3499  vld1q_u32_rev(buf2, m7,m5,m3,m1);
3500 
3501  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3502  row4 = veorq_u32(row4,row1);
3503  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3504  row3 = vaddq_u32(row3,row4);
3505  row2 = veorq_u32(row2,row3);
3506  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3507 
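 // Diagonalize: vextq_u32 rotates the lanes of rows 2 and 4; the vcombine
 // swaps the halves of row 3.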
3508  row4 = vextq_u32(row4,row4,3);
3509  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3510  row2 = vextq_u32(row2,row2,1);
3511 
3512  // buf3 = vld1q_u32(m14,m12,m10,m8);
3513  vld1q_u32_rev(buf3, m14,m12,m10,m8);
3514 
3515  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3516  row4 = veorq_u32(row4,row1);
3517  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3518  row3 = vaddq_u32(row3,row4);
3519  row2 = veorq_u32(row2,row3);
3520  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3521 
3522  // buf4 = vld1q_u32(m15,m13,m11,m9);
3523  vld1q_u32_rev(buf4, m15,m13,m11,m9);
3524 
3525  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3526  row4 = veorq_u32(row4,row1);
3527  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3528  row3 = vaddq_u32(row3,row4);
3529  row2 = veorq_u32(row2,row3);
3530  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3531 
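 // Un-diagonalize: rotate the lanes back into column order.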
3532  row4 = vextq_u32(row4,row4,1);
3533  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3534  row2 = vextq_u32(row2,row2,3);
3535 
3536  // buf1 = vld1q_u32(m13,m9,m4,m14);
3537  vld1q_u32_rev(buf1, m13,m9,m4,m14);
3538 
3539  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3540  row4 = veorq_u32(row4,row1);
3541  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3542  row3 = vaddq_u32(row3,row4);
3543  row2 = veorq_u32(row2,row3);
3544  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3545 
3546  // buf2 = vld1q_u32(m6,m15,m8,m10);
3547  vld1q_u32_rev(buf2, m6,m15,m8,m10);
3548 
3549  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3550  row4 = veorq_u32(row4,row1);
3551  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3552  row3 = vaddq_u32(row3,row4);
3553  row2 = veorq_u32(row2,row3);
3554  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3555 
3556  row4 = vextq_u32(row4,row4,3);
3557  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3558  row2 = vextq_u32(row2,row2,1);
3559 
3560  // buf3 = vld1q_u32(m5,m11,m0,m1);
3561  vld1q_u32_rev(buf3, m5,m11,m0,m1);
3562 
3563  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3564  row4 = veorq_u32(row4,row1);
3565  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3566  row3 = vaddq_u32(row3,row4);
3567  row2 = veorq_u32(row2,row3);
3568  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3569 
3570  // buf4 = vld1q_u32(m3,m7,m2,m12);
3571  vld1q_u32_rev(buf4, m3,m7,m2,m12);
3572 
3573  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3574  row4 = veorq_u32(row4,row1);
3575  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3576  row3 = vaddq_u32(row3,row4);
3577  row2 = veorq_u32(row2,row3);
3578  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3579 
3580  row4 = vextq_u32(row4,row4,1);
3581  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3582  row2 = vextq_u32(row2,row2,3);
3583 
3584  // buf1 = vld1q_u32(m15,m5,m12,m11);
3585  vld1q_u32_rev(buf1, m15,m5,m12,m11);
3586 
3587  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3588  row4 = veorq_u32(row4,row1);
3589  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3590  row3 = vaddq_u32(row3,row4);
3591  row2 = veorq_u32(row2,row3);
3592  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3593 
3594  // buf2 = vld1q_u32(m13,m2,m0,m8);
3595  vld1q_u32_rev(buf2, m13,m2,m0,m8);
3596 
3597  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3598  row4 = veorq_u32(row4,row1);
3599  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3600  row3 = vaddq_u32(row3,row4);
3601  row2 = veorq_u32(row2,row3);
3602  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3603 
3604  row4 = vextq_u32(row4,row4,3);
3605  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3606  row2 = vextq_u32(row2,row2,1);
3607 
3608  // buf3 = vld1q_u32(m9,m7,m3,m10);
3609  vld1q_u32_rev(buf3, m9,m7,m3,m10);
3610 
3611  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3612  row4 = veorq_u32(row4,row1);
3613  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3614  row3 = vaddq_u32(row3,row4);
3615  row2 = veorq_u32(row2,row3);
3616  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3617 
3618  // buf4 = vld1q_u32(m4,m1,m6,m14);
3619  vld1q_u32_rev(buf4, m4,m1,m6,m14);
3620 
3621  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3622  row4 = veorq_u32(row4,row1);
3623  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3624  row3 = vaddq_u32(row3,row4);
3625  row2 = veorq_u32(row2,row3);
3626  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3627 
3628  row4 = vextq_u32(row4,row4,1);
3629  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3630  row2 = vextq_u32(row2,row2,3);
3631 
3632  // buf1 = vld1q_u32(m11,m13,m3,m7);
3633  vld1q_u32_rev(buf1, m11,m13,m3,m7);
3634 
3635  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3636  row4 = veorq_u32(row4,row1);
3637  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3638  row3 = vaddq_u32(row3,row4);
3639  row2 = veorq_u32(row2,row3);
3640  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3641 
3642  // buf2 = vld1q_u32(m14,m12,m1,m9);
3643  vld1q_u32_rev(buf2, m14,m12,m1,m9);
3644 
3645  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3646  row4 = veorq_u32(row4,row1);
3647  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3648  row3 = vaddq_u32(row3,row4);
3649  row2 = veorq_u32(row2,row3);
3650  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3651 
3652  row4 = vextq_u32(row4,row4,3);
3653  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3654  row2 = vextq_u32(row2,row2,1);
3655 
3656  // buf3 = vld1q_u32(m15,m4,m5,m2);
3657  vld1q_u32_rev(buf3, m15,m4,m5,m2);
3658 
3659  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3660  row4 = veorq_u32(row4,row1);
3661  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3662  row3 = vaddq_u32(row3,row4);
3663  row2 = veorq_u32(row2,row3);
3664  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3665 
3666  // buf4 = vld1q_u32(m8,m0,m10,m6);
3667  vld1q_u32_rev(buf4, m8,m0,m10,m6);
3668 
3669  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3670  row4 = veorq_u32(row4,row1);
3671  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3672  row3 = vaddq_u32(row3,row4);
3673  row2 = veorq_u32(row2,row3);
3674  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3675 
3676  row4 = vextq_u32(row4,row4,1);
3677  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3678  row2 = vextq_u32(row2,row2,3);
3679 
3680  // buf1 = vld1q_u32(m10,m2,m5,m9);
3681  vld1q_u32_rev(buf1, m10,m2,m5,m9);
3682 
3683  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3684  row4 = veorq_u32(row4,row1);
3685  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3686  row3 = vaddq_u32(row3,row4);
3687  row2 = veorq_u32(row2,row3);
3688  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3689 
3690  // buf2 = vld1q_u32(m15,m4,m7,m0);
3691  vld1q_u32_rev(buf2, m15,m4,m7,m0);
3692 
3693  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3694  row4 = veorq_u32(row4,row1);
3695  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3696  row3 = vaddq_u32(row3,row4);
3697  row2 = veorq_u32(row2,row3);
3698  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3699 
3700  row4 = vextq_u32(row4,row4,3);
3701  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3702  row2 = vextq_u32(row2,row2,1);
3703 
3704  // buf3 = vld1q_u32(m3,m6,m11,m14);
3705  vld1q_u32_rev(buf3, m3,m6,m11,m14);
3706 
3707  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3708  row4 = veorq_u32(row4,row1);
3709  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3710  row3 = vaddq_u32(row3,row4);
3711  row2 = veorq_u32(row2,row3);
3712  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3713 
3714  // buf4 = vld1q_u32(m13,m8,m12,m1);
3715  vld1q_u32_rev(buf4, m13,m8,m12,m1);
3716 
3717  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3718  row4 = veorq_u32(row4,row1);
3719  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3720  row3 = vaddq_u32(row3,row4);
3721  row2 = veorq_u32(row2,row3);
3722  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3723 
3724  row4 = vextq_u32(row4,row4,1);
3725  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3726  row2 = vextq_u32(row2,row2,3);
3727 
3728  // buf1 = vld1q_u32(m8,m0,m6,m2);
3729  vld1q_u32_rev(buf1, m8,m0,m6,m2);
3730 
3731  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3732  row4 = veorq_u32(row4,row1);
3733  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3734  row3 = vaddq_u32(row3,row4);
3735  row2 = veorq_u32(row2,row3);
3736  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3737 
3738  // buf2 = vld1q_u32(m3,m11,m10,m12);
3739  vld1q_u32_rev(buf2, m3,m11,m10,m12);
3740 
3741  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3742  row4 = veorq_u32(row4,row1);
3743  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3744  row3 = vaddq_u32(row3,row4);
3745  row2 = veorq_u32(row2,row3);
3746  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3747 
3748  row4 = vextq_u32(row4,row4,3);
3749  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3750  row2 = vextq_u32(row2,row2,1);
3751 
3752  // buf3 = vld1q_u32(m1,m15,m7,m4);
3753  vld1q_u32_rev(buf3, m1,m15,m7,m4);
3754 
3755  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3756  row4 = veorq_u32(row4,row1);
3757  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3758  row3 = vaddq_u32(row3,row4);
3759  row2 = veorq_u32(row2,row3);
3760  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3761 
3762  // buf4 = vld1q_u32(m9,m14,m5,m13);
3763  vld1q_u32_rev(buf4, m9,m14,m5,m13);
3764 
3765  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3766  row4 = veorq_u32(row4,row1);
3767  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3768  row3 = vaddq_u32(row3,row4);
3769  row2 = veorq_u32(row2,row3);
3770  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3771 
3772  row4 = vextq_u32(row4,row4,1);
3773  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3774  row2 = vextq_u32(row2,row2,3);
3775 
3776  // buf1 = vld1q_u32(m4,m14,m1,m12);
3777  vld1q_u32_rev(buf1, m4,m14,m1,m12);
3778 
3779  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3780  row4 = veorq_u32(row4,row1);
3781  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3782  row3 = vaddq_u32(row3,row4);
3783  row2 = veorq_u32(row2,row3);
3784  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3785 
3786  // buf2 = vld1q_u32(m10,m13,m15,m5);
3787  vld1q_u32_rev(buf2, m10,m13,m15,m5);
3788 
3789  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3790  row4 = veorq_u32(row4,row1);
3791  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3792  row3 = vaddq_u32(row3,row4);
3793  row2 = veorq_u32(row2,row3);
3794  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3795 
3796  row4 = vextq_u32(row4,row4,3);
3797  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3798  row2 = vextq_u32(row2,row2,1);
3799 
3800  // buf3 = vld1q_u32(m8,m9,m6,m0);
3801  vld1q_u32_rev(buf3, m8,m9,m6,m0);
3802 
3803  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3804  row4 = veorq_u32(row4,row1);
3805  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3806  row3 = vaddq_u32(row3,row4);
3807  row2 = veorq_u32(row2,row3);
3808  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3809 
3810  // buf4 = vld1q_u32(m11,m2,m3,m7);
3811  vld1q_u32_rev(buf4, m11,m2,m3,m7);
3812 
3813  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3814  row4 = veorq_u32(row4,row1);
3815  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3816  row3 = vaddq_u32(row3,row4);
3817  row2 = veorq_u32(row2,row3);
3818  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3819 
3820  row4 = vextq_u32(row4,row4,1);
3821  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3822  row2 = vextq_u32(row2,row2,3);
3823 
3824  // buf1 = vld1q_u32(m3,m12,m7,m13);
3825  vld1q_u32_rev(buf1, m3,m12,m7,m13);
3826 
3827  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3828  row4 = veorq_u32(row4,row1);
3829  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3830  row3 = vaddq_u32(row3,row4);
3831  row2 = veorq_u32(row2,row3);
3832  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3833 
3834  // buf2 = vld1q_u32(m9,m1,m14,m11);
3835  vld1q_u32_rev(buf2, m9,m1,m14,m11);
3836 
3837  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3838  row4 = veorq_u32(row4,row1);
3839  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3840  row3 = vaddq_u32(row3,row4);
3841  row2 = veorq_u32(row2,row3);
3842  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3843 
3844  row4 = vextq_u32(row4,row4,3);
3845  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3846  row2 = vextq_u32(row2,row2,1);
3847 
3848  // buf3 = vld1q_u32(m2,m8,m15,m5);
3849  vld1q_u32_rev(buf3, m2,m8,m15,m5);
3850 
3851  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3852  row4 = veorq_u32(row4,row1);
3853  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3854  row3 = vaddq_u32(row3,row4);
3855  row2 = veorq_u32(row2,row3);
3856  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3857 
3858  // buf4 = vld1q_u32(m10,m6,m4,m0);
3859  vld1q_u32_rev(buf4, m10,m6,m4,m0);
3860 
3861  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3862  row4 = veorq_u32(row4,row1);
3863  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3864  row3 = vaddq_u32(row3,row4);
3865  row2 = veorq_u32(row2,row3);
3866  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3867 
3868  row4 = vextq_u32(row4,row4,1);
3869  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3870  row2 = vextq_u32(row2,row2,3);
3871 
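// Round 9 of 10 (sigma row 8)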
3872  // buf1 = vld1q_u32(m0,m11,m14,m6);
3873  vld1q_u32_rev(buf1, m0,m11,m14,m6);
3874 
3875  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3876  row4 = veorq_u32(row4,row1);
3877  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3878  row3 = vaddq_u32(row3,row4);
3879  row2 = veorq_u32(row2,row3);
3880  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3881 
3882  // buf2 = vld1q_u32(m8,m3,m9,m15);
3883  vld1q_u32_rev(buf2, m8,m3,m9,m15);
3884 
3885  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3886  row4 = veorq_u32(row4,row1);
3887  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3888  row3 = vaddq_u32(row3,row4);
3889  row2 = veorq_u32(row2,row3);
3890  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3891 
3892  row4 = vextq_u32(row4,row4,3);
3893  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3894  row2 = vextq_u32(row2,row2,1);
3895 
3896  // buf3 = vld1q_u32(m10,m1,m13,m12);
3897  vld1q_u32_rev(buf3, m10,m1,m13,m12);
3898 
3899  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3900  row4 = veorq_u32(row4,row1);
3901  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3902  row3 = vaddq_u32(row3,row4);
3903  row2 = veorq_u32(row2,row3);
3904  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3905 
3906  // buf4 = vld1q_u32(m5,m4,m7,m2);
3907  vld1q_u32_rev(buf4, m5,m4,m7,m2);
3908 
3909  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3910  row4 = veorq_u32(row4,row1);
3911  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3912  row3 = vaddq_u32(row3,row4);
3913  row2 = veorq_u32(row2,row3);
3914  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3915 
3916  row4 = vextq_u32(row4,row4,1);
3917  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3918  row2 = vextq_u32(row2,row2,3);
3919 
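// Round 10 of 10 (sigma row 9)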
3920  // buf1 = vld1q_u32(m1,m7,m8,m10);
3921  vld1q_u32_rev(buf1, m1,m7,m8,m10);
3922 
3923  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3924  row4 = veorq_u32(row4,row1);
3925  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3926  row3 = vaddq_u32(row3,row4);
3927  row2 = veorq_u32(row2,row3);
3928  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3929 
3930  // buf2 = vld1q_u32(m5,m6,m4,m2);
3931  vld1q_u32_rev(buf2, m5,m6,m4,m2);
3932 
3933  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3934  row4 = veorq_u32(row4,row1);
3935  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3936  row3 = vaddq_u32(row3,row4);
3937  row2 = veorq_u32(row2,row3);
3938  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3939 
3940  row4 = vextq_u32(row4,row4,3);
3941  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3942  row2 = vextq_u32(row2,row2,1);
3943 
3944  // buf3 = vld1q_u32(m13,m3,m9,m15);
3945  vld1q_u32_rev(buf3, m13,m3,m9,m15);
3946 
3947  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3948  row4 = veorq_u32(row4,row1);
3949  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3950  row3 = vaddq_u32(row3,row4);
3951  row2 = veorq_u32(row2,row3);
3952  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3953 
3954  // buf4 = vld1q_u32(m0,m12,m14,m11);
3955  vld1q_u32_rev(buf4, m0,m12,m14,m11);
3956 
3957  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3958  row4 = veorq_u32(row4,row1);
3959  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3960  row3 = vaddq_u32(row3,row4);
3961  row2 = veorq_u32(row2,row3);
3962  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3963 
3964  row4 = vextq_u32(row4,row4,1);
3965  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3966  row2 = vextq_u32(row2,row2,3);
3967 
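// Feed-forward into the chaining value: h[0..3] = ff0 ^ row1 ^ row3 and
// h[4..7] = ff1 ^ row2 ^ row4, where ff0/ff1 hold the state loaded at entry.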
3968  vst1q_u32((uint32_t*)&state.h[0],veorq_u32(ff0,veorq_u32(row1,row3)));
3969  vst1q_u32((uint32_t*)&state.h[4],veorq_u32(ff1,veorq_u32(row2,row4)));
3970 }
3971 
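// For orientation, a scalar sketch of the BLAKE2b G function that the NEON
// code below evaluates two columns (or two diagonals) at a time; rotr64 is
// a hypothetical helper for this comment, not a function in this file:
//
//   a += b + m0;  d = rotr64(d ^ a, 32);
//   c += d;       b = rotr64(b ^ c, 24);
//   a += b + m1;  d = rotr64(d ^ a, 16);
//   c += d;       b = rotr64(b ^ c, 63);
//
// Each rotr64(x,n) appears below as
// veorq_u64(vshrq_n_u64(x,n), vshlq_n_u64(x,64-n)).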
3972 static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
3973 {
3974  //CRYPTOPP_ASSERT(IsAlignedOn(input,GetAlignmentOf<uint8_t*>())); // disabled: input may be unaligned; vld1q_u8 handles unaligned loads
3975  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint64x2_t>()));
3976  CRYPTOPP_ASSERT(IsAlignedOn(&state.h[4],GetAlignmentOf<uint64x2_t>()));
3977  CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint64x2_t>()));
3978 
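// Load the 128-byte message block into eight uint64x2_t registers, two
// 64-bit message words per register (m0m1 holds words 0 and 1, and so on).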
3979  uint64x2_t m0m1,m2m3,m4m5,m6m7,m8m9,m10m11,m12m13,m14m15;
3980 
3981  m0m1 = vreinterpretq_u64_u8(vld1q_u8(input+ 0));
3982  m2m3 = vreinterpretq_u64_u8(vld1q_u8(input+ 16));
3983  m4m5 = vreinterpretq_u64_u8(vld1q_u8(input+ 32));
3984  m6m7 = vreinterpretq_u64_u8(vld1q_u8(input+ 48));
3985  m8m9 = vreinterpretq_u64_u8(vld1q_u8(input+ 64));
3986  m10m11 = vreinterpretq_u64_u8(vld1q_u8(input+ 80));
3987  m12m13 = vreinterpretq_u64_u8(vld1q_u8(input+ 96));
3988  m14m15 = vreinterpretq_u64_u8(vld1q_u8(input+112));
3989 
3990  uint64x2_t row1l, row1h, row2l, row2h;
3991  uint64x2_t row3l, row3h, row4l, row4h;
3992  uint64x2_t b0 = {0,0}, b1 = {0,0}, t0, t1;
3993 
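// Initialize the four work rows: rows 1-2 from the chaining value h, row 3
// from the BLAKE2b IV, and row 4 from the IV XORed with the byte counter t
// and the finalization flags f.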
3994  row1l = vld1q_u64((const uint64_t *)&state.h[0]);
3995  row1h = vld1q_u64((const uint64_t *)&state.h[2]);
3996  row2l = vld1q_u64((const uint64_t *)&state.h[4]);
3997  row2h = vld1q_u64((const uint64_t *)&state.h[6]);
3998  row3l = vld1q_u64((const uint64_t *)&BLAKE2B_IV(0));
3999  row3h = vld1q_u64((const uint64_t *)&BLAKE2B_IV(2));
4000  row4l = veorq_u64(vld1q_u64((const uint64_t *)&BLAKE2B_IV(4)), vld1q_u64((const uint64_t*)&state.t[0]));
4001  row4h = veorq_u64(vld1q_u64((const uint64_t *)&BLAKE2B_IV(6)), vld1q_u64((const uint64_t*)&state.f[0]));
4002 
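// Round 1 of 12 (sigma row 0). b0/b1 gather the scheduled message words for
// two columns at a time.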
4003  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4004  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4005  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4006  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4007  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4008  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4009  row4l = veorq_u64(row4l, row1l);
4010  row4h = veorq_u64(row4h, row1h);
4011  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4012  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4013  row3l = vaddq_u64(row3l, row4l);
4014  row3h = vaddq_u64(row3h, row4h);
4015  row2l = veorq_u64(row2l, row3l);
4016  row2h = veorq_u64(row2h, row3h);
4017  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4018  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4019 
4020  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4021  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4022  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4023  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4024  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4025  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4026  row4l = veorq_u64(row4l, row1l);
4027  row4h = veorq_u64(row4h, row1h);
4028  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4029  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4030  row3l = vaddq_u64(row3l, row4l);
4031  row3h = vaddq_u64(row3h, row4h);
4032  row2l = veorq_u64(row2l, row3l);
4033  row2h = veorq_u64(row2h, row3h);
4034  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4035  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4036 
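// Diagonalize: shuffle rows 2-4 across the low/high register pairs so the
// next two G-steps operate on the matrix diagonals.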
4037  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4038  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4039  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4040  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4041  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4042  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4043  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4044  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4045  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4046 
4047  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4048  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4049  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4050  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4051  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4052  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4053  row4l = veorq_u64(row4l, row1l);
4054  row4h = veorq_u64(row4h, row1h);
4055  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4056  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4057  row3l = vaddq_u64(row3l, row4l);
4058  row3h = vaddq_u64(row3h, row4h);
4059  row2l = veorq_u64(row2l, row3l);
4060  row2h = veorq_u64(row2h, row3h);
4061  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4062  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4063 
4064  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4065  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4066  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4067  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4068  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4069  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4070  row4l = veorq_u64(row4l, row1l);
4071  row4h = veorq_u64(row4h, row1h);
4072  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4073  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4074  row3l = vaddq_u64(row3l, row4l);
4075  row3h = vaddq_u64(row3h, row4h);
4076  row2l = veorq_u64(row2l, row3l);
4077  row2h = veorq_u64(row2h, row3h);
4078  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4079  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4080 
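// Undiagonalize: restore the rows to column order.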
4081  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4082  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4083  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4084  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4085  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4086  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4087  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4088  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4089  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4090 
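// Round 2 of 12 (sigma row 1)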
4091  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4092  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4093  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4094  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4095  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4096  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4097  row4l = veorq_u64(row4l, row1l);
4098  row4h = veorq_u64(row4h, row1h);
4099  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4100  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4101  row3l = vaddq_u64(row3l, row4l);
4102  row3h = vaddq_u64(row3h, row4h);
4103  row2l = veorq_u64(row2l, row3l);
4104  row2h = veorq_u64(row2h, row3h);
4105  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4106  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4107 
4108  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4109  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4110  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4111  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4112  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4113  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4114  row4l = veorq_u64(row4l, row1l);
4115  row4h = veorq_u64(row4h, row1h);
4116  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4117  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4118  row3l = vaddq_u64(row3l, row4l);
4119  row3h = vaddq_u64(row3h, row4h);
4120  row2l = veorq_u64(row2l, row3l);
4121  row2h = veorq_u64(row2h, row3h);
4122  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4123  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4124 
4125  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4126  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4127  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4128  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4129  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4130  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4131  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4132  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4133  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4134 
4135  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4136  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4137  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4138  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4139  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4140  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4141  row4l = veorq_u64(row4l, row1l);
4142  row4h = veorq_u64(row4h, row1h);
4143  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4144  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4145  row3l = vaddq_u64(row3l, row4l);
4146  row3h = vaddq_u64(row3h, row4h);
4147  row2l = veorq_u64(row2l, row3l);
4148  row2h = veorq_u64(row2h, row3h);
4149  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4150  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4151 
4152  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4153  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4154  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4155  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4156  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4157  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4158  row4l = veorq_u64(row4l, row1l);
4159  row4h = veorq_u64(row4h, row1h);
4160  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4161  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4162  row3l = vaddq_u64(row3l, row4l);
4163  row3h = vaddq_u64(row3h, row4h);
4164  row2l = veorq_u64(row2l, row3l);
4165  row2h = veorq_u64(row2h, row3h);
4166  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4167  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4168 
4169  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4170  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4171  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4172  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4173  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4174  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4175  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4176  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4177  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4178 
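// Round 3 of 12 (sigma row 2)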
4179  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4180  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4181  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4182  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4183  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4184  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4185  row4l = veorq_u64(row4l, row1l);
4186  row4h = veorq_u64(row4h, row1h);
4187  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4188  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4189  row3l = vaddq_u64(row3l, row4l);
4190  row3h = vaddq_u64(row3h, row4h);
4191  row2l = veorq_u64(row2l, row3l);
4192  row2h = veorq_u64(row2h, row3h);
4193  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4194  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4195 
4196  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4197  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4198  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4199  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4200  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4201  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4202  row4l = veorq_u64(row4l, row1l);
4203  row4h = veorq_u64(row4h, row1h);
4204  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4205  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4206  row3l = vaddq_u64(row3l, row4l);
4207  row3h = vaddq_u64(row3h, row4h);
4208  row2l = veorq_u64(row2l, row3l);
4209  row2h = veorq_u64(row2h, row3h);
4210  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4211  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4212 
4213  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4214  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4215  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4216  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4217  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4218  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4219  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4220  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4221  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4222 
4223  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4224  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4225  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4226  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4227  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4228  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4229  row4l = veorq_u64(row4l, row1l);
4230  row4h = veorq_u64(row4h, row1h);
4231  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4232  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4233  row3l = vaddq_u64(row3l, row4l);
4234  row3h = vaddq_u64(row3h, row4h);
4235  row2l = veorq_u64(row2l, row3l);
4236  row2h = veorq_u64(row2h, row3h);
4237  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4238  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4239 
4240  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4241  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4242  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4243  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4244  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4245  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4246  row4l = veorq_u64(row4l, row1l);
4247  row4h = veorq_u64(row4h, row1h);
4248  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4249  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4250  row3l = vaddq_u64(row3l, row4l);
4251  row3h = vaddq_u64(row3h, row4h);
4252  row2l = veorq_u64(row2l, row3l);
4253  row2h = veorq_u64(row2h, row3h);
4254  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4255  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4256 
4257  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4258  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4259  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4260  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4261  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4262  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4263  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4264  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4265  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4266 
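// Round 4 of 12 (sigma row 3)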
4267  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4268  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4269  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4270  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4271  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4272  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4273  row4l = veorq_u64(row4l, row1l);
4274  row4h = veorq_u64(row4h, row1h);
4275  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4276  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4277  row3l = vaddq_u64(row3l, row4l);
4278  row3h = vaddq_u64(row3h, row4h);
4279  row2l = veorq_u64(row2l, row3l);
4280  row2h = veorq_u64(row2h, row3h);
4281  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4282  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4283 
4284  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4285  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4286  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4287  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4288  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4289  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4290  row4l = veorq_u64(row4l, row1l);
4291  row4h = veorq_u64(row4h, row1h);
4292  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4293  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4294  row3l = vaddq_u64(row3l, row4l);
4295  row3h = vaddq_u64(row3h, row4h);
4296  row2l = veorq_u64(row2l, row3l);
4297  row2h = veorq_u64(row2h, row3h);
4298  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4299  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4300 
4301  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4302  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4303  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4304  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4305  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4306  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4307  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4308  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4309  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4310 
4311  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4312  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4313  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4314  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4315  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4316  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4317  row4l = veorq_u64(row4l, row1l);
4318  row4h = veorq_u64(row4h, row1h);
4319  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4320  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4321  row3l = vaddq_u64(row3l, row4l);
4322  row3h = vaddq_u64(row3h, row4h);
4323  row2l = veorq_u64(row2l, row3l);
4324  row2h = veorq_u64(row2h, row3h);
4325  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4326  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4327 
4328  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4329  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4330  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4331  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4332  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4333  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4334  row4l = veorq_u64(row4l, row1l);
4335  row4h = veorq_u64(row4h, row1h);
4336  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4337  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4338  row3l = vaddq_u64(row3l, row4l);
4339  row3h = vaddq_u64(row3h, row4h);
4340  row2l = veorq_u64(row2l, row3l);
4341  row2h = veorq_u64(row2h, row3h);
4342  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4343  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4344 
4345  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4346  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4347  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4348  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4349  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4350  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4351  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4352  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4353  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4354 
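// Round 5 of 12 (sigma row 4)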
4355  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4356  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4357  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4358  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4359  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4360  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4361  row4l = veorq_u64(row4l, row1l);
4362  row4h = veorq_u64(row4h, row1h);
4363  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4364  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4365  row3l = vaddq_u64(row3l, row4l);
4366  row3h = vaddq_u64(row3h, row4h);
4367  row2l = veorq_u64(row2l, row3l);
4368  row2h = veorq_u64(row2h, row3h);
4369  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4370  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4371 
4372  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4373  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4374  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4375  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4376  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4377  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4378  row4l = veorq_u64(row4l, row1l);
4379  row4h = veorq_u64(row4h, row1h);
4380  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4381  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4382  row3l = vaddq_u64(row3l, row4l);
4383  row3h = vaddq_u64(row3h, row4h);
4384  row2l = veorq_u64(row2l, row3l);
4385  row2h = veorq_u64(row2h, row3h);
4386  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4387  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4388 
4389  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4390  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4391  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4392  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4393  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4394  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4395  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4396  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4397  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4398 
4399  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4400  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4401  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4402  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4403  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4404  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4405  row4l = veorq_u64(row4l, row1l);
4406  row4h = veorq_u64(row4h, row1h);
4407  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4408  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4409  row3l = vaddq_u64(row3l, row4l);
4410  row3h = vaddq_u64(row3h, row4h);
4411  row2l = veorq_u64(row2l, row3l);
4412  row2h = veorq_u64(row2h, row3h);
4413  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4414  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4415 
4416  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4417  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4418  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4419  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4420  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4421  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4422  row4l = veorq_u64(row4l, row1l);
4423  row4h = veorq_u64(row4h, row1h);
4424  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4425  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4426  row3l = vaddq_u64(row3l, row4l);
4427  row3h = vaddq_u64(row3h, row4h);
4428  row2l = veorq_u64(row2l, row3l);
4429  row2h = veorq_u64(row2h, row3h);
4430  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4431  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4432 
4433  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4434  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4435  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4436  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4437  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4438  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4439  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4440  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4441  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4442 
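// Round 6 of 12 (sigma row 5)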
4443  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4444  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4445  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4446  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4447  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4448  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4449  row4l = veorq_u64(row4l, row1l);
4450  row4h = veorq_u64(row4h, row1h);
4451  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4452  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4453  row3l = vaddq_u64(row3l, row4l);
4454  row3h = vaddq_u64(row3h, row4h);
4455  row2l = veorq_u64(row2l, row3l);
4456  row2h = veorq_u64(row2h, row3h);
4457  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4458  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4459 
4460  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4461  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4462  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4463  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4464  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4465  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4466  row4l = veorq_u64(row4l, row1l);
4467  row4h = veorq_u64(row4h, row1h);
4468  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4469  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4470  row3l = vaddq_u64(row3l, row4l);
4471  row3h = vaddq_u64(row3h, row4h);
4472  row2l = veorq_u64(row2l, row3l);
4473  row2h = veorq_u64(row2h, row3h);
4474  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4475  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4476 
4477  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4478  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4479  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4480  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4481  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4482  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4483  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4484  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4485  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4486 
4487  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_L64);
4488  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4489  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4490  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4491  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4492  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4493  row4l = veorq_u64(row4l, row1l);
4494  row4h = veorq_u64(row4h, row1h);
4495  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4496  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4497  row3l = vaddq_u64(row3l, row4l);
4498  row3h = vaddq_u64(row3h, row4h);
4499  row2l = veorq_u64(row2l, row3l);
4500  row2h = veorq_u64(row2h, row3h);
4501  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4502  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4503 
4504  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4505  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4506  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4507  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4508  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4509  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4510  row4l = veorq_u64(row4l, row1l);
4511  row4h = veorq_u64(row4h, row1h);
4512  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4513  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4514  row3l = vaddq_u64(row3l, row4l);
4515  row3h = vaddq_u64(row3h, row4h);
4516  row2l = veorq_u64(row2l, row3l);
4517  row2h = veorq_u64(row2h, row3h);
4518  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4519  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4520 
4521  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4522  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4523  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4524  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4525  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4526  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4527  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4528  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4529  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4530 
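// Round 7 of 12 (sigma row 6)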
4531  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4532  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4533  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4534  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4535  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4536  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4537  row4l = veorq_u64(row4l, row1l);
4538  row4h = veorq_u64(row4h, row1h);
4539  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4540  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4541  row3l = vaddq_u64(row3l, row4l);
4542  row3h = vaddq_u64(row3h, row4h);
4543  row2l = veorq_u64(row2l, row3l);
4544  row2h = veorq_u64(row2h, row3h);
4545  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4546  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4547 
4548  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4549  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4550  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4551  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4552  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4553  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4554  row4l = veorq_u64(row4l, row1l);
4555  row4h = veorq_u64(row4h, row1h);
4556  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4557  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4558  row3l = vaddq_u64(row3l, row4l);
4559  row3h = vaddq_u64(row3h, row4h);
4560  row2l = veorq_u64(row2l, row3l);
4561  row2h = veorq_u64(row2h, row3h);
4562  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4563  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4564 
4565  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4566  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4567  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4568  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4569  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4570  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4571  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4572  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4573  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4574 
4575  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4576  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4577  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4578  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4579  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4580  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4581  row4l = veorq_u64(row4l, row1l);
4582  row4h = veorq_u64(row4h, row1h);
4583  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4584  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4585  row3l = vaddq_u64(row3l, row4l);
4586  row3h = vaddq_u64(row3h, row4h);
4587  row2l = veorq_u64(row2l, row3l);
4588  row2h = veorq_u64(row2h, row3h);
4589  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4590  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4591 
4592  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4593  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4594  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4595  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4596  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4597  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4598  row4l = veorq_u64(row4l, row1l);
4599  row4h = veorq_u64(row4h, row1h);
4600  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4601  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4602  row3l = vaddq_u64(row3l, row4l);
4603  row3h = vaddq_u64(row3h, row4h);
4604  row2l = veorq_u64(row2l, row3l);
4605  row2h = veorq_u64(row2h, row3h);
4606  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4607  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4608 
4609  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4610  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4611  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4612  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4613  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4614  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4615  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4616  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4617  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4618 
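// Round 8 of 12 (sigma row 7)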
4619  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4620  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4621  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4622  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4623  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4624  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4625  row4l = veorq_u64(row4l, row1l);
4626  row4h = veorq_u64(row4h, row1h);
4627  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4628  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4629  row3l = vaddq_u64(row3l, row4l);
4630  row3h = vaddq_u64(row3h, row4h);
4631  row2l = veorq_u64(row2l, row3l);
4632  row2h = veorq_u64(row2h, row3h);
4633  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4634  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4635 
4636  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4637  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4638  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4639  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4640  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4641  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4642  row4l = veorq_u64(row4l, row1l);
4643  row4h = veorq_u64(row4h, row1h);
4644  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4645  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4646  row3l = vaddq_u64(row3l, row4l);
4647  row3h = vaddq_u64(row3h, row4h);
4648  row2l = veorq_u64(row2l, row3l);
4649  row2h = veorq_u64(row2h, row3h);
4650  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4651  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4652 
4653  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4654  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4655  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4656  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4657  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4658  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4659  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4660  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4661  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4662 
4663  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4664  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4665  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4666  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_H64);
4667  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4668  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4669  row4l = veorq_u64(row4l, row1l);
4670  row4h = veorq_u64(row4h, row1h);
4671  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4672  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4673  row3l = vaddq_u64(row3l, row4l);
4674  row3h = vaddq_u64(row3h, row4h);
4675  row2l = veorq_u64(row2l, row3l);
4676  row2h = veorq_u64(row2h, row3h);
4677  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4678  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4679 
4680  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4681  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4682  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4683  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4684  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4685  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4686  row4l = veorq_u64(row4l, row1l);
4687  row4h = veorq_u64(row4h, row1h);
4688  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4689  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4690  row3l = vaddq_u64(row3l, row4l);
4691  row3h = vaddq_u64(row3h, row4h);
4692  row2l = veorq_u64(row2l, row3l);
4693  row2h = veorq_u64(row2h, row3h);
4694  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4695  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4696 
4697  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4698  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4699  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4700  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4701  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4702  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4703  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4704  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4705  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4706 
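// Round 9 of 12 (sigma row 8)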
4707  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4708  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4709  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4710  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4711  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4712  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4713  row4l = veorq_u64(row4l, row1l);
4714  row4h = veorq_u64(row4h, row1h);
4715  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4716  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4717  row3l = vaddq_u64(row3l, row4l);
4718  row3h = vaddq_u64(row3h, row4h);
4719  row2l = veorq_u64(row2l, row3l);
4720  row2h = veorq_u64(row2h, row3h);
4721  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4722  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4723 
4724  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4725  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4726  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4727  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4728  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4729  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4730  row4l = veorq_u64(row4l, row1l);
4731  row4h = veorq_u64(row4h, row1h);
4732  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4733  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4734  row3l = vaddq_u64(row3l, row4l);
4735  row3h = vaddq_u64(row3h, row4h);
4736  row2l = veorq_u64(row2l, row3l);
4737  row2h = veorq_u64(row2h, row3h);
4738  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4739  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4740 
4741  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4742  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4743  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4744  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4745  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4746  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4747  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4748  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4749  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4750 
4751  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4752  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_H64);
4753  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4754  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4755  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4756  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4757  row4l = veorq_u64(row4l, row1l);
4758  row4h = veorq_u64(row4h, row1h);
4759  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4760  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4761  row3l = vaddq_u64(row3l, row4l);
4762  row3h = vaddq_u64(row3h, row4h);
4763  row2l = veorq_u64(row2l, row3l);
4764  row2h = veorq_u64(row2h, row3h);
4765  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4766  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4767 
4768  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4769  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4770  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4771  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4772  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4773  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4774  row4l = veorq_u64(row4l, row1l);
4775  row4h = veorq_u64(row4h, row1h);
4776  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4777  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4778  row3l = vaddq_u64(row3l, row4l);
4779  row3h = vaddq_u64(row3h, row4h);
4780  row2l = veorq_u64(row2l, row3l);
4781  row2h = veorq_u64(row2h, row3h);
4782  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4783  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4784 
4785  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4786  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4787  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4788  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4789  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4790  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4791  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4792  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4793  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4794 
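// Round 10 of 12 (sigma row 9)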
4795  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4796  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4797  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4798  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4799  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4800  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4801  row4l = veorq_u64(row4l, row1l);
4802  row4h = veorq_u64(row4h, row1h);
4803  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4804  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4805  row3l = vaddq_u64(row3l, row4l);
4806  row3h = vaddq_u64(row3h, row4h);
4807  row2l = veorq_u64(row2l, row3l);
4808  row2h = veorq_u64(row2h, row3h);
4809  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4810  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4811 
4812  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4813  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4814  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4815  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4816  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4817  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4818  row4l = veorq_u64(row4l, row1l);
4819  row4h = veorq_u64(row4h, row1h);
4820  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4821  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4822  row3l = vaddq_u64(row3l, row4l);
4823  row3h = vaddq_u64(row3h, row4h);
4824  row2l = veorq_u64(row2l, row3l);
4825  row2h = veorq_u64(row2h, row3h);
4826  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4827  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4828 
4829  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4830  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4831  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4832  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4833  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4834  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4835  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4836  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4837  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4838 
4839  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4840  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4841  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4842  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4843  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4844  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4845  row4l = veorq_u64(row4l, row1l);
4846  row4h = veorq_u64(row4h, row1h);
4847  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4848  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4849  row3l = vaddq_u64(row3l, row4l);
4850  row3h = vaddq_u64(row3h, row4h);
4851  row2l = veorq_u64(row2l, row3l);
4852  row2h = veorq_u64(row2h, row3h);
4853  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4854  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4855 
4856  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4857  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4858  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4859  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4860  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4861  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4862  row4l = veorq_u64(row4l, row1l);
4863  row4h = veorq_u64(row4h, row1h);
4864  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4865  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4866  row3l = vaddq_u64(row3l, row4l);
4867  row3h = vaddq_u64(row3h, row4h);
4868  row2l = veorq_u64(row2l, row3l);
4869  row2h = veorq_u64(row2h, row3h);
4870  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4871  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4872 
4873  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4874  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4875  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4876  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4877  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4878  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4879  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4880  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4881  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4882 
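// Round 11 of 12 (sigma row 0 again; BLAKE2b repeats schedule rows 0 and 1
// for rounds 11 and 12)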
4883  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4884  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4885  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4886  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4887  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4888  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4889  row4l = veorq_u64(row4l, row1l);
4890  row4h = veorq_u64(row4h, row1h);
4891  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4892  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4893  row3l = vaddq_u64(row3l, row4l);
4894  row3h = vaddq_u64(row3h, row4h);
4895  row2l = veorq_u64(row2l, row3l);
4896  row2h = veorq_u64(row2h, row3h);
4897  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4898  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4899 
4900  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4901  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4902  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4903  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4904  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4905  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4906  row4l = veorq_u64(row4l, row1l);
4907  row4h = veorq_u64(row4h, row1h);
4908  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4909  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4910  row3l = vaddq_u64(row3l, row4l);
4911  row3h = vaddq_u64(row3h, row4h);
4912  row2l = veorq_u64(row2l, row3l);
4913  row2h = veorq_u64(row2h, row3h);
4914  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4915  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4916 
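  // Diagonalize: swap the halves of row 3 and rotate row 2 left and row 4
  // right by one 64-bit lane so G next operates on the diagonals.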
4917  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4918  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4919  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4920  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4921  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4922  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4923  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4924  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4925  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4926 
4927  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4928  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4929  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4930  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4931  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4932  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4933  row4l = veorq_u64(row4l, row1l);
4934  row4h = veorq_u64(row4h, row1h);
4935  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4936  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4937  row3l = vaddq_u64(row3l, row4l);
4938  row3h = vaddq_u64(row3h, row4h);
4939  row2l = veorq_u64(row2l, row3l);
4940  row2h = veorq_u64(row2h, row3h);
4941  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4942  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4943 
4944  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4945  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4946  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4947  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4948  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4949  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4950  row4l = veorq_u64(row4l, row1l);
4951  row4h = veorq_u64(row4h, row1h);
4952  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4953  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4954  row3l = vaddq_u64(row3l, row4l);
4955  row3h = vaddq_u64(row3h, row4h);
4956  row2l = veorq_u64(row2l, row3l);
4957  row2h = veorq_u64(row2h, row3h);
4958  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4959  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4960 
4961  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4962  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4963  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4964  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4965  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4966  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4967  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4968  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4969  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4970 
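  // Last round before finalization: the same column/diagonal pattern with the
  // final sigma permutation selecting the message words.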
4971  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4972  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4973  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4974  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4975  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4976  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4977  row4l = veorq_u64(row4l, row1l);
4978  row4h = veorq_u64(row4h, row1h);
4979  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4980  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4981  row3l = vaddq_u64(row3l, row4l);
4982  row3h = vaddq_u64(row3h, row4h);
4983  row2l = veorq_u64(row2l, row3l);
4984  row2h = veorq_u64(row2h, row3h);
4985  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4986  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4987 
4988  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4989  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4990  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4991  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4992  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4993  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4994  row4l = veorq_u64(row4l, row1l);
4995  row4h = veorq_u64(row4h, row1h);
4996  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4997  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4998  row3l = vaddq_u64(row3l, row4l);
4999  row3h = vaddq_u64(row3h, row4h);
5000  row2l = veorq_u64(row2l, row3l);
5001  row2h = veorq_u64(row2h, row3h);
5002  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5003  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5004 
5005  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
5006  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
5007  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
5008  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
5009  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
5010  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
5011  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
5012  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
5013  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
5014 
5015  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
5016  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
5017  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
5018  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
5019  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5020  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5021  row4l = veorq_u64(row4l, row1l);
5022  row4h = veorq_u64(row4h, row1h);
5023  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
5024  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
5025  row3l = vaddq_u64(row3l, row4l);
5026  row3h = vaddq_u64(row3h, row4h);
5027  row2l = veorq_u64(row2l, row3l);
5028  row2h = veorq_u64(row2h, row3h);
5029  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
5030  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
5031 
5032  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
5033  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
5034  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
5035  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
5036  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5037  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5038  row4l = veorq_u64(row4l, row1l);
5039  row4h = veorq_u64(row4h, row1h);
5040  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
5041  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
5042  row3l = vaddq_u64(row3l, row4l);
5043  row3h = vaddq_u64(row3h, row4h);
5044  row2l = veorq_u64(row2l, row3l);
5045  row2h = veorq_u64(row2h, row3h);
5046  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5047  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5048 
5049  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
5050  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
5051  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
5052  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
5053  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
5054  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
5055  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
5056  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
5057  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
5058 
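  // Feed-forward: XOR row1^row3 and row2^row4 into the chaining value
  // state.h[0..7], completing the BLAKE2b compression of this block.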
5059  row1l = veorq_u64(row3l, row1l);
5060  row1h = veorq_u64(row3h, row1h);
5061  vst1q_u64((uint64_t*)&state.h[0], veorq_u64(vld1q_u64((const uint64_t*)&state.h[0]), row1l));
5062  vst1q_u64((uint64_t*)&state.h[2], veorq_u64(vld1q_u64((const uint64_t*)&state.h[2]), row1h));
5063 
5064  row2l = veorq_u64(row4l, row2l);
5065  row2h = veorq_u64(row4h, row2h);
5066  vst1q_u64((uint64_t*)&state.h[4], veorq_u64(vld1q_u64((const uint64_t*)&state.h[4]), row2l));
5067  vst1q_u64((uint64_t*)&state.h[6], veorq_u64(vld1q_u64((const uint64_t*)&state.h[6]), row2h));
5068 }
5069 #endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
5070 
5071 template class BLAKE2_Base<word32, false>;
5072 template class BLAKE2_Base<word64, true>;
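For reference, a minimal scalar sketch of the BLAKE2b G function that the NEON
code above evaluates four columns (or diagonals) at a time. The standalone
helpers rotr64 and G below are illustrative assumptions for exposition, not
part of this file; word64 is the library's 64-bit type from config.h.

 // Hypothetical scalar reference: mirrors the vector add/XOR/rotate sequence
 // above, with the BLAKE2b rotation counts 32, 24, 16 and 63.
 inline word64 rotr64(word64 x, unsigned int c)
 {
     return (x >> c) | (x << (64 - c));  // c is always in 1..63 here
 }

 inline void G(word64& a, word64& b, word64& c, word64& d, word64 m0, word64 m1)
 {
     a = a + b + m0; d = rotr64(d ^ a, 32);
     c = c + d;      b = rotr64(b ^ c, 24);
     a = a + b + m1; d = rotr64(d ^ a, 16);
     c = c + d;      b = rotr64(b ^ c, 63);
 }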
5073 