// NOTE(review): this file is a lossily re-flowed extraction -- the decimal
// prefixes below are the file's original line numbers fused into the code
// text, and many intervening lines were dropped. Code left byte-identical.
//
// Workaround guards:
//  - orig. 22-23: disable the SSE4 path on old Apple/LLVM Clang
//    (<= 6.0 / <= 3.5), which miscompile or lack the needed intrinsics.
//  - orig. 28-31: fragment of a _mm_set_epi64x shim for Sun Studio
//    5.10-5.12, MSVC < 2010 and 32-bit MSVC builds; it packs {b,a} into a
//    word64[2] and reloads it as an __m128i (the load itself was dropped
//    by the extraction -- presumably _mm_loadu_si128; TODO confirm).
22 #if (defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION <= 60000)) || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION <= 30500)) 23 # undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 28 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600)) 31 const word64 t[2] = {
b,a}; __m128i r;
// Feature-test guards (orig. 42-59) followed by the BLAKE2_IV<T_64bit>
// template (orig. 63+). The constants visible below (orig. 77-78) are the
// eight 32-bit initialization-vector words for the T_64bit == false
// (BLAKE2s) specialization; they match the SHA-256 IV as specified by
// RFC 7693. The surrounding struct/array declaration lines were dropped
// by the extraction.
42 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 44 # if (__SUNPRO_CC != 0x5120) 49 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 54 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 59 #ifndef CRYPTOPP_DOXYGEN_PROCESSING 63 template<
bool T_64bit>
// BLAKE2s IV words 0-3 (split mid-token by the extraction):
77 0x6A09E667
UL, 0xBB67AE85
UL, 0x3C6EF372
UL, 0xA54FF53A
UL,
// BLAKE2s IV words 4-7:
78 0x510E527F
UL, 0x9B05688C
UL, 0x1F83D9AB
UL, 0x5BE0CD19UL
// IV accessor macros (orig. 81, 99) and the BLAKE2_Sigma<T_64bit> message
// permutation tables. Each row is a permutation of 0..15 selecting which
// message word feeds each G-function call of that round (RFC 7693,
// table SIGMA).
81 #define BLAKE2S_IV(n) BLAKE2_IV<false>::iv[n] 99 #define BLAKE2B_IV(n) BLAKE2_IV<true>::iv[n] 103 template<
bool T_64bit>
// Sigma for T_64bit == false (BLAKE2s): 10 rows, one per round.
115 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
116 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
117 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
118 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
119 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
120 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
121 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
122 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
123 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
124 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
// Sigma for T_64bit == true (BLAKE2b): 12 rows; rows 10-11 repeat
// rows 0-1, as visible below.
137 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
138 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
139 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
140 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
141 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
142 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
143 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
144 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
145 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
146 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
147 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
148 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
// Runtime/compile-time dispatch for the compression function pointers
// (function signatures dropped by the extraction -- presumably the
// InitializeCompress64Fn / InitializeCompress32Fn helpers; TODO confirm).
// Preference order visible below: SSE4 > SSE2 (except Sun Studio 5.12,
// which is excluded at orig. 162) > NEON > portable C++ fallback.
// 64-bit (BLAKE2b) selection:
156 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 158 return &BLAKE2_SSE4_Compress64;
161 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 162 # if (__SUNPRO_CC != 0x5120) 164 return &BLAKE2_SSE2_Compress64;
168 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 170 return &BLAKE2_NEON_Compress64;
173 return &BLAKE2_CXX_Compress64;
// 32-bit (BLAKE2s) selection:
178 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 180 return &BLAKE2_SSE4_Compress32;
183 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 185 return &BLAKE2_SSE2_Compress32;
188 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 190 return &BLAKE2_NEON_Compress32;
193 return &BLAKE2_CXX_Compress32;
// BLAKE2s (32-bit) parameter-block constructor body (the constructor name
// and leading parameters at orig. 197-198 were dropped by the extraction).
// Fills the RFC 7693 parameter block: digest length, key length, fanout
// and depth fields elsewhere, then salt and personalization, each copied
// in and zero-padded to its fixed field width.
196 #endif // CRYPTOPP_DOXYGEN_PROCESSING 199 const byte* saltStr,
size_t saltLen,
200 const byte* personalizationStr,
size_t personalizationLen)
// Narrowing casts: the parameter block stores these as single bytes.
203 digestLength = (
byte)digestLen;
204 keyLength = (
byte)keyLen;
206 nodeDepth = innerLength = 0;
// Sequential (non-tree) hashing: clear the tree fields.
208 memset(leafLength, 0x00,
COUNTOF(leafLength));
209 memset(nodeOffset, 0x00,
COUNTOF(nodeOffset));
// Salt: copy caller's bytes, zero-pad the remainder of the field.
// NOTE(review): the memcpy_s for salt (orig. 212-213) appears to have
// been dropped by the extraction -- verify against upstream.
211 if (saltStr && saltLen)
214 const size_t rem =
COUNTOF(salt) - saltLen;
215 const size_t off =
COUNTOF(salt) - rem;
217 memset(salt+off, 0x00, rem);
// No salt supplied: field is all zeros.
221 memset(salt, 0x00,
COUNTOF(salt));
// Personalization: same copy-then-pad pattern as salt.
224 if (personalizationStr && personalizationLen)
226 memcpy_s(personalization,
COUNTOF(personalization), personalizationStr, personalizationLen);
227 const size_t rem =
COUNTOF(personalization) - personalizationLen;
228 const size_t off =
COUNTOF(personalization) - rem;
230 memset(personalization+off, 0x00, rem);
234 memset(personalization, 0x00,
COUNTOF(personalization));
// BLAKE2b (64-bit) parameter-block constructor body -- mirrors the
// BLAKE2s version above, plus clearing of the reserved-for-future-use
// (rfu) field that only the 64-bit parameter block has (orig. 248).
// Constructor name/leading parameters dropped by the extraction.
239 const byte* saltStr,
size_t saltLen,
240 const byte* personalizationStr,
size_t personalizationLen)
243 digestLength = (
byte)digestLen;
244 keyLength = (
byte)keyLen;
246 nodeDepth = innerLength = 0;
// Reserved field must be zero per the BLAKE2b parameter-block layout.
248 memset(rfu, 0x00,
COUNTOF(rfu));
249 memset(leafLength, 0x00,
COUNTOF(leafLength));
250 memset(nodeOffset, 0x00,
COUNTOF(nodeOffset));
// Salt: copy-then-pad. NOTE(review): the memcpy_s for salt
// (orig. 253-254) appears dropped by the extraction -- verify upstream.
252 if (saltStr && saltLen)
255 const size_t rem =
COUNTOF(salt) - saltLen;
256 const size_t off =
COUNTOF(salt) - rem;
258 memset(salt+off, 0x00, rem);
262 memset(salt, 0x00,
COUNTOF(salt));
// Personalization: copy caller bytes, zero the tail of the field.
265 if (personalizationStr && personalizationLen)
267 memcpy_s(personalization,
COUNTOF(personalization), personalizationStr, personalizationLen);
268 const size_t rem =
COUNTOF(personalization) - personalizationLen;
269 const size_t off =
COUNTOF(personalization) - rem;
271 memset(personalization+off, 0x00, rem);
275 memset(personalization, 0x00,
COUNTOF(personalization));
// Fragment of BLAKE2_Base<W, T_64bit>'s key handling (orig. 279+,
// signature dropped): the key is copied into a BLOCKSIZE scratch buffer
// and the unused tail zero-padded, so keyed hashing can process the key
// as a full first block per RFC 7693.
279 template <
class W,
bool T_64bit>
285 memcpy_s(temp, BLOCKSIZE, key, length);
287 const size_t rem = BLOCKSIZE - length;
289 memset(temp+length, 0x00, rem);
// Body fragments of UncheckedSetKey (signature dropped by extraction):
// builds the parameter block from NameValuePairs -- digest size, key
// length, fanout/depth, then optional Salt and Personalization values.
// The __COVERITY__ memset (orig. 305) silences an uninitialized-data
// finding; 32 vs 16 matches the BLAKE2b vs BLAKE2s word counts.
298 #if defined(__COVERITY__) 305 memset(m_block.data(), 0x00, T_64bit ? 32 : 16);
308 block.keyLength = (
byte)length;
309 block.digestLength = (
byte)params.GetIntValueWithDefault(Name::DigestSize(), DIGESTSIZE);
// Sequential mode: fanout == depth == 1 per RFC 7693.
310 block.fanout = block.depth = 1;
// Optional salt from the params (t is presumably a ConstByteArrayParameter
// declared on a dropped line -- TODO confirm).
313 if (params.GetValue(Name::Salt(), t) && t.
begin() && t.
size())
// NOTE(review): the memcpy_s and the `rem` computation for the salt
// (orig. 315-316) appear dropped by the extraction.
317 const size_t off =
COUNTOF(block.salt) - rem;
319 memset(block.salt+off, 0x00, rem);
323 memset(block.salt, 0x00,
COUNTOF(block.salt));
// Optional personalization, same copy-then-pad pattern.
326 if (params.GetValue(Name::Personalization(), t) && t.
begin() && t.
size())
329 const size_t rem =
COUNTOF(block.personalization) - t.
size();
330 const size_t off =
COUNTOF(block.personalization) - rem;
332 memset(block.personalization+off, 0x00, rem);
336 memset(block.personalization, 0x00,
COUNTOF(block.personalization));
// Three template member definitions of BLAKE2_Base<W, T_64bit>
// (orig. 340, 347, 356); the first two bodies were dropped entirely by
// the extraction. The third (orig. 356+) is the salted/personalized
// constructor: it forwards everything to UncheckedSetKey via a
// MakeParameters chain (DigestSize, TreeMode, and -- on dropped lines --
// presumably Salt and Personalization; TODO confirm upstream).
340 template <
class W,
bool T_64bit>
347 template <
class W,
bool T_64bit>
356 template <
class W,
bool T_64bit>
358 const byte* personalization,
size_t personalizationLength,
bool treeMode,
unsigned int digestSize)
366 UncheckedSetKey(key, static_cast<unsigned int>(keyLength),
MakeParameters(Name::DigestSize(),(
int)digestSize)(Name::TreeMode(),treeMode,
false)
// Restart() overloads of BLAKE2_Base (orig. 371 and 378; signatures
// dropped). The parameterless form (orig. 374) resets with a zero
// counter; the counter-taking form clears the state then loads the
// caller-supplied block counter t[0]/t[1] (used for tree hashing /
// resumed hashing).
371 template <
class W,
bool T_64bit>
374 static const W zero[2] = {0,0};
378 template <
class W,
bool T_64bit>
// Clear counters t[], finalization flags f[], and buffered length.
391 state.
t[0] = state.
t[1] = 0, state.
f[0] = state.
f[1] = 0, state.
length = 0;
// If a counter was supplied (guard on a dropped line), adopt it.
395 state.
t[0] = counter[0];
396 state.
t[1] = counter[1];
// Update() of BLAKE2_Base (orig. 409+; signature and the Compress calls
// on dropped lines). Standard buffered-update shape: top up the partial
// block and compress, compress full blocks directly from the input, then
// stash the remaining tail in state.m_buffer (the copy itself is on a
// dropped line) and account for it in state.length.
409 template <
class W,
bool T_64bit>
// If the buffered bytes plus new input exceed one block, fill and flush.
413 if (state.
length + length > BLOCKSIZE)
416 const size_t fill = BLOCKSIZE - state.
length;
423 length -= fill, input += fill;
// Compress whole blocks straight from the caller's buffer.
// (Strictly '>' rather than '>=': the final full block is deferred so
// the last-block flag can be set at finalization.)
426 while (length > BLOCKSIZE)
430 length -= BLOCKSIZE, input += BLOCKSIZE;
// Buffer the tail for the next Update/Final.
439 state.
length +=
static_cast<unsigned int>(length);
// Finalization fragment of BLAKE2_Base (orig. 443+; signature dropped).
// Sets the last-block flag f[0] = ~0, and -- on a separate condition
// whose guard was dropped, presumably the last-node flag for tree mode
// (TODO confirm) -- f[1] = ~0, before the final Compress.
443 template <
class W,
bool T_64bit>
450 state.
f[0] =
static_cast<W
>(-1);
454 state.
f[1] =
static_cast<W
>(-1);
// IncrementCounter of BLAKE2_Base (orig. 468+; signature dropped):
// adds `count` bytes to the 2-word message counter. t[1] absorbs the
// carry -- `!!(t[0] < count)` is 1 exactly when the t[0] addition
// wrapped around.
468 template <
class W,
bool T_64bit>
472 state.
t[0] +=
static_cast<W
>(
count);
473 state.
t[1] += !!(state.
t[0] <
count);
// Portable C++ compression: the BLAKE2_G quarter-round and BLAKE2_ROUND
// macros plus body fragments of BLAKE2_CXX_Compress64 / Compress32
// (function signatures, the v[] setup, and the ROUND invocations sit on
// dropped lines). The word64 G uses the BLAKE2b rotations 32/24/16/63,
// the word32 G the BLAKE2s rotations 16/12/8/7 (RFC 7693). get1/get2 are
// the little-endian block getters; the final loop feeds v back into
// state.h (h[i] ^= v[i] ^ v[i+8] on a dropped line -- TODO confirm).
// 64-bit (BLAKE2b) macros and message/state loads:
497 #define BLAKE2_G(r,i,a,b,c,d) \ 499 a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+0]]; \ 500 d = rotrVariable<word64>(d ^ a, 32); \ 502 b = rotrVariable<word64>(b ^ c, 24); \ 503 a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+1]]; \ 504 d = rotrVariable<word64>(d ^ a, 16); \ 506 b = rotrVariable<word64>(b ^ c, 63); \ 509 #define BLAKE2_ROUND(r) \ 511 BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ 512 BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ 513 BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \ 514 BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \ 515 BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \ 516 BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \ 517 BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ 518 BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ 524 get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
527 get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
551 for(
unsigned int i = 0; i < 8; ++i)
// 32-bit (BLAKE2s) macros and message/state loads:
560 #define BLAKE2_G(r,i,a,b,c,d) \ 562 a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+0]]; \ 563 d = rotrVariable<word32>(d ^ a, 16); \ 565 b = rotrVariable<word32>(b ^ c, 12); \ 566 a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+1]]; \ 567 d = rotrVariable<word32>(d ^ a, 8); \ 569 b = rotrVariable<word32>(b ^ c, 7); \ 572 #define BLAKE2_ROUND(r) \ 574 BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ 575 BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ 576 BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \ 577 BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \ 578 BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \ 579 BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \ 580 BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ 581 BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ 587 get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
590 get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
612 for(
unsigned int i = 0; i < 8; ++i)
// BLAKE2s compression function, SSE2 path. The function signature
// (orig. 617-618) and several declarations (including __m128i ff0, ff1
// around orig. 625-626, and the row3/row4 initial loads from the IV and
// t[]/f[] counters around orig. 629) were dropped by the extraction --
// presumably BLAKE2_SSE2_Compress32(const byte*, BLAKE2_State<word32,
// false>&); TODO confirm against upstream.
// Shape: load the 16 message words, run the 10 BLAKE2s rounds on the
// row1..row4 state vectors (SSE2 has no rotate instruction, so each
// rotr is emulated as srli^slli), diagonalize/undiagonalize between
// column and diagonal steps via _mm_shuffle_epi32, then feed forward
// into state.h at the end. The _mm_set_epi32 operand orders match the
// BLAKE2_Sigma<false> rows above. Code left byte-identical, including
// the fused original line numbers and mid-token splits.
616 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 619 word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
621 get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
623 __m128i row1,row2,row3,row4;
624 __m128i buf1,buf2,buf3,buf4;
// Load chaining value h[0..7]; ff0/ff1 keep copies for the feed-forward.
627 row1 = ff0 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[0]));
628 row2 = ff1 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[4]));
// Round 0 (sigma row 0):
631 buf1 = _mm_set_epi32(m6,m4,m2,m0);
632 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
633 row4 = _mm_xor_si128(row4,row1);
634 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
635 row3 = _mm_add_epi32(row3,row4);
636 row2 = _mm_xor_si128(row2,row3);
637 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
639 buf2 = _mm_set_epi32(m7,m5,m3,m1);
640 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
641 row4 = _mm_xor_si128(row4,row1);
642 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
643 row3 = _mm_add_epi32(row3,row4);
644 row2 = _mm_xor_si128(row2,row3);
645 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
647 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
648 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
649 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
651 buf3 = _mm_set_epi32(m14,m12,m10,m8);
652 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
653 row4 = _mm_xor_si128(row4,row1);
654 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
655 row3 = _mm_add_epi32(row3,row4);
656 row2 = _mm_xor_si128(row2,row3);
657 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
659 buf4 = _mm_set_epi32(m15,m13,m11,m9);
660 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
661 row4 = _mm_xor_si128(row4,row1);
662 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
663 row3 = _mm_add_epi32(row3,row4);
664 row2 = _mm_xor_si128(row2,row3);
665 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
667 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
668 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
669 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 1 (sigma row 1):
671 buf1 = _mm_set_epi32(m13,m9,m4,m14);
672 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
673 row4 = _mm_xor_si128(row4,row1);
674 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
675 row3 = _mm_add_epi32(row3,row4);
676 row2 = _mm_xor_si128(row2,row3);
677 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
679 buf2 = _mm_set_epi32(m6,m15,m8,m10);
680 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
681 row4 = _mm_xor_si128(row4,row1);
682 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
683 row3 = _mm_add_epi32(row3,row4);
684 row2 = _mm_xor_si128(row2,row3);
685 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
687 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
688 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
689 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
691 buf3 = _mm_set_epi32(m5,m11,m0,m1);
692 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
693 row4 = _mm_xor_si128(row4,row1);
694 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
695 row3 = _mm_add_epi32(row3,row4);
696 row2 = _mm_xor_si128(row2,row3);
697 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
699 buf4 = _mm_set_epi32(m3,m7,m2,m12);
700 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
701 row4 = _mm_xor_si128(row4,row1);
702 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
703 row3 = _mm_add_epi32(row3,row4);
704 row2 = _mm_xor_si128(row2,row3);
705 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
707 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
708 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
709 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 2 (sigma row 2):
711 buf1 = _mm_set_epi32(m15,m5,m12,m11);
712 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
713 row4 = _mm_xor_si128(row4,row1);
714 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
715 row3 = _mm_add_epi32(row3,row4);
716 row2 = _mm_xor_si128(row2,row3);
717 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
719 buf2 = _mm_set_epi32(m13,m2,m0,m8);
720 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
721 row4 = _mm_xor_si128(row4,row1);
722 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
723 row3 = _mm_add_epi32(row3,row4);
724 row2 = _mm_xor_si128(row2,row3);
725 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
727 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
728 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
729 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
731 buf3 = _mm_set_epi32(m9,m7,m3,m10);
732 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
733 row4 = _mm_xor_si128(row4,row1);
734 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
735 row3 = _mm_add_epi32(row3,row4);
736 row2 = _mm_xor_si128(row2,row3);
737 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
739 buf4 = _mm_set_epi32(m4,m1,m6,m14);
740 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
741 row4 = _mm_xor_si128(row4,row1);
742 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
743 row3 = _mm_add_epi32(row3,row4);
744 row2 = _mm_xor_si128(row2,row3);
745 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
747 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
748 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
749 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 3 (sigma row 3):
751 buf1 = _mm_set_epi32(m11,m13,m3,m7);
752 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
753 row4 = _mm_xor_si128(row4,row1);
754 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
755 row3 = _mm_add_epi32(row3,row4);
756 row2 = _mm_xor_si128(row2,row3);
757 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
759 buf2 = _mm_set_epi32(m14,m12,m1,m9);
760 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
761 row4 = _mm_xor_si128(row4,row1);
762 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
763 row3 = _mm_add_epi32(row3,row4);
764 row2 = _mm_xor_si128(row2,row3);
765 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
767 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
768 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
769 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
771 buf3 = _mm_set_epi32(m15,m4,m5,m2);
772 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
773 row4 = _mm_xor_si128(row4,row1);
774 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
775 row3 = _mm_add_epi32(row3,row4);
776 row2 = _mm_xor_si128(row2,row3);
777 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
779 buf4 = _mm_set_epi32(m8,m0,m10,m6);
780 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
781 row4 = _mm_xor_si128(row4,row1);
782 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
783 row3 = _mm_add_epi32(row3,row4);
784 row2 = _mm_xor_si128(row2,row3);
785 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
787 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
788 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
789 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 4 (sigma row 4):
791 buf1 = _mm_set_epi32(m10,m2,m5,m9);
792 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
793 row4 = _mm_xor_si128(row4,row1);
794 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
795 row3 = _mm_add_epi32(row3,row4);
796 row2 = _mm_xor_si128(row2,row3);
797 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
799 buf2 = _mm_set_epi32(m15,m4,m7,m0);
800 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
801 row4 = _mm_xor_si128(row4,row1);
802 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
803 row3 = _mm_add_epi32(row3,row4);
804 row2 = _mm_xor_si128(row2,row3);
805 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
807 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
808 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
809 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
811 buf3 = _mm_set_epi32(m3,m6,m11,m14);
812 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
813 row4 = _mm_xor_si128(row4,row1);
814 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
815 row3 = _mm_add_epi32(row3,row4);
816 row2 = _mm_xor_si128(row2,row3);
817 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
819 buf4 = _mm_set_epi32(m13,m8,m12,m1);
820 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
821 row4 = _mm_xor_si128(row4,row1);
822 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
823 row3 = _mm_add_epi32(row3,row4);
824 row2 = _mm_xor_si128(row2,row3);
825 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
827 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
828 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
829 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 5 (sigma row 5):
831 buf1 = _mm_set_epi32(m8,m0,m6,m2);
832 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
833 row4 = _mm_xor_si128(row4,row1);
834 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
835 row3 = _mm_add_epi32(row3,row4);
836 row2 = _mm_xor_si128(row2,row3);
837 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
839 buf2 = _mm_set_epi32(m3,m11,m10,m12);
840 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
841 row4 = _mm_xor_si128(row4,row1);
842 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
843 row3 = _mm_add_epi32(row3,row4);
844 row2 = _mm_xor_si128(row2,row3);
845 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
847 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
848 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
849 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
851 buf3 = _mm_set_epi32(m1,m15,m7,m4);
852 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
853 row4 = _mm_xor_si128(row4,row1);
854 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
855 row3 = _mm_add_epi32(row3,row4);
856 row2 = _mm_xor_si128(row2,row3);
857 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
859 buf4 = _mm_set_epi32(m9,m14,m5,m13);
860 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
861 row4 = _mm_xor_si128(row4,row1);
862 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
863 row3 = _mm_add_epi32(row3,row4);
864 row2 = _mm_xor_si128(row2,row3);
865 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
867 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
868 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
869 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 6 (sigma row 6):
871 buf1 = _mm_set_epi32(m4,m14,m1,m12);
872 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
873 row4 = _mm_xor_si128(row4,row1);
874 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
875 row3 = _mm_add_epi32(row3,row4);
876 row2 = _mm_xor_si128(row2,row3);
877 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
879 buf2 = _mm_set_epi32(m10,m13,m15,m5);
880 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
881 row4 = _mm_xor_si128(row4,row1);
882 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
883 row3 = _mm_add_epi32(row3,row4);
884 row2 = _mm_xor_si128(row2,row3);
885 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
887 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
888 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
889 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
891 buf3 = _mm_set_epi32(m8,m9,m6,m0);
892 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
893 row4 = _mm_xor_si128(row4,row1);
894 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
895 row3 = _mm_add_epi32(row3,row4);
896 row2 = _mm_xor_si128(row2,row3);
897 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
899 buf4 = _mm_set_epi32(m11,m2,m3,m7);
900 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
901 row4 = _mm_xor_si128(row4,row1);
902 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
903 row3 = _mm_add_epi32(row3,row4);
904 row2 = _mm_xor_si128(row2,row3);
905 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
907 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
908 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
909 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 7 (sigma row 7):
911 buf1 = _mm_set_epi32(m3,m12,m7,m13);
912 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
913 row4 = _mm_xor_si128(row4,row1);
914 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
915 row3 = _mm_add_epi32(row3,row4);
916 row2 = _mm_xor_si128(row2,row3);
917 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
919 buf2 = _mm_set_epi32(m9,m1,m14,m11);
920 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
921 row4 = _mm_xor_si128(row4,row1);
922 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
923 row3 = _mm_add_epi32(row3,row4);
924 row2 = _mm_xor_si128(row2,row3);
925 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
927 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
928 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
929 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
931 buf3 = _mm_set_epi32(m2,m8,m15,m5);
932 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
933 row4 = _mm_xor_si128(row4,row1);
934 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
935 row3 = _mm_add_epi32(row3,row4);
936 row2 = _mm_xor_si128(row2,row3);
937 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
939 buf4 = _mm_set_epi32(m10,m6,m4,m0);
940 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
941 row4 = _mm_xor_si128(row4,row1);
942 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
943 row3 = _mm_add_epi32(row3,row4);
944 row2 = _mm_xor_si128(row2,row3);
945 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
947 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
948 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
949 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 8 (sigma row 8):
951 buf1 = _mm_set_epi32(m0,m11,m14,m6);
952 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
953 row4 = _mm_xor_si128(row4,row1);
954 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
955 row3 = _mm_add_epi32(row3,row4);
956 row2 = _mm_xor_si128(row2,row3);
957 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
959 buf2 = _mm_set_epi32(m8,m3,m9,m15);
960 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
961 row4 = _mm_xor_si128(row4,row1);
962 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
963 row3 = _mm_add_epi32(row3,row4);
964 row2 = _mm_xor_si128(row2,row3);
965 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
967 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
968 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
969 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
971 buf3 = _mm_set_epi32(m10,m1,m13,m12);
972 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
973 row4 = _mm_xor_si128(row4,row1);
974 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
975 row3 = _mm_add_epi32(row3,row4);
976 row2 = _mm_xor_si128(row2,row3);
977 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
979 buf4 = _mm_set_epi32(m5,m4,m7,m2);
980 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
981 row4 = _mm_xor_si128(row4,row1);
982 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
983 row3 = _mm_add_epi32(row3,row4);
984 row2 = _mm_xor_si128(row2,row3);
985 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
987 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
988 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
989 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 9 (sigma row 9):
991 buf1 = _mm_set_epi32(m1,m7,m8,m10);
992 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
993 row4 = _mm_xor_si128(row4,row1);
994 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
995 row3 = _mm_add_epi32(row3,row4);
996 row2 = _mm_xor_si128(row2,row3);
997 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
999 buf2 = _mm_set_epi32(m5,m6,m4,m2);
1000 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
1001 row4 = _mm_xor_si128(row4,row1);
1002 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1003 row3 = _mm_add_epi32(row3,row4);
1004 row2 = _mm_xor_si128(row2,row3);
1005 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1007 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
1008 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1009 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
1011 buf3 = _mm_set_epi32(m13,m3,m9,m15);
1012 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
1013 row4 = _mm_xor_si128(row4,row1);
1014 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
1015 row3 = _mm_add_epi32(row3,row4);
1016 row2 = _mm_xor_si128(row2,row3);
1017 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
1019 buf4 = _mm_set_epi32(m0,m12,m14,m11);
1020 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
1021 row4 = _mm_xor_si128(row4,row1);
1022 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1023 row3 = _mm_add_epi32(row3,row4);
1024 row2 = _mm_xor_si128(row2,row3);
1025 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1027 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
1028 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1029 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Feed-forward: h[i] = h[i] ^ v[i] ^ v[i+8], vectorized across the rows.
1031 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
1032 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
// BLAKE2b compression function, SSE2 path -- TRUNCATED: this chunk of
// the file ends mid round 2 of 12, and the function signature was
// dropped by the extraction. Annotated only; no reconstruction of the
// missing tail attempted.
// Shape of the visible part: load 16 message words; state lives in
// eight __m128i halves (row1l/h .. row4l/h, two 64-bit lanes each);
// row3/row4 are initialized from BLAKE2B_IV and XORed with the t[]/f[]
// counters; rotations by 32/16/1 are emulated with srli^slli pairs
// (24 likewise here; SSSE3 builds typically use pshufb instead); the
// unpack{hi,lo}_epi64 block at orig. 1085-1089 performs the
// diagonalization between column and diagonal steps.
1035 # if (__SUNPRO_CC != 0x5120) 1038 word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
1040 get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
1042 __m128i row1l, row1h, row2l, row2h;
1043 __m128i row3l, row3h, row4l, row4h;
1044 __m128i b0, b1, t0,
t1;
// Load chaining value h[0..7] two words at a time.
1046 row1l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[0]));
1047 row1h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[2]));
1048 row2l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[4]));
1049 row2h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[6]));
// rows 3-4: IV words, with the counter t[] and finalization flags f[]
// folded into row4 per the BLAKE2 initialization.
1050 row3l = _mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(0)));
1051 row3h = _mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(2)));
1052 row4l = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(4))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
t[0])));
1053 row4h = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(6))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
f[0])));
// Round 0, column step (sigma row 0): rotations 32 then 24.
1055 b0 = _mm_set_epi64x(m2, m0);
1056 b1 = _mm_set_epi64x(m6, m4);
1057 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1058 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1059 row4l = _mm_xor_si128(row4l, row1l);
1060 row4h = _mm_xor_si128(row4h, row1h);
1061 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1062 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1063 row3l = _mm_add_epi64(row3l, row4l);
1064 row3h = _mm_add_epi64(row3h, row4h);
1065 row2l = _mm_xor_si128(row2l, row3l);
1066 row2h = _mm_xor_si128(row2h, row3h);
1067 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40));
1068 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40));
// Round 0, second half: rotations 16 then 63.
1070 b0 = _mm_set_epi64x(m3, m1);
1071 b1 = _mm_set_epi64x(m7, m5);
1072 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1073 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1074 row4l = _mm_xor_si128(row4l, row1l);
1075 row4h = _mm_xor_si128(row4h, row1h);
1076 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1077 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1078 row3l = _mm_add_epi64(row3l, row4l);
1079 row3h = _mm_add_epi64(row3h, row4h);
1080 row2l = _mm_xor_si128(row2l, row3l);
1081 row2h = _mm_xor_si128(row2h, row3h);
1082 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1083 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
// Diagonalize: rotate row2/row3/row4 lanes across the l/h halves.
1085 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1086 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1087 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1088 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1089 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
// Round 0, diagonal step begins (truncated below; the remaining
// rounds fall outside this chunk).
1091 b0 = _mm_set_epi64x(m10, m8);
1092 b1 = _mm_set_epi64x(m14, m12);
1093 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1094 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1095 row4l = _mm_xor_si128(row4l, row1l);
1096 row4h = _mm_xor_si128(row4h, row1h);
1097 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1098 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1099 row3l = _mm_add_epi64(row3l, row4l);
1100 row3h = _mm_add_epi64(row3h, row4h);
1101 row2l = _mm_xor_si128(row2l, row3l);
1102 row2h = _mm_xor_si128(row2h, row3h);
1103 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1104 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1106 b0 = _mm_set_epi64x(m11, m9);
1107 b1 = _mm_set_epi64x(m15, m13);
1108 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1109 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1110 row4l = _mm_xor_si128(row4l, row1l);
1111 row4h = _mm_xor_si128(row4h, row1h);
1112 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1113 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1114 row3l = _mm_add_epi64(row3l, row4l);
1115 row3h = _mm_add_epi64(row3h, row4h);
1116 row2l = _mm_xor_si128(row2l, row3l);
1117 row2h = _mm_xor_si128(row2h, row3h);
1118 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1119 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1121 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1122 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1123 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1124 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1125 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1127 b0 = _mm_set_epi64x(m4, m14);
1128 b1 = _mm_set_epi64x(m13, m9);
1129 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1130 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1131 row4l = _mm_xor_si128(row4l, row1l);
1132 row4h = _mm_xor_si128(row4h, row1h);
1133 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1134 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1135 row3l = _mm_add_epi64(row3l, row4l);
1136 row3h = _mm_add_epi64(row3h, row4h);
1137 row2l = _mm_xor_si128(row2l, row3l);
1138 row2h = _mm_xor_si128(row2h, row3h);
1139 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1140 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1142 b0 = _mm_set_epi64x(m8, m10);
1143 b1 = _mm_set_epi64x(m6, m15);
1144 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1145 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1146 row4l = _mm_xor_si128(row4l, row1l);
1147 row4h = _mm_xor_si128(row4h, row1h);
1148 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1149 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1150 row3l = _mm_add_epi64(row3l, row4l);
1151 row3h = _mm_add_epi64(row3h, row4h);
1152 row2l = _mm_xor_si128(row2l, row3l);
1153 row2h = _mm_xor_si128(row2h, row3h);
1154 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1155 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1157 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1158 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1159 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1160 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1161 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1162 b0 = _mm_set_epi64x(m0, m1);
1163 b1 = _mm_set_epi64x(m5, m11);
1164 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1165 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1166 row4l = _mm_xor_si128(row4l, row1l);
1167 row4h = _mm_xor_si128(row4h, row1h);
1168 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1169 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1170 row3l = _mm_add_epi64(row3l, row4l);
1171 row3h = _mm_add_epi64(row3h, row4h);
1172 row2l = _mm_xor_si128(row2l, row3l);
1173 row2h = _mm_xor_si128(row2h, row3h);
1174 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1175 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1177 b0 = _mm_set_epi64x(m2, m12);
1178 b1 = _mm_set_epi64x(m3, m7);
1179 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1180 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1181 row4l = _mm_xor_si128(row4l, row1l);
1182 row4h = _mm_xor_si128(row4h, row1h);
1183 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1184 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1185 row3l = _mm_add_epi64(row3l, row4l);
1186 row3h = _mm_add_epi64(row3h, row4h);
1187 row2l = _mm_xor_si128(row2l, row3l);
1188 row2h = _mm_xor_si128(row2h, row3h);
1189 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1190 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1192 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1193 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1194 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1195 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1196 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1198 b0 = _mm_set_epi64x(m12, m11);
1199 b1 = _mm_set_epi64x(m15, m5);
1200 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1201 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1202 row4l = _mm_xor_si128(row4l, row1l);
1203 row4h = _mm_xor_si128(row4h, row1h);
1204 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1205 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1206 row3l = _mm_add_epi64(row3l, row4l);
1207 row3h = _mm_add_epi64(row3h, row4h);
1208 row2l = _mm_xor_si128(row2l, row3l);
1209 row2h = _mm_xor_si128(row2h, row3h);
1210 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1211 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1213 b0 = _mm_set_epi64x(m0, m8);
1214 b1 = _mm_set_epi64x(m13, m2);
1215 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1216 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1217 row4l = _mm_xor_si128(row4l, row1l);
1218 row4h = _mm_xor_si128(row4h, row1h);
1219 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1220 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1221 row3l = _mm_add_epi64(row3l, row4l);
1222 row3h = _mm_add_epi64(row3h, row4h);
1223 row2l = _mm_xor_si128(row2l, row3l);
1224 row2h = _mm_xor_si128(row2h, row3h);
1225 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1226 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1228 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1229 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1230 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1231 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1232 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1233 b0 = _mm_set_epi64x(m3, m10);
1234 b1 = _mm_set_epi64x(m9, m7);
1235 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1236 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1237 row4l = _mm_xor_si128(row4l, row1l);
1238 row4h = _mm_xor_si128(row4h, row1h);
1239 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1240 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1241 row3l = _mm_add_epi64(row3l, row4l);
1242 row3h = _mm_add_epi64(row3h, row4h);
1243 row2l = _mm_xor_si128(row2l, row3l);
1244 row2h = _mm_xor_si128(row2h, row3h);
1245 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1246 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1248 b0 = _mm_set_epi64x(m6, m14);
1249 b1 = _mm_set_epi64x(m4, m1);
1250 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1251 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1252 row4l = _mm_xor_si128(row4l, row1l);
1253 row4h = _mm_xor_si128(row4h, row1h);
1254 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1255 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1256 row3l = _mm_add_epi64(row3l, row4l);
1257 row3h = _mm_add_epi64(row3h, row4h);
1258 row2l = _mm_xor_si128(row2l, row3l);
1259 row2h = _mm_xor_si128(row2h, row3h);
1260 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1261 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1263 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1264 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1265 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1266 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1267 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1269 b0 = _mm_set_epi64x(m3, m7);
1270 b1 = _mm_set_epi64x(m11, m13);
1271 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1272 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1273 row4l = _mm_xor_si128(row4l, row1l);
1274 row4h = _mm_xor_si128(row4h, row1h);
1275 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1276 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1277 row3l = _mm_add_epi64(row3l, row4l);
1278 row3h = _mm_add_epi64(row3h, row4h);
1279 row2l = _mm_xor_si128(row2l, row3l);
1280 row2h = _mm_xor_si128(row2h, row3h);
1281 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1282 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1284 b0 = _mm_set_epi64x(m1, m9);
1285 b1 = _mm_set_epi64x(m14, m12);
1286 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1287 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1288 row4l = _mm_xor_si128(row4l, row1l);
1289 row4h = _mm_xor_si128(row4h, row1h);
1290 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1291 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1292 row3l = _mm_add_epi64(row3l, row4l);
1293 row3h = _mm_add_epi64(row3h, row4h);
1294 row2l = _mm_xor_si128(row2l, row3l);
1295 row2h = _mm_xor_si128(row2h, row3h);
1296 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1297 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1299 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1300 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1301 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1302 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1303 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1304 b0 = _mm_set_epi64x(m5, m2);
1305 b1 = _mm_set_epi64x(m15, m4);
1306 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1307 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1308 row4l = _mm_xor_si128(row4l, row1l);
1309 row4h = _mm_xor_si128(row4h, row1h);
1310 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1311 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1312 row3l = _mm_add_epi64(row3l, row4l);
1313 row3h = _mm_add_epi64(row3h, row4h);
1314 row2l = _mm_xor_si128(row2l, row3l);
1315 row2h = _mm_xor_si128(row2h, row3h);
1316 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1317 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1319 b0 = _mm_set_epi64x(m10, m6);
1320 b1 = _mm_set_epi64x(m8, m0);
1321 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1322 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1323 row4l = _mm_xor_si128(row4l, row1l);
1324 row4h = _mm_xor_si128(row4h, row1h);
1325 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1326 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1327 row3l = _mm_add_epi64(row3l, row4l);
1328 row3h = _mm_add_epi64(row3h, row4h);
1329 row2l = _mm_xor_si128(row2l, row3l);
1330 row2h = _mm_xor_si128(row2h, row3h);
1331 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1332 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1334 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1335 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1336 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1337 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1338 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1340 b0 = _mm_set_epi64x(m5, m9);
1341 b1 = _mm_set_epi64x(m10, m2);
1342 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1343 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1344 row4l = _mm_xor_si128(row4l, row1l);
1345 row4h = _mm_xor_si128(row4h, row1h);
1346 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1347 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1348 row3l = _mm_add_epi64(row3l, row4l);
1349 row3h = _mm_add_epi64(row3h, row4h);
1350 row2l = _mm_xor_si128(row2l, row3l);
1351 row2h = _mm_xor_si128(row2h, row3h);
1352 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1353 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1355 b0 = _mm_set_epi64x(m7, m0);
1356 b1 = _mm_set_epi64x(m15, m4);
1357 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1358 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1359 row4l = _mm_xor_si128(row4l, row1l);
1360 row4h = _mm_xor_si128(row4h, row1h);
1361 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1362 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1363 row3l = _mm_add_epi64(row3l, row4l);
1364 row3h = _mm_add_epi64(row3h, row4h);
1365 row2l = _mm_xor_si128(row2l, row3l);
1366 row2h = _mm_xor_si128(row2h, row3h);
1367 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1368 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1370 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1371 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1372 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1373 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1374 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1375 b0 = _mm_set_epi64x(m11, m14);
1376 b1 = _mm_set_epi64x(m3, m6);
1377 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1378 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1379 row4l = _mm_xor_si128(row4l, row1l);
1380 row4h = _mm_xor_si128(row4h, row1h);
1381 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1382 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1383 row3l = _mm_add_epi64(row3l, row4l);
1384 row3h = _mm_add_epi64(row3h, row4h);
1385 row2l = _mm_xor_si128(row2l, row3l);
1386 row2h = _mm_xor_si128(row2h, row3h);
1387 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1388 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1391 b0 = _mm_set_epi64x(m12, m1);
1392 b1 = _mm_set_epi64x(m13, m8);
1393 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1394 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1395 row4l = _mm_xor_si128(row4l, row1l);
1396 row4h = _mm_xor_si128(row4h, row1h);
1397 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1398 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1399 row3l = _mm_add_epi64(row3l, row4l);
1400 row3h = _mm_add_epi64(row3h, row4h);
1401 row2l = _mm_xor_si128(row2l, row3l);
1402 row2h = _mm_xor_si128(row2h, row3h);
1403 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1404 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1406 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1407 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1408 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1409 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1410 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1412 b0 = _mm_set_epi64x(m6, m2);
1413 b1 = _mm_set_epi64x(m8, m0);
1414 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1415 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1416 row4l = _mm_xor_si128(row4l, row1l);
1417 row4h = _mm_xor_si128(row4h, row1h);
1418 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1419 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1420 row3l = _mm_add_epi64(row3l, row4l);
1421 row3h = _mm_add_epi64(row3h, row4h);
1422 row2l = _mm_xor_si128(row2l, row3l);
1423 row2h = _mm_xor_si128(row2h, row3h);
1424 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1425 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1427 b0 = _mm_set_epi64x(m10, m12);
1428 b1 = _mm_set_epi64x(m3, m11);
1429 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1430 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1431 row4l = _mm_xor_si128(row4l, row1l);
1432 row4h = _mm_xor_si128(row4h, row1h);
1433 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1434 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1435 row3l = _mm_add_epi64(row3l, row4l);
1436 row3h = _mm_add_epi64(row3h, row4h);
1437 row2l = _mm_xor_si128(row2l, row3l);
1438 row2h = _mm_xor_si128(row2h, row3h);
1439 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1440 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1442 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1443 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1444 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1445 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1446 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1447 b0 = _mm_set_epi64x(m7, m4);
1448 b1 = _mm_set_epi64x(m1, m15);
1449 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1450 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1451 row4l = _mm_xor_si128(row4l, row1l);
1452 row4h = _mm_xor_si128(row4h, row1h);
1453 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1454 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1455 row3l = _mm_add_epi64(row3l, row4l);
1456 row3h = _mm_add_epi64(row3h, row4h);
1457 row2l = _mm_xor_si128(row2l, row3l);
1458 row2h = _mm_xor_si128(row2h, row3h);
1459 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1460 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1462 b0 = _mm_set_epi64x(m5, m13);
1463 b1 = _mm_set_epi64x(m9, m14);
1464 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1465 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1466 row4l = _mm_xor_si128(row4l, row1l);
1467 row4h = _mm_xor_si128(row4h, row1h);
1468 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1469 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1470 row3l = _mm_add_epi64(row3l, row4l);
1471 row3h = _mm_add_epi64(row3h, row4h);
1472 row2l = _mm_xor_si128(row2l, row3l);
1473 row2h = _mm_xor_si128(row2h, row3h);
1474 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1475 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1477 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1478 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1479 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1480 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1481 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1483 b0 = _mm_set_epi64x(m1, m12);
1484 b1 = _mm_set_epi64x(m4, m14);
1485 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1486 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1487 row4l = _mm_xor_si128(row4l, row1l);
1488 row4h = _mm_xor_si128(row4h, row1h);
1489 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1490 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1491 row3l = _mm_add_epi64(row3l, row4l);
1492 row3h = _mm_add_epi64(row3h, row4h);
1493 row2l = _mm_xor_si128(row2l, row3l);
1494 row2h = _mm_xor_si128(row2h, row3h);
1495 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1496 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1498 b0 = _mm_set_epi64x(m15, m5);
1499 b1 = _mm_set_epi64x(m10, m13);
1500 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1501 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1502 row4l = _mm_xor_si128(row4l, row1l);
1503 row4h = _mm_xor_si128(row4h, row1h);
1504 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1505 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1506 row3l = _mm_add_epi64(row3l, row4l);
1507 row3h = _mm_add_epi64(row3h, row4h);
1508 row2l = _mm_xor_si128(row2l, row3l);
1509 row2h = _mm_xor_si128(row2h, row3h);
1510 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1511 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1513 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1514 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1515 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1516 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1517 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1518 b0 = _mm_set_epi64x(m6, m0);
1519 b1 = _mm_set_epi64x(m8, m9);
1520 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1521 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1522 row4l = _mm_xor_si128(row4l, row1l);
1523 row4h = _mm_xor_si128(row4h, row1h);
1524 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1525 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1526 row3l = _mm_add_epi64(row3l, row4l);
1527 row3h = _mm_add_epi64(row3h, row4h);
1528 row2l = _mm_xor_si128(row2l, row3l);
1529 row2h = _mm_xor_si128(row2h, row3h);
1530 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1531 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1533 b0 = _mm_set_epi64x(m3, m7);
1534 b1 = _mm_set_epi64x(m11, m2);
1535 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1536 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1537 row4l = _mm_xor_si128(row4l, row1l);
1538 row4h = _mm_xor_si128(row4h, row1h);
1539 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1540 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1541 row3l = _mm_add_epi64(row3l, row4l);
1542 row3h = _mm_add_epi64(row3h, row4h);
1543 row2l = _mm_xor_si128(row2l, row3l);
1544 row2h = _mm_xor_si128(row2h, row3h);
1545 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1546 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1548 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1549 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1550 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1551 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1552 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1554 b0 = _mm_set_epi64x(m7, m13);
1555 b1 = _mm_set_epi64x(m3, m12);
1556 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1557 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1558 row4l = _mm_xor_si128(row4l, row1l);
1559 row4h = _mm_xor_si128(row4h, row1h);
1560 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1561 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1562 row3l = _mm_add_epi64(row3l, row4l);
1563 row3h = _mm_add_epi64(row3h, row4h);
1564 row2l = _mm_xor_si128(row2l, row3l);
1565 row2h = _mm_xor_si128(row2h, row3h);
1566 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1567 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1569 b0 = _mm_set_epi64x(m14, m11);
1570 b1 = _mm_set_epi64x(m9, m1);
1571 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1572 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1573 row4l = _mm_xor_si128(row4l, row1l);
1574 row4h = _mm_xor_si128(row4h, row1h);
1575 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1576 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1577 row3l = _mm_add_epi64(row3l, row4l);
1578 row3h = _mm_add_epi64(row3h, row4h);
1579 row2l = _mm_xor_si128(row2l, row3l);
1580 row2h = _mm_xor_si128(row2h, row3h);
1581 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1582 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1584 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1585 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1586 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1587 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1588 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1589 b0 = _mm_set_epi64x(m15, m5);
1590 b1 = _mm_set_epi64x(m2, m8);
1591 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1592 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1593 row4l = _mm_xor_si128(row4l, row1l);
1594 row4h = _mm_xor_si128(row4h, row1h);
1595 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1596 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1597 row3l = _mm_add_epi64(row3l, row4l);
1598 row3h = _mm_add_epi64(row3h, row4h);
1599 row2l = _mm_xor_si128(row2l, row3l);
1600 row2h = _mm_xor_si128(row2h, row3h);
1601 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1602 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1604 b0 = _mm_set_epi64x(m4, m0);
1605 b1 = _mm_set_epi64x(m10, m6);
1606 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1607 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1608 row4l = _mm_xor_si128(row4l, row1l);
1609 row4h = _mm_xor_si128(row4h, row1h);
1610 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1611 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1612 row3l = _mm_add_epi64(row3l, row4l);
1613 row3h = _mm_add_epi64(row3h, row4h);
1614 row2l = _mm_xor_si128(row2l, row3l);
1615 row2h = _mm_xor_si128(row2h, row3h);
1616 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1617 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1619 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1620 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1621 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1622 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1623 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1625 b0 = _mm_set_epi64x(m14, m6);
1626 b1 = _mm_set_epi64x(m0, m11);
1627 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1628 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1629 row4l = _mm_xor_si128(row4l, row1l);
1630 row4h = _mm_xor_si128(row4h, row1h);
1631 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1632 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1633 row3l = _mm_add_epi64(row3l, row4l);
1634 row3h = _mm_add_epi64(row3h, row4h);
1635 row2l = _mm_xor_si128(row2l, row3l);
1636 row2h = _mm_xor_si128(row2h, row3h);
1637 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1638 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1640 b0 = _mm_set_epi64x(m9, m15);
1641 b1 = _mm_set_epi64x(m8, m3);
1642 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1643 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1644 row4l = _mm_xor_si128(row4l, row1l);
1645 row4h = _mm_xor_si128(row4h, row1h);
1646 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1647 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1648 row3l = _mm_add_epi64(row3l, row4l);
1649 row3h = _mm_add_epi64(row3h, row4h);
1650 row2l = _mm_xor_si128(row2l, row3l);
1651 row2h = _mm_xor_si128(row2h, row3h);
1652 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1653 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1655 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1656 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1657 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1658 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1659 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1660 b0 = _mm_set_epi64x(m13, m12);
1661 b1 = _mm_set_epi64x(m10, m1);
1662 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1663 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1664 row4l = _mm_xor_si128(row4l, row1l);
1665 row4h = _mm_xor_si128(row4h, row1h);
1666 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1667 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1668 row3l = _mm_add_epi64(row3l, row4l);
1669 row3h = _mm_add_epi64(row3h, row4h);
1670 row2l = _mm_xor_si128(row2l, row3l);
1671 row2h = _mm_xor_si128(row2h, row3h);
1672 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1673 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1675 b0 = _mm_set_epi64x(m7, m2);
1676 b1 = _mm_set_epi64x(m5, m4);
1677 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1678 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1679 row4l = _mm_xor_si128(row4l, row1l);
1680 row4h = _mm_xor_si128(row4h, row1h);
1681 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1682 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1683 row3l = _mm_add_epi64(row3l, row4l);
1684 row3h = _mm_add_epi64(row3h, row4h);
1685 row2l = _mm_xor_si128(row2l, row3l);
1686 row2h = _mm_xor_si128(row2h, row3h);
1687 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1688 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1690 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1691 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1692 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1693 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1694 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1696 b0 = _mm_set_epi64x(m8, m10);
1697 b1 = _mm_set_epi64x(m1, m7);
1698 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1699 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1700 row4l = _mm_xor_si128(row4l, row1l);
1701 row4h = _mm_xor_si128(row4h, row1h);
1702 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1703 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1704 row3l = _mm_add_epi64(row3l, row4l);
1705 row3h = _mm_add_epi64(row3h, row4h);
1706 row2l = _mm_xor_si128(row2l, row3l);
1707 row2h = _mm_xor_si128(row2h, row3h);
1708 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1709 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1711 b0 = _mm_set_epi64x(m4, m2);
1712 b1 = _mm_set_epi64x(m5, m6);
1713 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1714 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1715 row4l = _mm_xor_si128(row4l, row1l);
1716 row4h = _mm_xor_si128(row4h, row1h);
1717 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1718 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1719 row3l = _mm_add_epi64(row3l, row4l);
1720 row3h = _mm_add_epi64(row3h, row4h);
1721 row2l = _mm_xor_si128(row2l, row3l);
1722 row2h = _mm_xor_si128(row2h, row3h);
1723 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1724 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1726 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1727 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1728 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1729 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1730 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1731 b0 = _mm_set_epi64x(m9, m15);
1732 b1 = _mm_set_epi64x(m13, m3);
1733 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1734 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1735 row4l = _mm_xor_si128(row4l, row1l);
1736 row4h = _mm_xor_si128(row4h, row1h);
1737 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1738 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1739 row3l = _mm_add_epi64(row3l, row4l);
1740 row3h = _mm_add_epi64(row3h, row4h);
1741 row2l = _mm_xor_si128(row2l, row3l);
1742 row2h = _mm_xor_si128(row2h, row3h);
1743 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1744 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1746 b0 = _mm_set_epi64x(m14, m11);
1747 b1 = _mm_set_epi64x(m0, m12);
1748 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1749 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1750 row4l = _mm_xor_si128(row4l, row1l);
1751 row4h = _mm_xor_si128(row4h, row1h);
1752 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1753 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1754 row3l = _mm_add_epi64(row3l, row4l);
1755 row3h = _mm_add_epi64(row3h, row4h);
1756 row2l = _mm_xor_si128(row2l, row3l);
1757 row2h = _mm_xor_si128(row2h, row3h);
1758 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1759 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1761 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1762 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1763 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1764 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1765 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1767 b0 = _mm_set_epi64x(m2, m0);
1768 b1 = _mm_set_epi64x(m6, m4);
1769 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1770 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1771 row4l = _mm_xor_si128(row4l, row1l);
1772 row4h = _mm_xor_si128(row4h, row1h);
1773 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1774 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1775 row3l = _mm_add_epi64(row3l, row4l);
1776 row3h = _mm_add_epi64(row3h, row4h);
1777 row2l = _mm_xor_si128(row2l, row3l);
1778 row2h = _mm_xor_si128(row2h, row3h);
1779 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1780 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1782 b0 = _mm_set_epi64x(m3, m1);
1783 b1 = _mm_set_epi64x(m7, m5);
1784 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1785 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1786 row4l = _mm_xor_si128(row4l, row1l);
1787 row4h = _mm_xor_si128(row4h, row1h);
1788 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1789 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1790 row3l = _mm_add_epi64(row3l, row4l);
1791 row3h = _mm_add_epi64(row3h, row4h);
1792 row2l = _mm_xor_si128(row2l, row3l);
1793 row2h = _mm_xor_si128(row2h, row3h);
1794 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1795 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1797 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1798 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1799 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1800 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1801 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1803 b0 = _mm_set_epi64x(m10, m8);
1804 b1 = _mm_set_epi64x(m14, m12);
1805 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1806 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1807 row4l = _mm_xor_si128(row4l, row1l);
1808 row4h = _mm_xor_si128(row4h, row1h);
1809 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1810 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1811 row3l = _mm_add_epi64(row3l, row4l);
1812 row3h = _mm_add_epi64(row3h, row4h);
1813 row2l = _mm_xor_si128(row2l, row3l);
1814 row2h = _mm_xor_si128(row2h, row3h);
1815 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1816 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1818 b0 = _mm_set_epi64x(m11, m9);
1819 b1 = _mm_set_epi64x(m15, m13);
1820 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1821 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1822 row4l = _mm_xor_si128(row4l, row1l);
1823 row4h = _mm_xor_si128(row4h, row1h);
1824 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1825 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1826 row3l = _mm_add_epi64(row3l, row4l);
1827 row3h = _mm_add_epi64(row3h, row4h);
1828 row2l = _mm_xor_si128(row2l, row3l);
1829 row2h = _mm_xor_si128(row2h, row3h);
1830 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1831 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1833 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1834 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1835 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1836 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1837 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1839 b0 = _mm_set_epi64x(m4, m14);
1840 b1 = _mm_set_epi64x(m13, m9);
1841 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1842 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1843 row4l = _mm_xor_si128(row4l, row1l);
1844 row4h = _mm_xor_si128(row4h, row1h);
1845 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1846 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1847 row3l = _mm_add_epi64(row3l, row4l);
1848 row3h = _mm_add_epi64(row3h, row4h);
1849 row2l = _mm_xor_si128(row2l, row3l);
1850 row2h = _mm_xor_si128(row2h, row3h);
1851 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1852 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1854 b0 = _mm_set_epi64x(m8, m10);
1855 b1 = _mm_set_epi64x(m6, m15);
1856 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1857 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1858 row4l = _mm_xor_si128(row4l, row1l);
1859 row4h = _mm_xor_si128(row4h, row1h);
1860 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1861 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1862 row3l = _mm_add_epi64(row3l, row4l);
1863 row3h = _mm_add_epi64(row3h, row4h);
1864 row2l = _mm_xor_si128(row2l, row3l);
1865 row2h = _mm_xor_si128(row2h, row3h);
1866 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1867 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1869 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1870 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1871 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1872 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1873 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1874 b0 = _mm_set_epi64x(m0, m1);
1875 b1 = _mm_set_epi64x(m5, m11);
1876 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1877 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1878 row4l = _mm_xor_si128(row4l, row1l);
1879 row4h = _mm_xor_si128(row4h, row1h);
1880 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1881 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1882 row3l = _mm_add_epi64(row3l, row4l);
1883 row3h = _mm_add_epi64(row3h, row4h);
1884 row2l = _mm_xor_si128(row2l, row3l);
1885 row2h = _mm_xor_si128(row2h, row3h);
1886 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1887 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1889 b0 = _mm_set_epi64x(m2, m12);
1890 b1 = _mm_set_epi64x(m3, m7);
1891 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1892 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1893 row4l = _mm_xor_si128(row4l, row1l);
1894 row4h = _mm_xor_si128(row4h, row1h);
1895 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1896 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1897 row3l = _mm_add_epi64(row3l, row4l);
1898 row3h = _mm_add_epi64(row3h, row4h);
1899 row2l = _mm_xor_si128(row2l, row3l);
1900 row2h = _mm_xor_si128(row2h, row3h);
1901 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1902 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1904 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1905 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1906 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1907 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1908 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1910 row1l = _mm_xor_si128(row3l, row1l);
1911 row1h = _mm_xor_si128(row3h, row1h);
1912 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[0]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[0])), row1l));
1913 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[2]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[2])), row1h));
1915 row2l = _mm_xor_si128(row4l, row2l);
1916 row2h = _mm_xor_si128(row4h, row2h);
1917 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[4]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[4])), row2l));
1918 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[6]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[6])), row2h));
1920 # endif // (__SUNPRO_CC != 0x5120) 1921 #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 1923 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 1926 __m128i row1, row2, row3, row4;
1927 __m128i buf1, buf2, buf3, buf4;
1932 const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
1933 const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
1935 const __m128i m0 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 00));
1936 const __m128i m1 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 16));
1937 const __m128i m2 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 32));
1938 const __m128i m3 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 48));
1940 row1 = ff0 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[0]));
1941 row2 = ff1 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[4]));
1944 buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
1946 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1947 row4 = _mm_xor_si128(row4, row1);
1948 row4 = _mm_shuffle_epi8(row4,r16);
1949 row3 = _mm_add_epi32(row3, row4);
1950 row2 = _mm_xor_si128(row2, row3);
1951 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1953 buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
1955 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
1956 row4 = _mm_xor_si128(row4, row1);
1957 row4 = _mm_shuffle_epi8(row4,r8);
1958 row3 = _mm_add_epi32(row3, row4);
1959 row2 = _mm_xor_si128(row2, row3);
1960 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1962 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
1963 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1964 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
1966 buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
1968 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
1969 row4 = _mm_xor_si128(row4, row1);
1970 row4 = _mm_shuffle_epi8(row4,r16);
1971 row3 = _mm_add_epi32(row3, row4);
1972 row2 = _mm_xor_si128(row2, row3);
1973 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1975 buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
1977 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
1978 row4 = _mm_xor_si128(row4, row1);
1979 row4 = _mm_shuffle_epi8(row4,r8);
1980 row3 = _mm_add_epi32(row3, row4);
1981 row2 = _mm_xor_si128(row2, row3);
1982 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1984 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
1985 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1986 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
1988 t0 = _mm_blend_epi16(m1, m2, 0x0C);
1989 t1 = _mm_slli_si128(m3, 4);
1990 t2 = _mm_blend_epi16(t0, t1, 0xF0);
1991 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
1993 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1994 row4 = _mm_xor_si128(row4, row1);
1995 row4 = _mm_shuffle_epi8(row4,r16);
1996 row3 = _mm_add_epi32(row3, row4);
1997 row2 = _mm_xor_si128(row2, row3);
1998 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2000 t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
2001 t1 = _mm_blend_epi16(m1,m3,0xC0);
2002 t2 = _mm_blend_epi16(t0, t1, 0xF0);
2003 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2005 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2006 row4 = _mm_xor_si128(row4, row1);
2007 row4 = _mm_shuffle_epi8(row4,r8);
2008 row3 = _mm_add_epi32(row3, row4);
2009 row2 = _mm_xor_si128(row2, row3);
2010 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2012 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2013 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2014 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2016 t0 = _mm_slli_si128(m1, 4);
2017 t1 = _mm_blend_epi16(m2, t0, 0x30);
2018 t2 = _mm_blend_epi16(m0, t1, 0xF0);
2019 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2021 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2022 row4 = _mm_xor_si128(row4, row1);
2023 row4 = _mm_shuffle_epi8(row4,r16);
2024 row3 = _mm_add_epi32(row3, row4);
2025 row2 = _mm_xor_si128(row2, row3);
2026 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2028 t0 = _mm_unpackhi_epi32(m0,m1);
2029 t1 = _mm_slli_si128(m3, 4);
2030 t2 = _mm_blend_epi16(t0, t1, 0x0C);
2031 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2033 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2034 row4 = _mm_xor_si128(row4, row1);
2035 row4 = _mm_shuffle_epi8(row4,r8);
2036 row3 = _mm_add_epi32(row3, row4);
2037 row2 = _mm_xor_si128(row2, row3);
2038 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2040 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2041 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2042 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2044 t0 = _mm_unpackhi_epi32(m2,m3);
2045 t1 = _mm_blend_epi16(m3,m1,0x0C);
2046 t2 = _mm_blend_epi16(t0, t1, 0x0F);
2047 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2049 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2050 row4 = _mm_xor_si128(row4, row1);
2051 row4 = _mm_shuffle_epi8(row4,r16);
2052 row3 = _mm_add_epi32(row3, row4);
2053 row2 = _mm_xor_si128(row2, row3);
2054 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2056 t0 = _mm_unpacklo_epi32(m2,m0);
2057 t1 = _mm_blend_epi16(t0, m0, 0xF0);
2058 t2 = _mm_slli_si128(m3, 8);
2059 buf2 = _mm_blend_epi16(t1, t2, 0xC0);
2061 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2062 row4 = _mm_xor_si128(row4, row1);
2063 row4 = _mm_shuffle_epi8(row4,r8);
2064 row3 = _mm_add_epi32(row3, row4);
2065 row2 = _mm_xor_si128(row2, row3);
2066 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2068 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2069 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2070 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2072 t0 = _mm_blend_epi16(m0, m2, 0x3C);
2073 t1 = _mm_srli_si128(m1, 12);
2074 t2 = _mm_blend_epi16(t0,t1,0x03);
2075 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
2077 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2078 row4 = _mm_xor_si128(row4, row1);
2079 row4 = _mm_shuffle_epi8(row4,r16);
2080 row3 = _mm_add_epi32(row3, row4);
2081 row2 = _mm_xor_si128(row2, row3);
2082 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2084 t0 = _mm_slli_si128(m3, 4);
2085 t1 = _mm_blend_epi16(m0, m1, 0x33);
2086 t2 = _mm_blend_epi16(t1, t0, 0xC0);
2087 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
2089 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2090 row4 = _mm_xor_si128(row4, row1);
2091 row4 = _mm_shuffle_epi8(row4,r8);
2092 row3 = _mm_add_epi32(row3, row4);
2093 row2 = _mm_xor_si128(row2, row3);
2094 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2096 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2097 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2098 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2100 t0 = _mm_unpackhi_epi32(m0,m1);
2101 t1 = _mm_unpackhi_epi32(t0, m2);
2102 t2 = _mm_blend_epi16(t1, m3, 0x0C);
2103 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2105 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2106 row4 = _mm_xor_si128(row4, row1);
2107 row4 = _mm_shuffle_epi8(row4,r16);
2108 row3 = _mm_add_epi32(row3, row4);
2109 row2 = _mm_xor_si128(row2, row3);
2110 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2112 t0 = _mm_slli_si128(m2, 8);
2113 t1 = _mm_blend_epi16(m3,m0,0x0C);
2114 t2 = _mm_blend_epi16(t1, t0, 0xC0);
2115 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2117 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2118 row4 = _mm_xor_si128(row4, row1);
2119 row4 = _mm_shuffle_epi8(row4,r8);
2120 row3 = _mm_add_epi32(row3, row4);
2121 row2 = _mm_xor_si128(row2, row3);
2122 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2124 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2125 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2126 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2128 t0 = _mm_blend_epi16(m0,m1,0x0F);
2129 t1 = _mm_blend_epi16(t0, m3, 0xC0);
2130 buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2132 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2133 row4 = _mm_xor_si128(row4, row1);
2134 row4 = _mm_shuffle_epi8(row4,r16);
2135 row3 = _mm_add_epi32(row3, row4);
2136 row2 = _mm_xor_si128(row2, row3);
2137 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2139 t0 = _mm_unpacklo_epi32(m0,m2);
2140 t1 = _mm_unpackhi_epi32(m1,m2);
2141 buf4 = _mm_unpacklo_epi64(t1,t0);
2143 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2144 row4 = _mm_xor_si128(row4, row1);
2145 row4 = _mm_shuffle_epi8(row4,r8);
2146 row3 = _mm_add_epi32(row3, row4);
2147 row2 = _mm_xor_si128(row2, row3);
2148 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2150 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2151 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2152 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2154 t0 = _mm_unpacklo_epi64(m1,m2);
2155 t1 = _mm_unpackhi_epi64(m0,m2);
2156 t2 = _mm_blend_epi16(t0,t1,0x33);
2157 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2159 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2160 row4 = _mm_xor_si128(row4, row1);
2161 row4 = _mm_shuffle_epi8(row4,r16);
2162 row3 = _mm_add_epi32(row3, row4);
2163 row2 = _mm_xor_si128(row2, row3);
2164 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2166 t0 = _mm_unpackhi_epi64(m1,m3);
2167 t1 = _mm_unpacklo_epi64(m0,m1);
2168 buf2 = _mm_blend_epi16(t0,t1,0x33);
2170 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2171 row4 = _mm_xor_si128(row4, row1);
2172 row4 = _mm_shuffle_epi8(row4,r8);
2173 row3 = _mm_add_epi32(row3, row4);
2174 row2 = _mm_xor_si128(row2, row3);
2175 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2177 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2178 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2179 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2181 t0 = _mm_unpackhi_epi64(m3,m1);
2182 t1 = _mm_unpackhi_epi64(m2,m0);
2183 buf3 = _mm_blend_epi16(t1,t0,0x33);
2185 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2186 row4 = _mm_xor_si128(row4, row1);
2187 row4 = _mm_shuffle_epi8(row4,r16);
2188 row3 = _mm_add_epi32(row3, row4);
2189 row2 = _mm_xor_si128(row2, row3);
2190 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2192 t0 = _mm_blend_epi16(m0,m2,0x03);
2193 t1 = _mm_slli_si128(t0, 8);
2194 t2 = _mm_blend_epi16(t1,m3,0x0F);
2195 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
2197 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2198 row4 = _mm_xor_si128(row4, row1);
2199 row4 = _mm_shuffle_epi8(row4,r8);
2200 row3 = _mm_add_epi32(row3, row4);
2201 row2 = _mm_xor_si128(row2, row3);
2202 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2204 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2205 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2206 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2208 t0 = _mm_unpackhi_epi32(m0,m1);
2209 t1 = _mm_unpacklo_epi32(m0,m2);
2210 buf1 = _mm_unpacklo_epi64(t0,t1);
2212 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2213 row4 = _mm_xor_si128(row4, row1);
2214 row4 = _mm_shuffle_epi8(row4,r16);
2215 row3 = _mm_add_epi32(row3, row4);
2216 row2 = _mm_xor_si128(row2, row3);
2217 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2219 t0 = _mm_srli_si128(m2, 4);
2220 t1 = _mm_blend_epi16(m0,m3,0x03);
2221 buf2 = _mm_blend_epi16(t1,t0,0x3C);
2223 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2224 row4 = _mm_xor_si128(row4, row1);
2225 row4 = _mm_shuffle_epi8(row4,r8);
2226 row3 = _mm_add_epi32(row3, row4);
2227 row2 = _mm_xor_si128(row2, row3);
2228 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2230 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2231 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2232 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2234 t0 = _mm_blend_epi16(m1,m0,0x0C);
2235 t1 = _mm_srli_si128(m3, 4);
2236 t2 = _mm_blend_epi16(t0,t1,0x30);
2237 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
2239 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2240 row4 = _mm_xor_si128(row4, row1);
2241 row4 = _mm_shuffle_epi8(row4,r16);
2242 row3 = _mm_add_epi32(row3, row4);
2243 row2 = _mm_xor_si128(row2, row3);
2244 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2246 t0 = _mm_unpacklo_epi64(m1,m2);
2247 t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
2248 buf4 = _mm_blend_epi16(t0,t1,0x33);
2250 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2251 row4 = _mm_xor_si128(row4, row1);
2252 row4 = _mm_shuffle_epi8(row4,r8);
2253 row3 = _mm_add_epi32(row3, row4);
2254 row2 = _mm_xor_si128(row2, row3);
2255 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2257 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2258 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2259 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2261 t0 = _mm_slli_si128(m1, 12);
2262 t1 = _mm_blend_epi16(m0,m3,0x33);
2263 buf1 = _mm_blend_epi16(t1,t0,0xC0);
2265 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2266 row4 = _mm_xor_si128(row4, row1);
2267 row4 = _mm_shuffle_epi8(row4,r16);
2268 row3 = _mm_add_epi32(row3, row4);
2269 row2 = _mm_xor_si128(row2, row3);
2270 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2272 t0 = _mm_blend_epi16(m3,m2,0x30);
2273 t1 = _mm_srli_si128(m1, 4);
2274 t2 = _mm_blend_epi16(t0,t1,0x03);
2275 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
2277 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2278 row4 = _mm_xor_si128(row4, row1);
2279 row4 = _mm_shuffle_epi8(row4,r8);
2280 row3 = _mm_add_epi32(row3, row4);
2281 row2 = _mm_xor_si128(row2, row3);
2282 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2284 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2285 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2286 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2288 t0 = _mm_unpacklo_epi64(m0,m2);
2289 t1 = _mm_srli_si128(m1, 4);
2290 buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
2292 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2293 row4 = _mm_xor_si128(row4, row1);
2294 row4 = _mm_shuffle_epi8(row4,r16);
2295 row3 = _mm_add_epi32(row3, row4);
2296 row2 = _mm_xor_si128(row2, row3);
2297 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2299 t0 = _mm_unpackhi_epi32(m1,m2);
2300 t1 = _mm_unpackhi_epi64(m0,t0);
2301 buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2303 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2304 row4 = _mm_xor_si128(row4, row1);
2305 row4 = _mm_shuffle_epi8(row4,r8);
2306 row3 = _mm_add_epi32(row3, row4);
2307 row2 = _mm_xor_si128(row2, row3);
2308 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2310 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2311 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2312 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2314 t0 = _mm_unpackhi_epi32(m0,m1);
2315 t1 = _mm_blend_epi16(t0,m3,0x0F);
2316 buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
2318 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2319 row4 = _mm_xor_si128(row4, row1);
2320 row4 = _mm_shuffle_epi8(row4,r16);
2321 row3 = _mm_add_epi32(row3, row4);
2322 row2 = _mm_xor_si128(row2, row3);
2323 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2325 t0 = _mm_blend_epi16(m2,m3,0x30);
2326 t1 = _mm_srli_si128(m0,4);
2327 t2 = _mm_blend_epi16(t0,t1,0x03);
2328 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
2330 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2331 row4 = _mm_xor_si128(row4, row1);
2332 row4 = _mm_shuffle_epi8(row4,r8);
2333 row3 = _mm_add_epi32(row3, row4);
2334 row2 = _mm_xor_si128(row2, row3);
2335 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2337 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2338 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2339 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2341 t0 = _mm_unpackhi_epi64(m0,m3);
2342 t1 = _mm_unpacklo_epi64(m1,m2);
2343 t2 = _mm_blend_epi16(t0,t1,0x3C);
2344 buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
2346 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2347 row4 = _mm_xor_si128(row4, row1);
2348 row4 = _mm_shuffle_epi8(row4,r16);
2349 row3 = _mm_add_epi32(row3, row4);
2350 row2 = _mm_xor_si128(row2, row3);
2351 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2353 t0 = _mm_unpacklo_epi32(m0,m1);
2354 t1 = _mm_unpackhi_epi32(m1,m2);
2355 buf4 = _mm_unpacklo_epi64(t0,t1);
2357 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2358 row4 = _mm_xor_si128(row4, row1);
2359 row4 = _mm_shuffle_epi8(row4,r8);
2360 row3 = _mm_add_epi32(row3, row4);
2361 row2 = _mm_xor_si128(row2, row3);
2362 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2364 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2365 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2366 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2368 t0 = _mm_unpackhi_epi32(m1,m3);
2369 t1 = _mm_unpacklo_epi64(t0,m0);
2370 t2 = _mm_blend_epi16(t1,m2,0xC0);
2371 buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
2373 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2374 row4 = _mm_xor_si128(row4, row1);
2375 row4 = _mm_shuffle_epi8(row4,r16);
2376 row3 = _mm_add_epi32(row3, row4);
2377 row2 = _mm_xor_si128(row2, row3);
2378 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2380 t0 = _mm_unpackhi_epi32(m0,m3);
2381 t1 = _mm_blend_epi16(m2,t0,0xF0);
2382 buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
2384 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2385 row4 = _mm_xor_si128(row4, row1);
2386 row4 = _mm_shuffle_epi8(row4,r8);
2387 row3 = _mm_add_epi32(row3, row4);
2388 row2 = _mm_xor_si128(row2, row3);
2389 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2391 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2392 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2393 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2395 t0 = _mm_blend_epi16(m2,m0,0x0C);
2396 t1 = _mm_slli_si128(t0,4);
2397 buf3 = _mm_blend_epi16(t1,m3,0x0F);
2399 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2400 row4 = _mm_xor_si128(row4, row1);
2401 row4 = _mm_shuffle_epi8(row4,r16);
2402 row3 = _mm_add_epi32(row3, row4);
2403 row2 = _mm_xor_si128(row2, row3);
2404 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2406 t0 = _mm_blend_epi16(m1,m0,0x30);
2407 buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
2409 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2410 row4 = _mm_xor_si128(row4, row1);
2411 row4 = _mm_shuffle_epi8(row4,r8);
2412 row3 = _mm_add_epi32(row3, row4);
2413 row2 = _mm_xor_si128(row2, row3);
2414 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2416 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2417 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2418 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2420 t0 = _mm_blend_epi16(m0,m2,0x03);
2421 t1 = _mm_blend_epi16(m1,m2,0x30);
2422 t2 = _mm_blend_epi16(t1,t0,0x0F);
2423 buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
2425 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2426 row4 = _mm_xor_si128(row4, row1);
2427 row4 = _mm_shuffle_epi8(row4,r16);
2428 row3 = _mm_add_epi32(row3, row4);
2429 row2 = _mm_xor_si128(row2, row3);
2430 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2432 t0 = _mm_slli_si128(m0,4);
2433 t1 = _mm_blend_epi16(m1,t0,0xC0);
2434 buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
2436 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2437 row4 = _mm_xor_si128(row4, row1);
2438 row4 = _mm_shuffle_epi8(row4,r8);
2439 row3 = _mm_add_epi32(row3, row4);
2440 row2 = _mm_xor_si128(row2, row3);
2441 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2443 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2444 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2445 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2447 t0 = _mm_unpackhi_epi32(m0,m3);
2448 t1 = _mm_unpacklo_epi32(m2,m3);
2449 t2 = _mm_unpackhi_epi64(t0,t1);
2450 buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
2452 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2453 row4 = _mm_xor_si128(row4, row1);
2454 row4 = _mm_shuffle_epi8(row4,r16);
2455 row3 = _mm_add_epi32(row3, row4);
2456 row2 = _mm_xor_si128(row2, row3);
2457 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2459 t0 = _mm_blend_epi16(m3,m2,0xC0);
2460 t1 = _mm_unpacklo_epi32(m0,m3);
2461 t2 = _mm_blend_epi16(t0,t1,0x0F);
2462 buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
2464 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2465 row4 = _mm_xor_si128(row4, row1);
2466 row4 = _mm_shuffle_epi8(row4,r8);
2467 row3 = _mm_add_epi32(row3, row4);
2468 row2 = _mm_xor_si128(row2, row3);
2469 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2471 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2472 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2473 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2475 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
2476 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
2481 __m128i row1l, row1h;
2482 __m128i row2l, row2h;
2483 __m128i row3l, row3h;
2484 __m128i row4l, row4h;
2485 __m128i b0, b1, t0,
t1;
2487 const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
2488 const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
2490 const __m128i m0 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 00));
2491 const __m128i m1 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 16));
2492 const __m128i m2 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 32));
2493 const __m128i m3 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 48));
2494 const __m128i m4 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 64));
2495 const __m128i m5 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 80));
2496 const __m128i m6 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 96));
2497 const __m128i m7 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 112));
2499 row1l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[0]));
2500 row1h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[2]));
2501 row2l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[4]));
2502 row2h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[6]));
2503 row3l = _mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(0)));
2504 row3h = _mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(2)));
2505 row4l = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(4))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
t[0])));
2506 row4h = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&
BLAKE2B_IV(6))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.
f[0])));
2508 b0 = _mm_unpacklo_epi64(m0, m1);
2509 b1 = _mm_unpacklo_epi64(m2, m3);
2510 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2511 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2512 row4l = _mm_xor_si128(row4l, row1l);
2513 row4h = _mm_xor_si128(row4h, row1h);
2514 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2515 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2516 row3l = _mm_add_epi64(row3l, row4l);
2517 row3h = _mm_add_epi64(row3h, row4h);
2518 row2l = _mm_xor_si128(row2l, row3l);
2519 row2h = _mm_xor_si128(row2h, row3h);
2520 row2l = _mm_shuffle_epi8(row2l, r24);
2521 row2h = _mm_shuffle_epi8(row2h, r24);
2523 b0 = _mm_unpackhi_epi64(m0, m1);
2524 b1 = _mm_unpackhi_epi64(m2, m3);
2526 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2527 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2528 row4l = _mm_xor_si128(row4l, row1l);
2529 row4h = _mm_xor_si128(row4h, row1h);
2530 row4l = _mm_shuffle_epi8(row4l, r16);
2531 row4h = _mm_shuffle_epi8(row4h, r16);
2532 row3l = _mm_add_epi64(row3l, row4l);
2533 row3h = _mm_add_epi64(row3h, row4h);
2534 row2l = _mm_xor_si128(row2l, row3l);
2535 row2h = _mm_xor_si128(row2h, row3h);
2536 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2537 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2539 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2540 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2541 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2542 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2543 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2544 row4l =
t1, row4h = t0;
2546 b0 = _mm_unpacklo_epi64(m4, m5);
2547 b1 = _mm_unpacklo_epi64(m6, m7);
2549 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2550 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2551 row4l = _mm_xor_si128(row4l, row1l);
2552 row4h = _mm_xor_si128(row4h, row1h);
2553 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2554 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2555 row3l = _mm_add_epi64(row3l, row4l);
2556 row3h = _mm_add_epi64(row3h, row4h);
2557 row2l = _mm_xor_si128(row2l, row3l);
2558 row2h = _mm_xor_si128(row2h, row3h);
2559 row2l = _mm_shuffle_epi8(row2l, r24);
2560 row2h = _mm_shuffle_epi8(row2h, r24);
2562 b0 = _mm_unpackhi_epi64(m4, m5);
2563 b1 = _mm_unpackhi_epi64(m6, m7);
2565 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2566 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2567 row4l = _mm_xor_si128(row4l, row1l);
2568 row4h = _mm_xor_si128(row4h, row1h);
2569 row4l = _mm_shuffle_epi8(row4l, r16);
2570 row4h = _mm_shuffle_epi8(row4h, r16);
2571 row3l = _mm_add_epi64(row3l, row4l);
2572 row3h = _mm_add_epi64(row3h, row4h);
2573 row2l = _mm_xor_si128(row2l, row3l);
2574 row2h = _mm_xor_si128(row2h, row3h);
2575 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2576 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2578 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2579 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2580 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2581 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2582 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2583 row4l =
t1, row4h = t0;
2585 b0 = _mm_unpacklo_epi64(m7, m2);
2586 b1 = _mm_unpackhi_epi64(m4, m6);
2588 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2589 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2590 row4l = _mm_xor_si128(row4l, row1l);
2591 row4h = _mm_xor_si128(row4h, row1h);
2592 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2593 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2594 row3l = _mm_add_epi64(row3l, row4l);
2595 row3h = _mm_add_epi64(row3h, row4h);
2596 row2l = _mm_xor_si128(row2l, row3l);
2597 row2h = _mm_xor_si128(row2h, row3h);
2598 row2l = _mm_shuffle_epi8(row2l, r24);
2599 row2h = _mm_shuffle_epi8(row2h, r24);
2601 b0 = _mm_unpacklo_epi64(m5, m4);
2602 b1 = _mm_alignr_epi8(m3, m7, 8);
2604 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2605 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2606 row4l = _mm_xor_si128(row4l, row1l);
2607 row4h = _mm_xor_si128(row4h, row1h);
2608 row4l = _mm_shuffle_epi8(row4l, r16);
2609 row4h = _mm_shuffle_epi8(row4h, r16);
2610 row3l = _mm_add_epi64(row3l, row4l);
2611 row3h = _mm_add_epi64(row3h, row4h);
2612 row2l = _mm_xor_si128(row2l, row3l);
2613 row2h = _mm_xor_si128(row2h, row3h);
2614 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2615 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2617 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2618 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2619 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2620 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2621 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2622 row4l =
t1, row4h = t0;
2624 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
2625 b1 = _mm_unpackhi_epi64(m5, m2);
2627 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2628 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2629 row4l = _mm_xor_si128(row4l, row1l);
2630 row4h = _mm_xor_si128(row4h, row1h);
2631 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2632 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2633 row3l = _mm_add_epi64(row3l, row4l);
2634 row3h = _mm_add_epi64(row3h, row4h);
2635 row2l = _mm_xor_si128(row2l, row3l);
2636 row2h = _mm_xor_si128(row2h, row3h);
2637 row2l = _mm_shuffle_epi8(row2l, r24);
2638 row2h = _mm_shuffle_epi8(row2h, r24);
2640 b0 = _mm_unpacklo_epi64(m6, m1);
2641 b1 = _mm_unpackhi_epi64(m3, m1);
2643 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2644 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2645 row4l = _mm_xor_si128(row4l, row1l);
2646 row4h = _mm_xor_si128(row4h, row1h);
2647 row4l = _mm_shuffle_epi8(row4l, r16);
2648 row4h = _mm_shuffle_epi8(row4h, r16);
2649 row3l = _mm_add_epi64(row3l, row4l);
2650 row3h = _mm_add_epi64(row3h, row4h);
2651 row2l = _mm_xor_si128(row2l, row3l);
2652 row2h = _mm_xor_si128(row2h, row3h);
2653 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2654 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2656 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2657 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2658 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2659 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2660 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2661 row4l =
t1, row4h = t0;
2663 b0 = _mm_alignr_epi8(m6, m5, 8);
2664 b1 = _mm_unpackhi_epi64(m2, m7);
2666 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2667 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2668 row4l = _mm_xor_si128(row4l, row1l);
2669 row4h = _mm_xor_si128(row4h, row1h);
2670 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2671 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2672 row3l = _mm_add_epi64(row3l, row4l);
2673 row3h = _mm_add_epi64(row3h, row4h);
2674 row2l = _mm_xor_si128(row2l, row3l);
2675 row2h = _mm_xor_si128(row2h, row3h);
2676 row2l = _mm_shuffle_epi8(row2l, r24);
2677 row2h = _mm_shuffle_epi8(row2h, r24);
2679 b0 = _mm_unpacklo_epi64(m4, m0);
2680 b1 = _mm_blend_epi16(m1, m6, 0xF0);
2682 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2683 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2684 row4l = _mm_xor_si128(row4l, row1l);
2685 row4h = _mm_xor_si128(row4h, row1h);
2686 row4l = _mm_shuffle_epi8(row4l, r16);
2687 row4h = _mm_shuffle_epi8(row4h, r16);
2688 row3l = _mm_add_epi64(row3l, row4l);
2689 row3h = _mm_add_epi64(row3h, row4h);
2690 row2l = _mm_xor_si128(row2l, row3l);
2691 row2h = _mm_xor_si128(row2h, row3h);
2692 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2693 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2695 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2696 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2697 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2698 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2699 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2700 row4l =
t1, row4h = t0;
2702 b0 = _mm_blend_epi16(m5, m1, 0xF0);
2703 b1 = _mm_unpackhi_epi64(m3, m4);
2705 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2706 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2707 row4l = _mm_xor_si128(row4l, row1l);
2708 row4h = _mm_xor_si128(row4h, row1h);
2709 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2710 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2711 row3l = _mm_add_epi64(row3l, row4l);
2712 row3h = _mm_add_epi64(row3h, row4h);
2713 row2l = _mm_xor_si128(row2l, row3l);
2714 row2h = _mm_xor_si128(row2h, row3h);
2715 row2l = _mm_shuffle_epi8(row2l, r24);
2716 row2h = _mm_shuffle_epi8(row2h, r24);
2718 b0 = _mm_unpacklo_epi64(m7, m3);
2719 b1 = _mm_alignr_epi8(m2, m0, 8);
2721 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2722 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2723 row4l = _mm_xor_si128(row4l, row1l);
2724 row4h = _mm_xor_si128(row4h, row1h);
2725 row4l = _mm_shuffle_epi8(row4l, r16);
2726 row4h = _mm_shuffle_epi8(row4h, r16);
2727 row3l = _mm_add_epi64(row3l, row4l);
2728 row3h = _mm_add_epi64(row3h, row4h);
2729 row2l = _mm_xor_si128(row2l, row3l);
2730 row2h = _mm_xor_si128(row2h, row3h);
2731 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2732 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2734 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2735 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2736 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2737 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2738 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2739 row4l =
t1, row4h = t0;
2741 b0 = _mm_unpackhi_epi64(m3, m1);
2742 b1 = _mm_unpackhi_epi64(m6, m5);
2744 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2745 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2746 row4l = _mm_xor_si128(row4l, row1l);
2747 row4h = _mm_xor_si128(row4h, row1h);
2748 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2749 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2750 row3l = _mm_add_epi64(row3l, row4l);
2751 row3h = _mm_add_epi64(row3h, row4h);
2752 row2l = _mm_xor_si128(row2l, row3l);
2753 row2h = _mm_xor_si128(row2h, row3h);
2754 row2l = _mm_shuffle_epi8(row2l, r24);
2755 row2h = _mm_shuffle_epi8(row2h, r24);
2757 b0 = _mm_unpackhi_epi64(m4, m0);
2758 b1 = _mm_unpacklo_epi64(m6, m7);
2760 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2761 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2762 row4l = _mm_xor_si128(row4l, row1l);
2763 row4h = _mm_xor_si128(row4h, row1h);
2764 row4l = _mm_shuffle_epi8(row4l, r16);
2765 row4h = _mm_shuffle_epi8(row4h, r16);
2766 row3l = _mm_add_epi64(row3l, row4l);
2767 row3h = _mm_add_epi64(row3h, row4h);
2768 row2l = _mm_xor_si128(row2l, row3l);
2769 row2h = _mm_xor_si128(row2h, row3h);
2770 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2771 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2773 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2774 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2775 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2776 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2777 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2778 row4l =
t1, row4h = t0;
2780 b0 = _mm_blend_epi16(m1, m2, 0xF0);
2781 b1 = _mm_blend_epi16(m2, m7, 0xF0);
2783 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2784 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2785 row4l = _mm_xor_si128(row4l, row1l);
2786 row4h = _mm_xor_si128(row4h, row1h);
2787 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2788 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2789 row3l = _mm_add_epi64(row3l, row4l);
2790 row3h = _mm_add_epi64(row3h, row4h);
2791 row2l = _mm_xor_si128(row2l, row3l);
2792 row2h = _mm_xor_si128(row2h, row3h);
2793 row2l = _mm_shuffle_epi8(row2l, r24);
2794 row2h = _mm_shuffle_epi8(row2h, r24);
2796 b0 = _mm_unpacklo_epi64(m3, m5);
2797 b1 = _mm_unpacklo_epi64(m0, m4);
2799 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2800 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2801 row4l = _mm_xor_si128(row4l, row1l);
2802 row4h = _mm_xor_si128(row4h, row1h);
2803 row4l = _mm_shuffle_epi8(row4l, r16);
2804 row4h = _mm_shuffle_epi8(row4h, r16);
2805 row3l = _mm_add_epi64(row3l, row4l);
2806 row3h = _mm_add_epi64(row3h, row4h);
2807 row2l = _mm_xor_si128(row2l, row3l);
2808 row2h = _mm_xor_si128(row2h, row3h);
2809 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2810 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2812 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2813 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2814 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2815 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2816 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2817 row4l =
t1, row4h = t0;
2819 b0 = _mm_unpackhi_epi64(m4, m2);
2820 b1 = _mm_unpacklo_epi64(m1, m5);
2822 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2823 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2824 row4l = _mm_xor_si128(row4l, row1l);
2825 row4h = _mm_xor_si128(row4h, row1h);
2826 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2827 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2828 row3l = _mm_add_epi64(row3l, row4l);
2829 row3h = _mm_add_epi64(row3h, row4h);
2830 row2l = _mm_xor_si128(row2l, row3l);
2831 row2h = _mm_xor_si128(row2h, row3h);
2832 row2l = _mm_shuffle_epi8(row2l, r24);
2833 row2h = _mm_shuffle_epi8(row2h, r24);
2835 b0 = _mm_blend_epi16(m0, m3, 0xF0);
2836 b1 = _mm_blend_epi16(m2, m7, 0xF0);
2838 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2839 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2840 row4l = _mm_xor_si128(row4l, row1l);
2841 row4h = _mm_xor_si128(row4h, row1h);
2842 row4l = _mm_shuffle_epi8(row4l, r16);
2843 row4h = _mm_shuffle_epi8(row4h, r16);
2844 row3l = _mm_add_epi64(row3l, row4l);
2845 row3h = _mm_add_epi64(row3h, row4h);
2846 row2l = _mm_xor_si128(row2l, row3l);
2847 row2h = _mm_xor_si128(row2h, row3h);
2848 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2849 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2851 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2852 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2853 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2854 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2855 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2856 row4l =
t1, row4h = t0;
2858 b0 = _mm_blend_epi16(m7, m5, 0xF0);
2859 b1 = _mm_blend_epi16(m3, m1, 0xF0);
2861 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2862 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2863 row4l = _mm_xor_si128(row4l, row1l);
2864 row4h = _mm_xor_si128(row4h, row1h);
2865 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2866 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2867 row3l = _mm_add_epi64(row3l, row4l);
2868 row3h = _mm_add_epi64(row3h, row4h);
2869 row2l = _mm_xor_si128(row2l, row3l);
2870 row2h = _mm_xor_si128(row2h, row3h);
2871 row2l = _mm_shuffle_epi8(row2l, r24);
2872 row2h = _mm_shuffle_epi8(row2h, r24);
2874 b0 = _mm_alignr_epi8(m6, m0, 8);
2875 b1 = _mm_blend_epi16(m4, m6, 0xF0);
2877 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2878 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2879 row4l = _mm_xor_si128(row4l, row1l);
2880 row4h = _mm_xor_si128(row4h, row1h);
2881 row4l = _mm_shuffle_epi8(row4l, r16);
2882 row4h = _mm_shuffle_epi8(row4h, r16);
2883 row3l = _mm_add_epi64(row3l, row4l);
2884 row3h = _mm_add_epi64(row3h, row4h);
2885 row2l = _mm_xor_si128(row2l, row3l);
2886 row2h = _mm_xor_si128(row2h, row3h);
2887 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2888 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2890 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2891 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2892 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2893 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2894 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2895 row4l =
t1, row4h = t0;
2897 b0 = _mm_unpacklo_epi64(m1, m3);
2898 b1 = _mm_unpacklo_epi64(m0, m4);
2900 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2901 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2902 row4l = _mm_xor_si128(row4l, row1l);
2903 row4h = _mm_xor_si128(row4h, row1h);
2904 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2905 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2906 row3l = _mm_add_epi64(row3l, row4l);
2907 row3h = _mm_add_epi64(row3h, row4h);
2908 row2l = _mm_xor_si128(row2l, row3l);
2909 row2h = _mm_xor_si128(row2h, row3h);
2910 row2l = _mm_shuffle_epi8(row2l, r24);
2911 row2h = _mm_shuffle_epi8(row2h, r24);
2913 b0 = _mm_unpacklo_epi64(m6, m5);
2914 b1 = _mm_unpackhi_epi64(m5, m1);
2916 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2917 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2918 row4l = _mm_xor_si128(row4l, row1l);
2919 row4h = _mm_xor_si128(row4h, row1h);
2920 row4l = _mm_shuffle_epi8(row4l, r16);
2921 row4h = _mm_shuffle_epi8(row4h, r16);
2922 row3l = _mm_add_epi64(row3l, row4l);
2923 row3h = _mm_add_epi64(row3h, row4h);
2924 row2l = _mm_xor_si128(row2l, row3l);
2925 row2h = _mm_xor_si128(row2h, row3h);
2926 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2927 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2929 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2930 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2931 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2932 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2933 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2934 row4l =
t1, row4h = t0;
2936 b0 = _mm_blend_epi16(m2, m3, 0xF0);
2937 b1 = _mm_unpackhi_epi64(m7, m0);
2939 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2940 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2941 row4l = _mm_xor_si128(row4l, row1l);
2942 row4h = _mm_xor_si128(row4h, row1h);
2943 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2944 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2945 row3l = _mm_add_epi64(row3l, row4l);
2946 row3h = _mm_add_epi64(row3h, row4h);
2947 row2l = _mm_xor_si128(row2l, row3l);
2948 row2h = _mm_xor_si128(row2h, row3h);
2949 row2l = _mm_shuffle_epi8(row2l, r24);
2950 row2h = _mm_shuffle_epi8(row2h, r24);
2952 b0 = _mm_unpackhi_epi64(m6, m2);
2953 b1 = _mm_blend_epi16(m7, m4, 0xF0);
2955 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2956 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2957 row4l = _mm_xor_si128(row4l, row1l);
2958 row4h = _mm_xor_si128(row4h, row1h);
2959 row4l = _mm_shuffle_epi8(row4l, r16);
2960 row4h = _mm_shuffle_epi8(row4h, r16);
2961 row3l = _mm_add_epi64(row3l, row4l);
2962 row3h = _mm_add_epi64(row3h, row4h);
2963 row2l = _mm_xor_si128(row2l, row3l);
2964 row2h = _mm_xor_si128(row2h, row3h);
2965 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2966 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2968 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2969 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2970 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
2971 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2972 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2973 row4l =
t1, row4h = t0;
2975 b0 = _mm_blend_epi16(m6, m0, 0xF0);
2976 b1 = _mm_unpacklo_epi64(m7, m2);
2978 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2979 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2980 row4l = _mm_xor_si128(row4l, row1l);
2981 row4h = _mm_xor_si128(row4h, row1h);
2982 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2983 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2984 row3l = _mm_add_epi64(row3l, row4l);
2985 row3h = _mm_add_epi64(row3h, row4h);
2986 row2l = _mm_xor_si128(row2l, row3l);
2987 row2h = _mm_xor_si128(row2h, row3h);
2988 row2l = _mm_shuffle_epi8(row2l, r24);
2989 row2h = _mm_shuffle_epi8(row2h, r24);
2991 b0 = _mm_unpackhi_epi64(m2, m7);
2992 b1 = _mm_alignr_epi8(m5, m6, 8);
2994 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2995 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2996 row4l = _mm_xor_si128(row4l, row1l);
2997 row4h = _mm_xor_si128(row4h, row1h);
2998 row4l = _mm_shuffle_epi8(row4l, r16);
2999 row4h = _mm_shuffle_epi8(row4h, r16);
3000 row3l = _mm_add_epi64(row3l, row4l);
3001 row3h = _mm_add_epi64(row3h, row4h);
3002 row2l = _mm_xor_si128(row2l, row3l);
3003 row2h = _mm_xor_si128(row2h, row3h);
3004 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3005 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3007 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3008 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3009 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3010 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3011 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3012 row4l =
t1, row4h = t0;
3014 b0 = _mm_unpacklo_epi64(m0, m3);
3015 b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
3017 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3018 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3019 row4l = _mm_xor_si128(row4l, row1l);
3020 row4h = _mm_xor_si128(row4h, row1h);
3021 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3022 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3023 row3l = _mm_add_epi64(row3l, row4l);
3024 row3h = _mm_add_epi64(row3h, row4h);
3025 row2l = _mm_xor_si128(row2l, row3l);
3026 row2h = _mm_xor_si128(row2h, row3h);
3027 row2l = _mm_shuffle_epi8(row2l, r24);
3028 row2h = _mm_shuffle_epi8(row2h, r24);
3030 b0 = _mm_unpackhi_epi64(m3, m1);
3031 b1 = _mm_blend_epi16(m1, m5, 0xF0);
3033 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3034 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3035 row4l = _mm_xor_si128(row4l, row1l);
3036 row4h = _mm_xor_si128(row4h, row1h);
3037 row4l = _mm_shuffle_epi8(row4l, r16);
3038 row4h = _mm_shuffle_epi8(row4h, r16);
3039 row3l = _mm_add_epi64(row3l, row4l);
3040 row3h = _mm_add_epi64(row3h, row4h);
3041 row2l = _mm_xor_si128(row2l, row3l);
3042 row2h = _mm_xor_si128(row2h, row3h);
3043 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3044 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3046 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3047 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3048 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3049 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3050 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3051 row4l =
t1, row4h = t0;
3053 b0 = _mm_unpackhi_epi64(m6, m3);
3054 b1 = _mm_blend_epi16(m6, m1, 0xF0);
3056 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3057 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3058 row4l = _mm_xor_si128(row4l, row1l);
3059 row4h = _mm_xor_si128(row4h, row1h);
3060 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3061 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3062 row3l = _mm_add_epi64(row3l, row4l);
3063 row3h = _mm_add_epi64(row3h, row4h);
3064 row2l = _mm_xor_si128(row2l, row3l);
3065 row2h = _mm_xor_si128(row2h, row3h);
3066 row2l = _mm_shuffle_epi8(row2l, r24);
3067 row2h = _mm_shuffle_epi8(row2h, r24);
3069 b0 = _mm_alignr_epi8(m7, m5, 8);
3070 b1 = _mm_unpackhi_epi64(m0, m4);
3072 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3073 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3074 row4l = _mm_xor_si128(row4l, row1l);
3075 row4h = _mm_xor_si128(row4h, row1h);
3076 row4l = _mm_shuffle_epi8(row4l, r16);
3077 row4h = _mm_shuffle_epi8(row4h, r16);
3078 row3l = _mm_add_epi64(row3l, row4l);
3079 row3h = _mm_add_epi64(row3h, row4h);
3080 row2l = _mm_xor_si128(row2l, row3l);
3081 row2h = _mm_xor_si128(row2h, row3h);
3082 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3083 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3085 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3086 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3087 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3088 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3089 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3090 row4l =
t1, row4h = t0;
3092 b0 = _mm_unpackhi_epi64(m2, m7);
3093 b1 = _mm_unpacklo_epi64(m4, m1);
3095 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3096 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3097 row4l = _mm_xor_si128(row4l, row1l);
3098 row4h = _mm_xor_si128(row4h, row1h);
3099 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3100 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3101 row3l = _mm_add_epi64(row3l, row4l);
3102 row3h = _mm_add_epi64(row3h, row4h);
3103 row2l = _mm_xor_si128(row2l, row3l);
3104 row2h = _mm_xor_si128(row2h, row3h);
3105 row2l = _mm_shuffle_epi8(row2l, r24);
3106 row2h = _mm_shuffle_epi8(row2h, r24);
3108 b0 = _mm_unpacklo_epi64(m0, m2);
3109 b1 = _mm_unpacklo_epi64(m3, m5);
3111 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3112 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3113 row4l = _mm_xor_si128(row4l, row1l);
3114 row4h = _mm_xor_si128(row4h, row1h);
3115 row4l = _mm_shuffle_epi8(row4l, r16);
3116 row4h = _mm_shuffle_epi8(row4h, r16);
3117 row3l = _mm_add_epi64(row3l, row4l);
3118 row3h = _mm_add_epi64(row3h, row4h);
3119 row2l = _mm_xor_si128(row2l, row3l);
3120 row2h = _mm_xor_si128(row2h, row3h);
3121 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3122 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3124 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3125 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3126 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3127 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3128 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3129 row4l =
t1, row4h = t0;
3131 b0 = _mm_unpacklo_epi64(m3, m7);
3132 b1 = _mm_alignr_epi8(m0, m5, 8);
3134 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3135 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3136 row4l = _mm_xor_si128(row4l, row1l);
3137 row4h = _mm_xor_si128(row4h, row1h);
3138 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3139 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3140 row3l = _mm_add_epi64(row3l, row4l);
3141 row3h = _mm_add_epi64(row3h, row4h);
3142 row2l = _mm_xor_si128(row2l, row3l);
3143 row2h = _mm_xor_si128(row2h, row3h);
3144 row2l = _mm_shuffle_epi8(row2l, r24);
3145 row2h = _mm_shuffle_epi8(row2h, r24);
3147 b0 = _mm_unpackhi_epi64(m7, m4);
3148 b1 = _mm_alignr_epi8(m4, m1, 8);
3150 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3151 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3152 row4l = _mm_xor_si128(row4l, row1l);
3153 row4h = _mm_xor_si128(row4h, row1h);
3154 row4l = _mm_shuffle_epi8(row4l, r16);
3155 row4h = _mm_shuffle_epi8(row4h, r16);
3156 row3l = _mm_add_epi64(row3l, row4l);
3157 row3h = _mm_add_epi64(row3h, row4h);
3158 row2l = _mm_xor_si128(row2l, row3l);
3159 row2h = _mm_xor_si128(row2h, row3h);
3160 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3161 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3163 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3164 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3165 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3166 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3167 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3168 row4l =
t1, row4h = t0;
3171 b1 = _mm_alignr_epi8(m5, m0, 8);
3173 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3174 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3175 row4l = _mm_xor_si128(row4l, row1l);
3176 row4h = _mm_xor_si128(row4h, row1h);
3177 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3178 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3179 row3l = _mm_add_epi64(row3l, row4l);
3180 row3h = _mm_add_epi64(row3h, row4h);
3181 row2l = _mm_xor_si128(row2l, row3l);
3182 row2h = _mm_xor_si128(row2h, row3h);
3183 row2l = _mm_shuffle_epi8(row2l, r24);
3184 row2h = _mm_shuffle_epi8(row2h, r24);
3186 b0 = _mm_blend_epi16(m1, m3, 0xF0);
3189 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3190 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3191 row4l = _mm_xor_si128(row4l, row1l);
3192 row4h = _mm_xor_si128(row4h, row1h);
3193 row4l = _mm_shuffle_epi8(row4l, r16);
3194 row4h = _mm_shuffle_epi8(row4h, r16);
3195 row3l = _mm_add_epi64(row3l, row4l);
3196 row3h = _mm_add_epi64(row3h, row4h);
3197 row2l = _mm_xor_si128(row2l, row3l);
3198 row2h = _mm_xor_si128(row2h, row3h);
3199 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3200 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3202 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3203 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3204 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3205 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3206 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3207 row4l =
t1, row4h = t0;
3209 b0 = _mm_unpacklo_epi64(m5, m4);
3210 b1 = _mm_unpackhi_epi64(m3, m0);
3212 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3213 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3214 row4l = _mm_xor_si128(row4l, row1l);
3215 row4h = _mm_xor_si128(row4h, row1h);
3216 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3217 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3218 row3l = _mm_add_epi64(row3l, row4l);
3219 row3h = _mm_add_epi64(row3h, row4h);
3220 row2l = _mm_xor_si128(row2l, row3l);
3221 row2h = _mm_xor_si128(row2h, row3h);
3222 row2l = _mm_shuffle_epi8(row2l, r24);
3223 row2h = _mm_shuffle_epi8(row2h, r24);
3225 b0 = _mm_unpacklo_epi64(m1, m2);
3226 b1 = _mm_blend_epi16(m3, m2, 0xF0);
3228 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3229 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3230 row4l = _mm_xor_si128(row4l, row1l);
3231 row4h = _mm_xor_si128(row4h, row1h);
3232 row4l = _mm_shuffle_epi8(row4l, r16);
3233 row4h = _mm_shuffle_epi8(row4h, r16);
3234 row3l = _mm_add_epi64(row3l, row4l);
3235 row3h = _mm_add_epi64(row3h, row4h);
3236 row2l = _mm_xor_si128(row2l, row3l);
3237 row2h = _mm_xor_si128(row2h, row3h);
3238 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3239 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3241 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3242 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3243 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3244 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3245 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3246 row4l =
t1, row4h = t0;
3248 b0 = _mm_unpackhi_epi64(m7, m4);
3249 b1 = _mm_unpackhi_epi64(m1, m6);
3251 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3252 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3253 row4l = _mm_xor_si128(row4l, row1l);
3254 row4h = _mm_xor_si128(row4h, row1h);
3255 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3256 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3257 row3l = _mm_add_epi64(row3l, row4l);
3258 row3h = _mm_add_epi64(row3h, row4h);
3259 row2l = _mm_xor_si128(row2l, row3l);
3260 row2h = _mm_xor_si128(row2h, row3h);
3261 row2l = _mm_shuffle_epi8(row2l, r24);
3262 row2h = _mm_shuffle_epi8(row2h, r24);
3264 b0 = _mm_alignr_epi8(m7, m5, 8);
3265 b1 = _mm_unpacklo_epi64(m6, m0);
3267 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3268 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3269 row4l = _mm_xor_si128(row4l, row1l);
3270 row4h = _mm_xor_si128(row4h, row1h);
3271 row4l = _mm_shuffle_epi8(row4l, r16);
3272 row4h = _mm_shuffle_epi8(row4h, r16);
3273 row3l = _mm_add_epi64(row3l, row4l);
3274 row3h = _mm_add_epi64(row3h, row4h);
3275 row2l = _mm_xor_si128(row2l, row3l);
3276 row2h = _mm_xor_si128(row2h, row3h);
3277 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3278 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3280 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3281 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3282 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3283 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3284 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3285 row4l =
t1, row4h = t0;
3287 b0 = _mm_unpacklo_epi64(m0, m1);
3288 b1 = _mm_unpacklo_epi64(m2, m3);
3290 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3291 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3292 row4l = _mm_xor_si128(row4l, row1l);
3293 row4h = _mm_xor_si128(row4h, row1h);
3294 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3295 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3296 row3l = _mm_add_epi64(row3l, row4l);
3297 row3h = _mm_add_epi64(row3h, row4h);
3298 row2l = _mm_xor_si128(row2l, row3l);
3299 row2h = _mm_xor_si128(row2h, row3h);
3300 row2l = _mm_shuffle_epi8(row2l, r24);
3301 row2h = _mm_shuffle_epi8(row2h, r24);
3303 b0 = _mm_unpackhi_epi64(m0, m1);
3304 b1 = _mm_unpackhi_epi64(m2, m3);
3306 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3307 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3308 row4l = _mm_xor_si128(row4l, row1l);
3309 row4h = _mm_xor_si128(row4h, row1h);
3310 row4l = _mm_shuffle_epi8(row4l, r16);
3311 row4h = _mm_shuffle_epi8(row4h, r16);
3312 row3l = _mm_add_epi64(row3l, row4l);
3313 row3h = _mm_add_epi64(row3h, row4h);
3314 row2l = _mm_xor_si128(row2l, row3l);
3315 row2h = _mm_xor_si128(row2h, row3h);
3316 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3317 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3319 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3320 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3321 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3322 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3323 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3324 row4l =
t1, row4h = t0;
3326 b0 = _mm_unpacklo_epi64(m4, m5);
3327 b1 = _mm_unpacklo_epi64(m6, m7);
3329 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3330 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3331 row4l = _mm_xor_si128(row4l, row1l);
3332 row4h = _mm_xor_si128(row4h, row1h);
3333 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3334 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3335 row3l = _mm_add_epi64(row3l, row4l);
3336 row3h = _mm_add_epi64(row3h, row4h);
3337 row2l = _mm_xor_si128(row2l, row3l);
3338 row2h = _mm_xor_si128(row2h, row3h);
3339 row2l = _mm_shuffle_epi8(row2l, r24);
3340 row2h = _mm_shuffle_epi8(row2h, r24);
3342 b0 = _mm_unpackhi_epi64(m4, m5);
3343 b1 = _mm_unpackhi_epi64(m6, m7);
3345 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3346 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3347 row4l = _mm_xor_si128(row4l, row1l);
3348 row4h = _mm_xor_si128(row4h, row1h);
3349 row4l = _mm_shuffle_epi8(row4l, r16);
3350 row4h = _mm_shuffle_epi8(row4h, r16);
3351 row3l = _mm_add_epi64(row3l, row4l);
3352 row3h = _mm_add_epi64(row3h, row4h);
3353 row2l = _mm_xor_si128(row2l, row3l);
3354 row2h = _mm_xor_si128(row2h, row3h);
3355 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3356 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3358 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3359 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3360 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3361 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3362 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3363 row4l =
t1, row4h = t0;
3365 b0 = _mm_unpacklo_epi64(m7, m2);
3366 b1 = _mm_unpackhi_epi64(m4, m6);
3368 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3369 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3370 row4l = _mm_xor_si128(row4l, row1l);
3371 row4h = _mm_xor_si128(row4h, row1h);
3372 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3373 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3374 row3l = _mm_add_epi64(row3l, row4l);
3375 row3h = _mm_add_epi64(row3h, row4h);
3376 row2l = _mm_xor_si128(row2l, row3l);
3377 row2h = _mm_xor_si128(row2h, row3h);
3378 row2l = _mm_shuffle_epi8(row2l, r24);
3379 row2h = _mm_shuffle_epi8(row2h, r24);
3381 b0 = _mm_unpacklo_epi64(m5, m4);
3382 b1 = _mm_alignr_epi8(m3, m7, 8);
3384 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3385 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3386 row4l = _mm_xor_si128(row4l, row1l);
3387 row4h = _mm_xor_si128(row4h, row1h);
3388 row4l = _mm_shuffle_epi8(row4l, r16);
3389 row4h = _mm_shuffle_epi8(row4h, r16);
3390 row3l = _mm_add_epi64(row3l, row4l);
3391 row3h = _mm_add_epi64(row3h, row4h);
3392 row2l = _mm_xor_si128(row2l, row3l);
3393 row2h = _mm_xor_si128(row2h, row3h);
3394 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3395 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3397 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3398 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3399 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3400 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3401 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3402 row4l =
t1, row4h = t0;
3404 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
3405 b1 = _mm_unpackhi_epi64(m5, m2);
3407 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3408 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3409 row4l = _mm_xor_si128(row4l, row1l);
3410 row4h = _mm_xor_si128(row4h, row1h);
3411 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3412 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3413 row3l = _mm_add_epi64(row3l, row4l);
3414 row3h = _mm_add_epi64(row3h, row4h);
3415 row2l = _mm_xor_si128(row2l, row3l);
3416 row2h = _mm_xor_si128(row2h, row3h);
3417 row2l = _mm_shuffle_epi8(row2l, r24);
3418 row2h = _mm_shuffle_epi8(row2h, r24);
3420 b0 = _mm_unpacklo_epi64(m6, m1);
3421 b1 = _mm_unpackhi_epi64(m3, m1);
3423 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3424 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3425 row4l = _mm_xor_si128(row4l, row1l);
3426 row4h = _mm_xor_si128(row4h, row1h);
3427 row4l = _mm_shuffle_epi8(row4l, r16);
3428 row4h = _mm_shuffle_epi8(row4h, r16);
3429 row3l = _mm_add_epi64(row3l, row4l);
3430 row3h = _mm_add_epi64(row3h, row4h);
3431 row2l = _mm_xor_si128(row2l, row3l);
3432 row2h = _mm_xor_si128(row2h, row3h);
3433 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3434 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3436 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3437 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3438 row2l = t0, row2h =
t1, t0 = row3l, row3l = row3h, row3h = t0;
3439 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3440 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3441 row4l =
t1, row4h = t0;
3443 row1l = _mm_xor_si128(row3l, row1l);
3444 row1h = _mm_xor_si128(row3h, row1h);
3445 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[0]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[0])), row1l));
3446 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[2]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[2])), row1h));
3448 row2l = _mm_xor_si128(row4l, row2l);
3449 row2h = _mm_xor_si128(row4h, row2h);
3450 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[4]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[4])), row2l));
3451 _mm_storeu_si128((__m128i *)(
void*)(&state.
h[6]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.
h[6])), row2h));
// End of the SSE4 BLAKE2b compression path; the NEON (ARM) path begins here.
// vld1q_u32_rev(x, a,b,c,d) assembles the vector {d[0], c[0], b[0], a[0]}
// into x. NOTE(review): it does this by WRITING lanes 1..3 of the caller's
// `d` array before loading from it — the macro mutates its last argument.
// In the compress rounds below the message arrays are only ever read at
// element [0] when passed in the a/b/c positions, so the clobber appears
// benign, but confirm before reusing this macro elsewhere.
3453 #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 3455 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 3458 #define vld1q_u32_rev(x, a,b,c,d) d[1]=c[0],d[2]=b[0],d[3]=a[0]; x = vld1q_u32(d); 3463 static const int LANE_H64 = 1;
// Lane indices for vgetq_lane_u64/vsetq_lane_u64 in the 64-bit NEON path:
// lane 1 holds the high 64-bit element, lane 0 the low element.
3464 static const int LANE_L64 = 0;
3474 CRYPTOPP_ALIGN_DATA(16) uint32_t m8[4], m9[4], m10[4], m11[4], m12[4], m13[4], m14[4], m15[4];
3477 get(m0[0])(m1[0])(m2[0])(m3[0])(m4[0])(m5[0])(m6[0])(m7[0])(m8[0])(m9[0])(m10[0])(m11[0])(m12[0])(m13[0])(m14[0])(m15[0]);
3479 uint32x4_t row1,row2,row3,row4;
3480 uint32x4_t buf1,buf2,buf3,buf4;
3483 row1 = ff0 = vld1q_u32((
const uint32_t*)&state.
h[0]);
3484 row2 = ff1 = vld1q_u32((
const uint32_t*)&state.
h[4]);
3485 row3 = vld1q_u32((
const uint32_t*)&
BLAKE2S_IV(0));
3486 row4 = veorq_u32(vld1q_u32((
const uint32_t*)&
BLAKE2S_IV(4)), vld1q_u32((
const uint32_t*)&state.
t[0]));
3489 vld1q_u32_rev(buf1, m6,m4,m2,m0);
3491 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3492 row4 = veorq_u32(row4,row1);
3493 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3494 row3 = vaddq_u32(row3,row4);
3495 row2 = veorq_u32(row2,row3);
3496 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3499 vld1q_u32_rev(buf2, m7,m5,m3,m1);
3501 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3502 row4 = veorq_u32(row4,row1);
3503 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3504 row3 = vaddq_u32(row3,row4);
3505 row2 = veorq_u32(row2,row3);
3506 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3508 row4 = vextq_u32(row4,row4,3);
3509 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3510 row2 = vextq_u32(row2,row2,1);
3513 vld1q_u32_rev(buf3, m14,m12,m10,m8);
3515 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3516 row4 = veorq_u32(row4,row1);
3517 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3518 row3 = vaddq_u32(row3,row4);
3519 row2 = veorq_u32(row2,row3);
3520 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3523 vld1q_u32_rev(buf4, m15,m13,m11,m9);
3525 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3526 row4 = veorq_u32(row4,row1);
3527 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3528 row3 = vaddq_u32(row3,row4);
3529 row2 = veorq_u32(row2,row3);
3530 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3532 row4 = vextq_u32(row4,row4,1);
3533 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3534 row2 = vextq_u32(row2,row2,3);
3537 vld1q_u32_rev(buf1, m13,m9,m4,m14);
3539 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3540 row4 = veorq_u32(row4,row1);
3541 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3542 row3 = vaddq_u32(row3,row4);
3543 row2 = veorq_u32(row2,row3);
3544 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3547 vld1q_u32_rev(buf2, m6,m15,m8,m10);
3549 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3550 row4 = veorq_u32(row4,row1);
3551 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3552 row3 = vaddq_u32(row3,row4);
3553 row2 = veorq_u32(row2,row3);
3554 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3556 row4 = vextq_u32(row4,row4,3);
3557 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3558 row2 = vextq_u32(row2,row2,1);
3561 vld1q_u32_rev(buf3, m5,m11,m0,m1);
3563 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3564 row4 = veorq_u32(row4,row1);
3565 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3566 row3 = vaddq_u32(row3,row4);
3567 row2 = veorq_u32(row2,row3);
3568 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3571 vld1q_u32_rev(buf4, m3,m7,m2,m12);
3573 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3574 row4 = veorq_u32(row4,row1);
3575 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3576 row3 = vaddq_u32(row3,row4);
3577 row2 = veorq_u32(row2,row3);
3578 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3580 row4 = vextq_u32(row4,row4,1);
3581 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3582 row2 = vextq_u32(row2,row2,3);
3585 vld1q_u32_rev(buf1, m15,m5,m12,m11);
3587 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3588 row4 = veorq_u32(row4,row1);
3589 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3590 row3 = vaddq_u32(row3,row4);
3591 row2 = veorq_u32(row2,row3);
3592 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3595 vld1q_u32_rev(buf2, m13,m2,m0,m8);
3597 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3598 row4 = veorq_u32(row4,row1);
3599 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3600 row3 = vaddq_u32(row3,row4);
3601 row2 = veorq_u32(row2,row3);
3602 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3604 row4 = vextq_u32(row4,row4,3);
3605 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3606 row2 = vextq_u32(row2,row2,1);
3609 vld1q_u32_rev(buf3, m9,m7,m3,m10);
3611 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3612 row4 = veorq_u32(row4,row1);
3613 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3614 row3 = vaddq_u32(row3,row4);
3615 row2 = veorq_u32(row2,row3);
3616 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3619 vld1q_u32_rev(buf4, m4,m1,m6,m14);
3621 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3622 row4 = veorq_u32(row4,row1);
3623 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3624 row3 = vaddq_u32(row3,row4);
3625 row2 = veorq_u32(row2,row3);
3626 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3628 row4 = vextq_u32(row4,row4,1);
3629 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3630 row2 = vextq_u32(row2,row2,3);
3633 vld1q_u32_rev(buf1, m11,m13,m3,m7);
3635 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3636 row4 = veorq_u32(row4,row1);
3637 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3638 row3 = vaddq_u32(row3,row4);
3639 row2 = veorq_u32(row2,row3);
3640 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3643 vld1q_u32_rev(buf2, m14,m12,m1,m9);
3645 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3646 row4 = veorq_u32(row4,row1);
3647 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3648 row3 = vaddq_u32(row3,row4);
3649 row2 = veorq_u32(row2,row3);
3650 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3652 row4 = vextq_u32(row4,row4,3);
3653 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3654 row2 = vextq_u32(row2,row2,1);
3657 vld1q_u32_rev(buf3, m15,m4,m5,m2);
3659 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3660 row4 = veorq_u32(row4,row1);
3661 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3662 row3 = vaddq_u32(row3,row4);
3663 row2 = veorq_u32(row2,row3);
3664 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3667 vld1q_u32_rev(buf4, m8,m0,m10,m6);
3669 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3670 row4 = veorq_u32(row4,row1);
3671 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3672 row3 = vaddq_u32(row3,row4);
3673 row2 = veorq_u32(row2,row3);
3674 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3676 row4 = vextq_u32(row4,row4,1);
3677 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3678 row2 = vextq_u32(row2,row2,3);
3681 vld1q_u32_rev(buf1, m10,m2,m5,m9);
3683 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3684 row4 = veorq_u32(row4,row1);
3685 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3686 row3 = vaddq_u32(row3,row4);
3687 row2 = veorq_u32(row2,row3);
3688 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3691 vld1q_u32_rev(buf2, m15,m4,m7,m0);
3693 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3694 row4 = veorq_u32(row4,row1);
3695 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3696 row3 = vaddq_u32(row3,row4);
3697 row2 = veorq_u32(row2,row3);
3698 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3700 row4 = vextq_u32(row4,row4,3);
3701 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3702 row2 = vextq_u32(row2,row2,1);
3705 vld1q_u32_rev(buf3, m3,m6,m11,m14);
3707 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3708 row4 = veorq_u32(row4,row1);
3709 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3710 row3 = vaddq_u32(row3,row4);
3711 row2 = veorq_u32(row2,row3);
3712 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3715 vld1q_u32_rev(buf4, m13,m8,m12,m1);
3717 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3718 row4 = veorq_u32(row4,row1);
3719 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3720 row3 = vaddq_u32(row3,row4);
3721 row2 = veorq_u32(row2,row3);
3722 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3724 row4 = vextq_u32(row4,row4,1);
3725 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3726 row2 = vextq_u32(row2,row2,3);
3729 vld1q_u32_rev(buf1, m8,m0,m6,m2);
3731 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3732 row4 = veorq_u32(row4,row1);
3733 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3734 row3 = vaddq_u32(row3,row4);
3735 row2 = veorq_u32(row2,row3);
3736 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3739 vld1q_u32_rev(buf2, m3,m11,m10,m12);
3741 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3742 row4 = veorq_u32(row4,row1);
3743 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3744 row3 = vaddq_u32(row3,row4);
3745 row2 = veorq_u32(row2,row3);
3746 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3748 row4 = vextq_u32(row4,row4,3);
3749 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3750 row2 = vextq_u32(row2,row2,1);
3753 vld1q_u32_rev(buf3, m1,m15,m7,m4);
3755 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3756 row4 = veorq_u32(row4,row1);
3757 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3758 row3 = vaddq_u32(row3,row4);
3759 row2 = veorq_u32(row2,row3);
3760 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3763 vld1q_u32_rev(buf4, m9,m14,m5,m13);
3765 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3766 row4 = veorq_u32(row4,row1);
3767 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3768 row3 = vaddq_u32(row3,row4);
3769 row2 = veorq_u32(row2,row3);
3770 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3772 row4 = vextq_u32(row4,row4,1);
3773 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3774 row2 = vextq_u32(row2,row2,3);
3777 vld1q_u32_rev(buf1, m4,m14,m1,m12);
3779 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3780 row4 = veorq_u32(row4,row1);
3781 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3782 row3 = vaddq_u32(row3,row4);
3783 row2 = veorq_u32(row2,row3);
3784 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3787 vld1q_u32_rev(buf2, m10,m13,m15,m5);
3789 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3790 row4 = veorq_u32(row4,row1);
3791 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3792 row3 = vaddq_u32(row3,row4);
3793 row2 = veorq_u32(row2,row3);
3794 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3796 row4 = vextq_u32(row4,row4,3);
3797 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3798 row2 = vextq_u32(row2,row2,1);
3801 vld1q_u32_rev(buf3, m8,m9,m6,m0);
3803 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3804 row4 = veorq_u32(row4,row1);
3805 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3806 row3 = vaddq_u32(row3,row4);
3807 row2 = veorq_u32(row2,row3);
3808 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3811 vld1q_u32_rev(buf4, m11,m2,m3,m7);
3813 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3814 row4 = veorq_u32(row4,row1);
3815 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3816 row3 = vaddq_u32(row3,row4);
3817 row2 = veorq_u32(row2,row3);
3818 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3820 row4 = vextq_u32(row4,row4,1);
3821 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3822 row2 = vextq_u32(row2,row2,3);
3825 vld1q_u32_rev(buf1, m3,m12,m7,m13);
3827 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3828 row4 = veorq_u32(row4,row1);
3829 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3830 row3 = vaddq_u32(row3,row4);
3831 row2 = veorq_u32(row2,row3);
3832 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3835 vld1q_u32_rev(buf2, m9,m1,m14,m11);
3837 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3838 row4 = veorq_u32(row4,row1);
3839 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3840 row3 = vaddq_u32(row3,row4);
3841 row2 = veorq_u32(row2,row3);
3842 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3844 row4 = vextq_u32(row4,row4,3);
3845 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3846 row2 = vextq_u32(row2,row2,1);
3849 vld1q_u32_rev(buf3, m2,m8,m15,m5);
3851 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3852 row4 = veorq_u32(row4,row1);
3853 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3854 row3 = vaddq_u32(row3,row4);
3855 row2 = veorq_u32(row2,row3);
3856 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3859 vld1q_u32_rev(buf4, m10,m6,m4,m0);
3861 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3862 row4 = veorq_u32(row4,row1);
3863 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3864 row3 = vaddq_u32(row3,row4);
3865 row2 = veorq_u32(row2,row3);
3866 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3868 row4 = vextq_u32(row4,row4,1);
3869 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3870 row2 = vextq_u32(row2,row2,3);
3873 vld1q_u32_rev(buf1, m0,m11,m14,m6);
3875 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3876 row4 = veorq_u32(row4,row1);
3877 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3878 row3 = vaddq_u32(row3,row4);
3879 row2 = veorq_u32(row2,row3);
3880 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3883 vld1q_u32_rev(buf2, m8,m3,m9,m15);
3885 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3886 row4 = veorq_u32(row4,row1);
3887 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3888 row3 = vaddq_u32(row3,row4);
3889 row2 = veorq_u32(row2,row3);
3890 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3892 row4 = vextq_u32(row4,row4,3);
3893 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3894 row2 = vextq_u32(row2,row2,1);
3897 vld1q_u32_rev(buf3, m10,m1,m13,m12);
3899 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3900 row4 = veorq_u32(row4,row1);
3901 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3902 row3 = vaddq_u32(row3,row4);
3903 row2 = veorq_u32(row2,row3);
3904 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3907 vld1q_u32_rev(buf4, m5,m4,m7,m2);
3909 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3910 row4 = veorq_u32(row4,row1);
3911 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3912 row3 = vaddq_u32(row3,row4);
3913 row2 = veorq_u32(row2,row3);
3914 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3916 row4 = vextq_u32(row4,row4,1);
3917 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3918 row2 = vextq_u32(row2,row2,3);
3921 vld1q_u32_rev(buf1, m1,m7,m8,m10);
3923 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3924 row4 = veorq_u32(row4,row1);
3925 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3926 row3 = vaddq_u32(row3,row4);
3927 row2 = veorq_u32(row2,row3);
3928 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3931 vld1q_u32_rev(buf2, m5,m6,m4,m2);
3933 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3934 row4 = veorq_u32(row4,row1);
3935 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3936 row3 = vaddq_u32(row3,row4);
3937 row2 = veorq_u32(row2,row3);
3938 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3940 row4 = vextq_u32(row4,row4,3);
3941 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3942 row2 = vextq_u32(row2,row2,1);
3945 vld1q_u32_rev(buf3, m13,m3,m9,m15);
3947 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3948 row4 = veorq_u32(row4,row1);
3949 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3950 row3 = vaddq_u32(row3,row4);
3951 row2 = veorq_u32(row2,row3);
3952 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3955 vld1q_u32_rev(buf4, m0,m12,m14,m11);
3957 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3958 row4 = veorq_u32(row4,row1);
3959 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3960 row3 = vaddq_u32(row3,row4);
3961 row2 = veorq_u32(row2,row3);
3962 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3964 row4 = vextq_u32(row4,row4,1);
3965 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3966 row2 = vextq_u32(row2,row2,3);
3968 vst1q_u32((uint32_t*)&state.
h[0],veorq_u32(ff0,veorq_u32(row1,row3)));
3969 vst1q_u32((uint32_t*)&state.
h[4],veorq_u32(ff1,veorq_u32(row2,row4)));
3979 uint64x2_t m0m1,m2m3,m4m5,m6m7,m8m9,m10m11,m12m13,m14m15;
3981 m0m1 = vreinterpretq_u64_u8(vld1q_u8(input+ 0));
3982 m2m3 = vreinterpretq_u64_u8(vld1q_u8(input+ 16));
3983 m4m5 = vreinterpretq_u64_u8(vld1q_u8(input+ 32));
3984 m6m7 = vreinterpretq_u64_u8(vld1q_u8(input+ 48));
3985 m8m9 = vreinterpretq_u64_u8(vld1q_u8(input+ 64));
3986 m10m11 = vreinterpretq_u64_u8(vld1q_u8(input+ 80));
3987 m12m13 = vreinterpretq_u64_u8(vld1q_u8(input+ 96));
3988 m14m15 = vreinterpretq_u64_u8(vld1q_u8(input+112));
3990 uint64x2_t row1l, row1h, row2l, row2h;
3991 uint64x2_t row3l, row3h, row4l, row4h;
3992 uint64x2_t b0 = {0,0}, b1 = {0,0}, t0,
t1;
3994 row1l = vld1q_u64((
const uint64_t *)&state.
h[0]);
3995 row1h = vld1q_u64((
const uint64_t *)&state.
h[2]);
3996 row2l = vld1q_u64((
const uint64_t *)&state.
h[4]);
3997 row2h = vld1q_u64((
const uint64_t *)&state.
h[6]);
3998 row3l = vld1q_u64((
const uint64_t *)&
BLAKE2B_IV(0));
3999 row3h = vld1q_u64((
const uint64_t *)&
BLAKE2B_IV(2));
4000 row4l = veorq_u64(vld1q_u64((
const uint64_t *)&
BLAKE2B_IV(4)), vld1q_u64((
const uint64_t*)&state.
t[0]));
4001 row4h = veorq_u64(vld1q_u64((
const uint64_t *)&
BLAKE2B_IV(6)), vld1q_u64((
const uint64_t*)&state.
f[0]));
4003 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4004 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4005 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4006 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4007 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4008 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4009 row4l = veorq_u64(row4l, row1l);
4010 row4h = veorq_u64(row4h, row1h);
4011 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4012 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4013 row3l = vaddq_u64(row3l, row4l);
4014 row3h = vaddq_u64(row3h, row4h);
4015 row2l = veorq_u64(row2l, row3l);
4016 row2h = veorq_u64(row2h, row3h);
4017 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4018 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4020 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4021 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4022 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4023 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4024 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4025 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4026 row4l = veorq_u64(row4l, row1l);
4027 row4h = veorq_u64(row4h, row1h);
4028 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4029 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4030 row3l = vaddq_u64(row3l, row4l);
4031 row3h = vaddq_u64(row3h, row4h);
4032 row2l = veorq_u64(row2l, row3l);
4033 row2h = veorq_u64(row2h, row3h);
4034 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4035 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4037 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4038 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4039 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4040 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4041 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4042 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4043 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4044 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4045 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4047 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4048 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4049 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4050 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4051 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4052 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4053 row4l = veorq_u64(row4l, row1l);
4054 row4h = veorq_u64(row4h, row1h);
4055 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4056 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4057 row3l = vaddq_u64(row3l, row4l);
4058 row3h = vaddq_u64(row3h, row4h);
4059 row2l = veorq_u64(row2l, row3l);
4060 row2h = veorq_u64(row2h, row3h);
4061 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4062 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4064 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4065 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4066 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4067 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4068 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4069 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4070 row4l = veorq_u64(row4l, row1l);
4071 row4h = veorq_u64(row4h, row1h);
4072 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4073 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4074 row3l = vaddq_u64(row3l, row4l);
4075 row3h = vaddq_u64(row3h, row4h);
4076 row2l = veorq_u64(row2l, row3l);
4077 row2h = veorq_u64(row2h, row3h);
4078 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4079 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4081 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4082 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4083 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4084 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4085 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4086 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4087 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4088 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4089 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4091 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4092 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4093 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4094 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4095 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4096 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4097 row4l = veorq_u64(row4l, row1l);
4098 row4h = veorq_u64(row4h, row1h);
4099 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4100 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4101 row3l = vaddq_u64(row3l, row4l);
4102 row3h = vaddq_u64(row3h, row4h);
4103 row2l = veorq_u64(row2l, row3l);
4104 row2h = veorq_u64(row2h, row3h);
4105 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4106 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4108 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4109 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4110 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4111 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4112 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4113 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4114 row4l = veorq_u64(row4l, row1l);
4115 row4h = veorq_u64(row4h, row1h);
4116 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4117 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4118 row3l = vaddq_u64(row3l, row4l);
4119 row3h = vaddq_u64(row3h, row4h);
4120 row2l = veorq_u64(row2l, row3l);
4121 row2h = veorq_u64(row2h, row3h);
4122 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4123 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4125 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4126 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4127 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4128 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4129 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4130 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4131 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4132 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4133 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4135 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4136 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4137 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4138 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4139 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4140 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4141 row4l = veorq_u64(row4l, row1l);
4142 row4h = veorq_u64(row4h, row1h);
4143 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4144 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4145 row3l = vaddq_u64(row3l, row4l);
4146 row3h = vaddq_u64(row3h, row4h);
4147 row2l = veorq_u64(row2l, row3l);
4148 row2h = veorq_u64(row2h, row3h);
4149 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4150 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4152 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4153 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4154 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4155 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4156 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4157 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4158 row4l = veorq_u64(row4l, row1l);
4159 row4h = veorq_u64(row4h, row1h);
4160 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4161 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4162 row3l = vaddq_u64(row3l, row4l);
4163 row3h = vaddq_u64(row3h, row4h);
4164 row2l = veorq_u64(row2l, row3l);
4165 row2h = veorq_u64(row2h, row3h);
4166 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4167 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4169 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4170 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4171 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4172 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4173 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4174 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4175 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4176 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4177 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4179 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4180 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4181 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4182 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4183 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4184 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4185 row4l = veorq_u64(row4l, row1l);
4186 row4h = veorq_u64(row4h, row1h);
4187 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4188 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4189 row3l = vaddq_u64(row3l, row4l);
4190 row3h = vaddq_u64(row3h, row4h);
4191 row2l = veorq_u64(row2l, row3l);
4192 row2h = veorq_u64(row2h, row3h);
4193 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4194 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4196 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4197 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4198 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4199 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4200 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4201 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4202 row4l = veorq_u64(row4l, row1l);
4203 row4h = veorq_u64(row4h, row1h);
4204 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4205 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4206 row3l = vaddq_u64(row3l, row4l);
4207 row3h = vaddq_u64(row3h, row4h);
4208 row2l = veorq_u64(row2l, row3l);
4209 row2h = veorq_u64(row2h, row3h);
4210 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4211 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4213 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4214 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4215 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4216 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4217 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4218 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4219 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4220 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4221 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4223 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4224 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4225 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4226 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4227 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4228 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4229 row4l = veorq_u64(row4l, row1l);
4230 row4h = veorq_u64(row4h, row1h);
4231 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4232 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4233 row3l = vaddq_u64(row3l, row4l);
4234 row3h = vaddq_u64(row3h, row4h);
4235 row2l = veorq_u64(row2l, row3l);
4236 row2h = veorq_u64(row2h, row3h);
4237 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4238 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4240 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4241 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4242 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4243 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4244 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4245 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4246 row4l = veorq_u64(row4l, row1l);
4247 row4h = veorq_u64(row4h, row1h);
4248 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4249 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4250 row3l = vaddq_u64(row3l, row4l);
4251 row3h = vaddq_u64(row3h, row4h);
4252 row2l = veorq_u64(row2l, row3l);
4253 row2h = veorq_u64(row2h, row3h);
4254 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4255 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4257 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4258 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4259 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4260 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4261 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4262 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4263 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4264 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4265 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4267 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4268 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4269 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4270 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4271 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4272 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4273 row4l = veorq_u64(row4l, row1l);
4274 row4h = veorq_u64(row4h, row1h);
4275 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4276 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4277 row3l = vaddq_u64(row3l, row4l);
4278 row3h = vaddq_u64(row3h, row4h);
4279 row2l = veorq_u64(row2l, row3l);
4280 row2h = veorq_u64(row2h, row3h);
4281 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4282 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4284 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4285 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4286 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4287 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4288 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4289 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4290 row4l = veorq_u64(row4l, row1l);
4291 row4h = veorq_u64(row4h, row1h);
4292 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4293 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4294 row3l = vaddq_u64(row3l, row4l);
4295 row3h = vaddq_u64(row3h, row4h);
4296 row2l = veorq_u64(row2l, row3l);
4297 row2h = veorq_u64(row2h, row3h);
4298 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4299 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4301 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4302 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4303 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4304 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4305 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4306 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4307 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4308 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4309 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4311 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4312 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4313 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4314 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4315 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4316 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4317 row4l = veorq_u64(row4l, row1l);
4318 row4h = veorq_u64(row4h, row1h);
4319 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4320 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4321 row3l = vaddq_u64(row3l, row4l);
4322 row3h = vaddq_u64(row3h, row4h);
4323 row2l = veorq_u64(row2l, row3l);
4324 row2h = veorq_u64(row2h, row3h);
4325 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4326 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4328 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4329 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4330 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4331 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4332 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4333 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4334 row4l = veorq_u64(row4l, row1l);
4335 row4h = veorq_u64(row4h, row1h);
4336 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4337 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4338 row3l = vaddq_u64(row3l, row4l);
4339 row3h = vaddq_u64(row3h, row4h);
4340 row2l = veorq_u64(row2l, row3l);
4341 row2h = veorq_u64(row2h, row3h);
4342 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4343 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4345 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4346 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4347 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4348 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4349 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4350 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4351 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4352 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4353 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4355 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4356 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4357 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4358 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4359 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4360 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4361 row4l = veorq_u64(row4l, row1l);
4362 row4h = veorq_u64(row4h, row1h);
4363 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4364 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4365 row3l = vaddq_u64(row3l, row4l);
4366 row3h = vaddq_u64(row3h, row4h);
4367 row2l = veorq_u64(row2l, row3l);
4368 row2h = veorq_u64(row2h, row3h);
4369 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4370 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4372 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4373 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4374 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4375 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4376 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4377 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4378 row4l = veorq_u64(row4l, row1l);
4379 row4h = veorq_u64(row4h, row1h);
4380 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4381 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4382 row3l = vaddq_u64(row3l, row4l);
4383 row3h = vaddq_u64(row3h, row4h);
4384 row2l = veorq_u64(row2l, row3l);
4385 row2h = veorq_u64(row2h, row3h);
4386 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4387 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4389 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4390 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4391 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4392 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4393 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4394 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4395 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4396 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4397 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4399 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4400 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4401 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4402 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4403 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4404 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4405 row4l = veorq_u64(row4l, row1l);
4406 row4h = veorq_u64(row4h, row1h);
4407 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4408 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4409 row3l = vaddq_u64(row3l, row4l);
4410 row3h = vaddq_u64(row3h, row4h);
4411 row2l = veorq_u64(row2l, row3l);
4412 row2h = veorq_u64(row2h, row3h);
4413 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4414 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4416 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4417 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4418 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4419 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4420 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4421 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4422 row4l = veorq_u64(row4l, row1l);
4423 row4h = veorq_u64(row4h, row1h);
4424 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4425 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4426 row3l = vaddq_u64(row3l, row4l);
4427 row3h = vaddq_u64(row3h, row4h);
4428 row2l = veorq_u64(row2l, row3l);
4429 row2h = veorq_u64(row2h, row3h);
4430 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4431 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4433 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4434 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4435 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4436 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4437 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4438 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4439 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4440 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4441 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4443 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4444 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4445 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4446 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4447 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4448 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4449 row4l = veorq_u64(row4l, row1l);
4450 row4h = veorq_u64(row4h, row1h);
4451 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4452 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4453 row3l = vaddq_u64(row3l, row4l);
4454 row3h = vaddq_u64(row3h, row4h);
4455 row2l = veorq_u64(row2l, row3l);
4456 row2h = veorq_u64(row2h, row3h);
4457 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4458 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4460 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4461 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4462 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4463 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4464 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4465 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4466 row4l = veorq_u64(row4l, row1l);
4467 row4h = veorq_u64(row4h, row1h);
4468 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4469 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4470 row3l = vaddq_u64(row3l, row4l);
4471 row3h = vaddq_u64(row3h, row4h);
4472 row2l = veorq_u64(row2l, row3l);
4473 row2h = veorq_u64(row2h, row3h);
4474 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4475 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4477 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4478 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4479 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4480 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4481 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4482 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4483 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4484 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4485 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4487 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_L64);
4488 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4489 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4490 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4491 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4492 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4493 row4l = veorq_u64(row4l, row1l);
4494 row4h = veorq_u64(row4h, row1h);
4495 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4496 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4497 row3l = vaddq_u64(row3l, row4l);
4498 row3h = vaddq_u64(row3h, row4h);
4499 row2l = veorq_u64(row2l, row3l);
4500 row2h = veorq_u64(row2h, row3h);
4501 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4502 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4504 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4505 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4506 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4507 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4508 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4509 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4510 row4l = veorq_u64(row4l, row1l);
4511 row4h = veorq_u64(row4h, row1h);
4512 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4513 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4514 row3l = vaddq_u64(row3l, row4l);
4515 row3h = vaddq_u64(row3h, row4h);
4516 row2l = veorq_u64(row2l, row3l);
4517 row2h = veorq_u64(row2h, row3h);
4518 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4519 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4521 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4522 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4523 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4524 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4525 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4526 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4527 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4528 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4529 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4531 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4532 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4533 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4534 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4535 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4536 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4537 row4l = veorq_u64(row4l, row1l);
4538 row4h = veorq_u64(row4h, row1h);
4539 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4540 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4541 row3l = vaddq_u64(row3l, row4l);
4542 row3h = vaddq_u64(row3h, row4h);
4543 row2l = veorq_u64(row2l, row3l);
4544 row2h = veorq_u64(row2h, row3h);
4545 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4546 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4548 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4549 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4550 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4551 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4552 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4553 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4554 row4l = veorq_u64(row4l, row1l);
4555 row4h = veorq_u64(row4h, row1h);
4556 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4557 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4558 row3l = vaddq_u64(row3l, row4l);
4559 row3h = vaddq_u64(row3h, row4h);
4560 row2l = veorq_u64(row2l, row3l);
4561 row2h = veorq_u64(row2h, row3h);
4562 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4563 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4565 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4566 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4567 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4568 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4569 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4570 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4571 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4572 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4573 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4575 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4576 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4577 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4578 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4579 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4580 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4581 row4l = veorq_u64(row4l, row1l);
4582 row4h = veorq_u64(row4h, row1h);
4583 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4584 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4585 row3l = vaddq_u64(row3l, row4l);
4586 row3h = vaddq_u64(row3h, row4h);
4587 row2l = veorq_u64(row2l, row3l);
4588 row2h = veorq_u64(row2h, row3h);
4589 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4590 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4592 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4593 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4594 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4595 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4596 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4597 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4598 row4l = veorq_u64(row4l, row1l);
4599 row4h = veorq_u64(row4h, row1h);
4600 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4601 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4602 row3l = vaddq_u64(row3l, row4l);
4603 row3h = vaddq_u64(row3h, row4h);
4604 row2l = veorq_u64(row2l, row3l);
4605 row2h = veorq_u64(row2h, row3h);
4606 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4607 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4609 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4610 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4611 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4612 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4613 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4614 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4615 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4616 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4617 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4619 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4620 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4621 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4622 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4623 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4624 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4625 row4l = veorq_u64(row4l, row1l);
4626 row4h = veorq_u64(row4h, row1h);
4627 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4628 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4629 row3l = vaddq_u64(row3l, row4l);
4630 row3h = vaddq_u64(row3h, row4h);
4631 row2l = veorq_u64(row2l, row3l);
4632 row2h = veorq_u64(row2h, row3h);
4633 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4634 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4636 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4637 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4638 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4639 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4640 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4641 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4642 row4l = veorq_u64(row4l, row1l);
4643 row4h = veorq_u64(row4h, row1h);
4644 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4645 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4646 row3l = vaddq_u64(row3l, row4l);
4647 row3h = vaddq_u64(row3h, row4h);
4648 row2l = veorq_u64(row2l, row3l);
4649 row2h = veorq_u64(row2h, row3h);
4650 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4651 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4653 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4654 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4655 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4656 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4657 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4658 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4659 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4660 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4661 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4663 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4664 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4665 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4666 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_H64);
4667 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4668 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4669 row4l = veorq_u64(row4l, row1l);
4670 row4h = veorq_u64(row4h, row1h);
4671 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4672 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4673 row3l = vaddq_u64(row3l, row4l);
4674 row3h = vaddq_u64(row3h, row4h);
4675 row2l = veorq_u64(row2l, row3l);
4676 row2h = veorq_u64(row2h, row3h);
4677 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4678 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4680 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4681 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4682 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4683 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4684 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4685 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4686 row4l = veorq_u64(row4l, row1l);
4687 row4h = veorq_u64(row4h, row1h);
4688 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4689 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4690 row3l = vaddq_u64(row3l, row4l);
4691 row3h = vaddq_u64(row3h, row4h);
4692 row2l = veorq_u64(row2l, row3l);
4693 row2h = veorq_u64(row2h, row3h);
4694 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4695 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4697 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4698 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4699 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4700 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4701 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4702 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4703 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4704 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4705 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4707 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4708 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4709 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4710 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4711 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4712 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4713 row4l = veorq_u64(row4l, row1l);
4714 row4h = veorq_u64(row4h, row1h);
4715 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4716 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4717 row3l = vaddq_u64(row3l, row4l);
4718 row3h = vaddq_u64(row3h, row4h);
4719 row2l = veorq_u64(row2l, row3l);
4720 row2h = veorq_u64(row2h, row3h);
4721 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4722 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4724 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4725 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4726 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4727 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4728 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4729 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4730 row4l = veorq_u64(row4l, row1l);
4731 row4h = veorq_u64(row4h, row1h);
4732 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4733 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4734 row3l = vaddq_u64(row3l, row4l);
4735 row3h = vaddq_u64(row3h, row4h);
4736 row2l = veorq_u64(row2l, row3l);
4737 row2h = veorq_u64(row2h, row3h);
4738 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4739 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4741 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4742 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4743 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4744 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4745 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4746 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4747 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4748 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4749 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4751 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4752 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_H64);
4753 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4754 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4755 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4756 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4757 row4l = veorq_u64(row4l, row1l);
4758 row4h = veorq_u64(row4h, row1h);
4759 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4760 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4761 row3l = vaddq_u64(row3l, row4l);
4762 row3h = vaddq_u64(row3h, row4h);
4763 row2l = veorq_u64(row2l, row3l);
4764 row2h = veorq_u64(row2h, row3h);
4765 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4766 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4768 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4769 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4770 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4771 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4772 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4773 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4774 row4l = veorq_u64(row4l, row1l);
4775 row4h = veorq_u64(row4h, row1h);
4776 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4777 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4778 row3l = vaddq_u64(row3l, row4l);
4779 row3h = vaddq_u64(row3h, row4h);
4780 row2l = veorq_u64(row2l, row3l);
4781 row2h = veorq_u64(row2h, row3h);
4782 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4783 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4785 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4786 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4787 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4788 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4789 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4790 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4791 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4792 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4793 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4795 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4796 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4797 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4798 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4799 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4800 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4801 row4l = veorq_u64(row4l, row1l);
4802 row4h = veorq_u64(row4h, row1h);
4803 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4804 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4805 row3l = vaddq_u64(row3l, row4l);
4806 row3h = vaddq_u64(row3h, row4h);
4807 row2l = veorq_u64(row2l, row3l);
4808 row2h = veorq_u64(row2h, row3h);
4809 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4810 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4812 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4813 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4814 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4815 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4816 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4817 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4818 row4l = veorq_u64(row4l, row1l);
4819 row4h = veorq_u64(row4h, row1h);
4820 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4821 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4822 row3l = vaddq_u64(row3l, row4l);
4823 row3h = vaddq_u64(row3h, row4h);
4824 row2l = veorq_u64(row2l, row3l);
4825 row2h = veorq_u64(row2h, row3h);
4826 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4827 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4829 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4830 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4831 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4832 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4833 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4834 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4835 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4836 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4837 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4839 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4840 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4841 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4842 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4843 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4844 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4845 row4l = veorq_u64(row4l, row1l);
4846 row4h = veorq_u64(row4h, row1h);
4847 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4848 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4849 row3l = vaddq_u64(row3l, row4l);
4850 row3h = vaddq_u64(row3h, row4h);
4851 row2l = veorq_u64(row2l, row3l);
4852 row2h = veorq_u64(row2h, row3h);
4853 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4854 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4856 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4857 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4858 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4859 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4860 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4861 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4862 row4l = veorq_u64(row4l, row1l);
4863 row4h = veorq_u64(row4h, row1h);
4864 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4865 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4866 row3l = vaddq_u64(row3l, row4l);
4867 row3h = vaddq_u64(row3h, row4h);
4868 row2l = veorq_u64(row2l, row3l);
4869 row2h = veorq_u64(row2h, row3h);
4870 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4871 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4873 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4874 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4875 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4876 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4877 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4878 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4879 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4880 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4881 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4883 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4884 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4885 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4886 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4887 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4888 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4889 row4l = veorq_u64(row4l, row1l);
4890 row4h = veorq_u64(row4h, row1h);
4891 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4892 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4893 row3l = vaddq_u64(row3l, row4l);
4894 row3h = vaddq_u64(row3h, row4h);
4895 row2l = veorq_u64(row2l, row3l);
4896 row2h = veorq_u64(row2h, row3h);
4897 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4898 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4900 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4901 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4902 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4903 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4904 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4905 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4906 row4l = veorq_u64(row4l, row1l);
4907 row4h = veorq_u64(row4h, row1h);
4908 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4909 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4910 row3l = vaddq_u64(row3l, row4l);
4911 row3h = vaddq_u64(row3h, row4h);
4912 row2l = veorq_u64(row2l, row3l);
4913 row2h = veorq_u64(row2h, row3h);
4914 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4915 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4917 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4918 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4919 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4920 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4921 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4922 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4923 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4924 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4925 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4927 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4928 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4929 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4930 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4931 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4932 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4933 row4l = veorq_u64(row4l, row1l);
4934 row4h = veorq_u64(row4h, row1h);
4935 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4936 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4937 row3l = vaddq_u64(row3l, row4l);
4938 row3h = vaddq_u64(row3h, row4h);
4939 row2l = veorq_u64(row2l, row3l);
4940 row2h = veorq_u64(row2h, row3h);
4941 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4942 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4944 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4945 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4946 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4947 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4948 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4949 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4950 row4l = veorq_u64(row4l, row1l);
4951 row4h = veorq_u64(row4h, row1h);
4952 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4953 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4954 row3l = vaddq_u64(row3l, row4l);
4955 row3h = vaddq_u64(row3h, row4h);
4956 row2l = veorq_u64(row2l, row3l);
4957 row2h = veorq_u64(row2h, row3h);
4958 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4959 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4961 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4962 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4963 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4964 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4965 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4966 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4967 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4968 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4969 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4971 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4972 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4973 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4974 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4975 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4976 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4977 row4l = veorq_u64(row4l, row1l);
4978 row4h = veorq_u64(row4h, row1h);
4979 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4980 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4981 row3l = vaddq_u64(row3l, row4l);
4982 row3h = vaddq_u64(row3h, row4h);
4983 row2l = veorq_u64(row2l, row3l);
4984 row2h = veorq_u64(row2h, row3h);
4985 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4986 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4988 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4989 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4990 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4991 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4992 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4993 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4994 row4l = veorq_u64(row4l, row1l);
4995 row4h = veorq_u64(row4h, row1h);
4996 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4997 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4998 row3l = vaddq_u64(row3l, row4l);
4999 row3h = vaddq_u64(row3h, row4h);
5000 row2l = veorq_u64(row2l, row3l);
5001 row2h = veorq_u64(row2h, row3h);
5002 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5003 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5005 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
5006 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
5007 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
5008 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
5009 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
5010 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
5011 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
5012 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
5013 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
5015 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
5016 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
5017 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
5018 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
5019 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5020 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5021 row4l = veorq_u64(row4l, row1l);
5022 row4h = veorq_u64(row4h, row1h);
5023 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
5024 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
5025 row3l = vaddq_u64(row3l, row4l);
5026 row3h = vaddq_u64(row3h, row4h);
5027 row2l = veorq_u64(row2l, row3l);
5028 row2h = veorq_u64(row2h, row3h);
5029 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
5030 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
5032 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
5033 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
5034 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
5035 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
5036 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5037 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5038 row4l = veorq_u64(row4l, row1l);
5039 row4h = veorq_u64(row4h, row1h);
5040 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
5041 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
5042 row3l = vaddq_u64(row3l, row4l);
5043 row3h = vaddq_u64(row3h, row4h);
5044 row2l = veorq_u64(row2l, row3l);
5045 row2h = veorq_u64(row2h, row3h);
5046 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5047 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5049 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
5050 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
5051 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
5052 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
5053 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
5054 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
5055 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
5056 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
5057 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
5059 row1l = veorq_u64(row3l, row1l);
5060 row1h = veorq_u64(row3h, row1h);
5061 vst1q_u64((uint64_t*)&state.
h[0], veorq_u64(vld1q_u64((
const uint64_t*)&state.
h[0]), row1l));
5062 vst1q_u64((uint64_t*)&state.
h[2], veorq_u64(vld1q_u64((
const uint64_t*)&state.
h[2]), row1h));
5064 row2l = veorq_u64(row4l, row2l);
5065 row2h = veorq_u64(row4h, row2h);
5066 vst1q_u64((uint64_t*)&state.
h[4], veorq_u64(vld1q_u64((
const uint64_t*)&state.
h[4]), row2l));
5067 vst1q_u64((uint64_t*)&state.
h[6], veorq_u64(vld1q_u64((
const uint64_t*)&state.
h[6]), row2h));
5069 #endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
Used to pass byte array input as part of a NameValuePairs object.
pfnCompress64 InitializeCompress64Fn()
void Restart()
Restart the hash.
Standard names for retrieving values by name when working with NameValuePairs.
Classes for working with NameValuePairs.
void(* pfnCompress32)(const byte *, BLAKE2_State< word32, false > &)
#define NAMESPACE_BEGIN(x)
void TruncatedFinal(byte *hash, size_t size)
Computes the hash of the current message.
Abstract base classes that provide a uniform interface to this library.
void memcpy_s(void *dest, size_t sizeInBytes, const void *src, size_t count)
Bounds checking replacement for memcpy()
size_type size() const
Provides the count of elements in the SecBlock.
AlignedParameterBlock m_block
void IncrementCounter(size_t count=BLOCKSIZE)
size_t size() const
Length of the memory block.
Library configuration file.
AlignedSecByteBlock m_key
void Compress(const byte *input)
const byte * begin() const
Pointer to the first byte in the memory block.
bool IsAlignedOn(const void *ptr, unsigned int alignment)
Determines whether ptr is aligned to a minimum value.
#define CRYPTOPP_ALIGN_DATA(x)
AlgorithmParameters MakeParameters(const char *name, const T &value, bool throwIfNotUsed=true)
Create an object that implements NameValuePairs.
pfnCompress32 InitializeCompress32Fn()
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianness.
A::pointer data()
Provides a pointer to the first element in the memory block.
unsigned long long word64
#define CRYPTOPP_CONSTANT(x)
#define CRYPTOPP_ASSERT(exp)
const NameValuePairs & g_nullNameValuePairs
An empty set of name-value pairs.
BLAKE2_ParameterBlock< T_64bit > ParameterBlock
Functions for CPU features and intrinsics.
#define CRYPTOPP_NO_VTABLE
BLAKE2 state information.
void(* pfnCompress64)(const byte *, BLAKE2_State< word64, true > &)
uint8_t const size_t const size
void * memcpy(void *a, const void *b, size_t c)
Access a block of memory.
Access a block of memory.
static ENUM_TYPE ToEnum()
void UncheckedSetKey(const byte *key, unsigned int length, const CryptoPP::NameValuePairs ¶ms)
void Update(const byte *input, size_t length)
Updates a hash with additional input.