#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1

#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST const

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
# if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
using namespace rdtable;
#else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
#endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS

static volatile bool s_TeFilled = false, s_TdFilled = false;
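// The QUARTER_ROUND* macros below implement one column of a table-driven AES
// round as four byte-indexed table lookups.  TL_F and TL_M control how a table
// word is fetched (an unaligned word load into the packed table, or an indexed
// load combined with rotrFixed), chosen according to IS_LITTLE_ENDIAN and
// whether unaligned data accesses are permitted.  The fN(x) macros further down
// multiply x by N in GF(2^8) modulo the AES polynomial 0x11b and are used when
// building the tables.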
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = Sd[byte(t)]; t >>= 8;\
    tempBlock[b] = Sd[byte(t)]; t >>= 8;\
    tempBlock[c] = Sd[byte(t)]; t >>= 8;\
    tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d)  QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d)  QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif

#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))

for (int i=0; i<256; i++)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (int j=0; j<4; j++)
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    Te[256] = Te[257] = 0;
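// The decryption table Td is filled by an analogous per-byte loop (FillDecTable
// in the original source), using the inverse S-box Sd and the InvMixColumns
// multipliers f9/fb/fd/fe defined above.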
for (int i=0; i<256; i++)
#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
    for (int j=0; j<4; j++)
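// UncheckedSetKey: expand the user key into the round-key schedule.  The round
// count follows the AES rule Nr = Nk + 6, and the schedule holds
// 4*(m_rounds+1) 32-bit words.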
AssertValidKeyLength(keylen);

m_rounds = keylen/4 + 6;
m_key.New(4*(m_rounds+1));
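// When both AES-NI and SSE4.1 are available the key schedule is computed with
// AESKEYGENASSIST; the HasSSE4() check is needed because _mm_extract_epi32 and
// _mm_insert_epi32 are SSE4.1 instructions.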
#if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
if (HasAESNI() && HasSSE4())
    static const word32 rcLE[] = {
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,

    const word32 *ro = rcLE, *rc = rcLE;

    __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
    memcpy(rk, userKey, keylen);

    rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
    rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
    rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
    rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

    if (rk + keylen/4 + 4 == m_key.end())
    rk[10] = rk[ 4] ^ rk[ 9];
    rk[11] = rk[ 5] ^ rk[10];
    temp = _mm_insert_epi32(temp, rk[11], 3);

    else if (keylen == 32)

    temp = _mm_insert_epi32(temp, rk[11], 3);
    rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
    rk[13] = rk[ 5] ^ rk[12];
    rk[14] = rk[ 6] ^ rk[13];
    rk[15] = rk[ 7] ^ rk[14];
    temp = _mm_insert_epi32(temp, rk[15], 3);

    temp = _mm_insert_epi32(temp, rk[7], 3);
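// For the decryption direction the expanded keys are used in reverse order and,
// except for the first and last, must be run through InvMixColumns; with AES-NI
// this is done by swapping the outer keys and applying _mm_aesimc_si128 to the
// inner ones.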
if (!IsForwardTransformation())

#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
#else
    std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
#endif

    for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
        temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
        *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
        *(__m128i *)(void *)(rk+j) = temp;

    *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
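// Portable fallback key schedule (no AES-NI).  Here x is the SubWord/RotWord
// transform of the previous schedule word and rc walks the round-constant table.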
temp = rk[keylen/4-1];

rk[keylen/4] = rk[0] ^ x ^ *(rc++);
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

if (rk + keylen/4 + 4 == m_key.end())

    rk[10] = rk[ 4] ^ rk[ 9];
    rk[11] = rk[ 5] ^ rk[10];

else if (keylen == 32)

    rk[13] = rk[ 5] ^ rk[12];
    rk[14] = rk[ 6] ^ rk[13];
    rk[15] = rk[ 7] ^ rk[14];
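// For encryption the schedule is used as generated; for decryption the inner
// round keys are additionally passed through InverseMixColumn (defined below) so
// the same table-lookup round structure implements the equivalent inverse cipher.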
if (IsForwardTransformation())

#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
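// Rijndael::Enc::ProcessAndXorBlock.  Before any key-dependent indexing the code
// below touches one word from every cache line of the Te table, accumulating the
// reads into u and folding u into the state so the loads cannot be optimized
// away; this pre-loads the whole table and serves as a cache-timing-attack
// countermeasure.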
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
for (i=0; i<2048; i+=cacheLineSize)
#else
for (i=0; i<1024; i+=cacheLineSize)
#endif
    u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
unsigned int r = m_rounds/2 - 1;

s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

byte *const tempBlock = (byte *)tbw;

Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
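// Rijndael::Dec::ProcessAndXorBlock mirrors the encryption path: the Td table is
// pre-loaded the same way and, when the unaligned-access tables are not used,
// the inverse S-box Sd is pre-loaded as well before the final round.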
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
for (i=0; i<2048; i+=cacheLineSize)
#else
for (i=0; i<1024; i+=cacheLineSize)
#endif
    u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
s0 |= u; s1 |= u; s2 |= u; s3 |= u;

unsigned int r = m_rounds/2 - 1;

s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
#if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
for (i=0; i<256; i+=cacheLineSize)
    u &= *(const word32 *)(const void *)(Sd+i);
u &= *(const word32 *)(const void *)(Sd+252);
t0 |= u; t1 |= u; t2 |= u; t3 |= u;

byte *const tempBlock = (byte *)tbw;

Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
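// What follows is the SSE2 inline-assembly / MASM bulk routine
// Rijndael_Enc_AdvancedProcessBlocks.  The L_* macros define offsets into a
// stack-allocated working area (round keys copied to the stack, last-round
// output, block pointers and increments); the layout differs between x86/x32
// and x64 builds.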
#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32

#define L_INDEX(i) (L_REG+768+i)
#define L_INXORBLOCKS L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS L_INBLOCKS+12
#define L_INCREMENTS L_INDEX(16*15)
#define L_SP L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*16+4)
#define L_KEYS_BEGIN L_INDEX(16*16+8)

#define MXOR(a,b,c) \
    AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define L_INDEX(i) (L_REG+i)
#define L_INXORBLOCKS L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS L_INBLOCKS+24
#define L_INCREMENTS L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*18+8)
#define L_KEYS_BEGIN L_INDEX(16*19)

#define MXOR(a,b,c) \
    AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)

    AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

    AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM

Rijndael_Enc_AdvancedProcessBlocks PROC FRAME

    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    AS2( mov AS_REG_7, WORD_REG(si))
    AS2( lea AS_REG_7, [Te])
    AS2( mov edi, [g_cacheLineSize])

    AS2( mov [ecx+16*12+16*4], esp)
    AS2( lea esp, [ecx-768])

    AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
    AS2( mov WORD_REG(ax), 16)
    AS2( and WORD_REG(ax), WORD_REG(si))
    AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
    AS2( movdqa [L_KEY12], xmm3)
    AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2( sub WORD_REG(ax), WORD_REG(si))

    AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2( add WORD_REG(si), 16)
    AS2( cmp WORD_REG(si), 16*12)

    AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa xmm1, [WORD_REG(dx)])
    AS2( MOVD MM(1), [WORD_REG(dx)+4*4])
    AS2( mov ebx, [WORD_REG(dx)+5*4])
    AS2( mov ecx, [WORD_REG(dx)+6*4])
    AS2( mov edx, [WORD_REG(dx)+7*4])
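    // Touch the Te table one cache line at a time (edi holds g_cacheLineSize)
    // before any key-dependent lookups -- the assembly version of the
    // cache-timing countermeasure used in the C++ code above.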
    AS2( xor WORD_REG(ax), WORD_REG(ax))

    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( cmp WORD_REG(ax), 2048)

    AS2( test DWORD PTR [L_LENGTH], 1)

    AS2( mov WORD_REG(si), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( psrldq xmm1, 14)

    AS2( mov al, BYTE PTR [WORD_REG(si)+15])
    AS2( MOVD MM(2), eax)
    AS2( mov eax, [L_KEY12+0*4])
    AS2( mov edi, [L_KEY12+2*4])
    AS2( MOVD MM(0), [L_KEY12+3*4])

    AS2( xor ebx, [L_KEY12+1*4])

    AS2( MOVD edx, MM(1))
    AS2( MOVD [L_SAVED_X+3*4], MM(0))
    AS2( mov [L_SAVED_X+0*4], eax)
    AS2( mov [L_SAVED_X+1*4], ebx)
    AS2( mov [L_SAVED_X+2*4], edi)

    AS2( MOVD MM(1), [L_KEY12+0*4])
    AS2( mov ebx, [L_KEY12+1*4])
    AS2( mov ecx, [L_KEY12+2*4])
    AS2( mov edx, [L_KEY12+3*4])

    AS2( mov WORD_REG(ax), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( mov WORD_REG(si), [L_INXORBLOCKS])
    AS2( movdqu xmm5, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( pxor xmm2, xmm5)
    AS2( MOVD eax, MM(1))

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 4*16)

    AS2( MOVD ecx, MM(2))
    AS2( MOVD edx, MM(1))
    AS2( mov eax, [L_SAVED_X+0*4])
    AS2( mov ebx, [L_SAVED_X+1*4])

    AS2( and WORD_REG(cx), 255)

    AS2( paddb MM(2), mm3)

    AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])

    AS2( xor ecx, [L_SAVED_X+2*4])

    AS2( xor edx, [L_SAVED_X+3*4])

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 3*16)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
    AS2( mov edi, [L_SUBKEYS-4*16+2*4])

    AS2( xor eax, [L_SUBKEYS-4*16+0*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
    AS2( MOVD edx, MM(0))

    AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
    AS2( mov edi, [L_SUBKEYS-4*16+6*4])

    AS2( xor eax, [L_SUBKEYS-4*16+4*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
    AS2( MOVD edx, MM(0))

    AS2( test L_REG, 255)

    AS2( sub L_REG, 16*16)
#define LAST(a, b, c) \
    AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2( mov WORD PTR [L_LASTROUND+c], di )\
    AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

    AS2( mov WORD_REG(cx), [L_LENGTH])
    AS2( sub WORD_REG(cx), 16)

    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( pxor xmm2, xmm4)

    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddd xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)

    AS2( movdqa xmm0, [L_INCREMENTS+16])
    AS2( paddq xmm0, [L_INBLOCKS+16])
    AS2( movdqa [L_INBLOCKS+16], xmm0)

    AS2( pxor xmm2, [L_LASTROUND])
    AS2( movdqu [WORD_REG(bx)], xmm2)

    AS2( mov [L_LENGTH], WORD_REG(cx))
    AS2( test WORD_REG(cx), 1)

    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddq xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
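    // Wipe the round keys that were copied onto the stack working area before
    // restoring the caller's stack pointer and returning.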
    AS2( xorps xmm0, xmm0)
    AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2( movaps [WORD_REG(ax)-7*16], xmm0)
    AS2( movaps [WORD_REG(ax)-6*16], xmm0)
    AS2( movaps [WORD_REG(ax)-5*16], xmm0)
    AS2( movaps [WORD_REG(ax)-4*16], xmm0)
    AS2( movaps [WORD_REG(ax)-3*16], xmm0)
    AS2( movaps [WORD_REG(ax)-2*16], xmm0)
    AS2( movaps [WORD_REG(ax)-1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+0*16], xmm0)
    AS2( movaps [WORD_REG(ax)+1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+2*16], xmm0)
    AS2( movaps [WORD_REG(ax)+3*16], xmm0)
    AS2( movaps [WORD_REG(ax)+4*16], xmm0)
    AS2( movaps [WORD_REG(ax)+5*16], xmm0)
    AS2( movaps [WORD_REG(ax)+6*16], xmm0)

    AS2( mov esp, [L_SP])
#ifdef CRYPTOPP_GENERATE_X64_MASM

Rijndael_Enc_AdvancedProcessBlocks ENDP

    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
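// AliasedWithTable returns true when the range [begin, end) can fall on the same
// 4 KB page offsets as the Te table, in which case accesses to that memory could
// evict the table lines pulled into cache by the prefetch loop.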
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86

static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
    ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
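// AES-NI primitives: encrypt or decrypt one block, or four blocks in parallel so
// the pipelined aesenc/aesdec units stay busy.  subkeys[0] is the whitening key
// and subkeys[rounds] the last-round key.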
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
    MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}
inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
    MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

static const word32 s_one[] = {0, 0, 0, 1<<24};
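// s_one holds the value 1 in the final 32-bit word of the block, byte-ordered so
// that _mm_add_epi32 steps a big-endian counter.  AESNI_AdvancedProcessBlocks
// drives the helpers above: it honours the BlockTransformation BT_* flags,
// processes four blocks at a time while possible, then finishes block by block.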
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)

    size_t blockSize = 16;
    size_t xorIncrement = xorBlocks ? blockSize : 0;

    inBlocks += length - blockSize;
    xorBlocks += length - blockSize;
    outBlocks += length - blockSize;
    inIncrement = 0-inIncrement;
    xorIncrement = 0-xorIncrement;
    outIncrement = 0-outIncrement;
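    // Four blocks per iteration.  When the input block is a counter, block1,
    // block2 and block3 are derived from block0 by 32-bit big-endian additions
    // of s_one, and the counter memory is advanced by storing block3 + 1 back
    // into the input block.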
    while (length >= 4*blockSize)

        __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;

        const __m128i be1 = *(const __m128i *)(const void *)s_one;
        block1 = _mm_add_epi32(block0, be1);
        block2 = _mm_add_epi32(block1, be1);
        block3 = _mm_add_epi32(block2, be1);
        _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));

        inBlocks += inIncrement;
        block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
        inBlocks += inIncrement;
        block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
        inBlocks += inIncrement;
        block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
        inBlocks += inIncrement;

        block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
        xorBlocks += xorIncrement;
        block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
        xorBlocks += xorIncrement;
        block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
        xorBlocks += xorIncrement;
        block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
        xorBlocks += xorIncrement;

        func4(block0, block1, block2, block3, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
        {
            block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
            xorBlocks += xorIncrement;
            block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
            xorBlocks += xorIncrement;
            block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
            xorBlocks += xorIncrement;
            block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
            xorBlocks += xorIncrement;
        }

        _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
        outBlocks += outIncrement;
        _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
        outBlocks += outIncrement;
        _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
        outBlocks += outIncrement;
        _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
        outBlocks += outIncrement;

        length -= 4*blockSize;
    while (length >= blockSize)

        __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);

        block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));

        const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));

        _mm_storeu_si128((__m128i *)(void *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
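// Locals is the working area handed to the assembly routine
// Rijndael_Enc_AdvancedProcessBlocks: stack copies of the subkeys, the block and
// xor-block pointers with their increments, and the packed length/counter flag.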
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86

    word32 subkeys[4*12], workspace[8];
    const byte *inBlocks, *inXorBlocks, *outXorBlocks;
    size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
    size_t regSpill, lengthAndCounterFlag, keysBegin;

const size_t s_aliasPageSize = 4096;
const size_t s_aliasBlockSize = 256;
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);

Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
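// Rijndael::Enc::AdvancedProcessBlocks: prefer the AES-NI path, otherwise fall
// back to the SSE2/MASM bulk routine.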
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
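// SSE2/MASM path: the Locals frame is placed at a 256-byte boundary inside
// m_aliasBlock that does not alias the Te table modulo the 4 KB page size, so
// accesses to the frame cannot evict the prefetched table lines.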
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
    if (length < BLOCKSIZE)

    static const byte *zeros = (const byte*)(Te+256);
    byte *space = NULL, *originalSpace = const_cast<byte*>(m_aliasBlock.data());

    space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
    while (AliasedWithTable(space, space + sizeof(Locals)))

    size_t increment = BLOCKSIZE;
    if (flags & BT_ReverseDirection)
    {
        inBlocks += length - BLOCKSIZE;
        xorBlocks += length - BLOCKSIZE;
        outBlocks += length - BLOCKSIZE;
        increment = 0-increment;
    }

    Locals &locals = *(Locals *)(void *)space;

    locals.inBlocks = inBlocks;
    locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
    locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
    locals.outBlocks = outBlocks;

    locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
    locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
    locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
    locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

    locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
    int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
    locals.keysBegin = (12-keysToCopy)*16;

    Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);

    return length % BLOCKSIZE;
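// Rijndael::Dec::AdvancedProcessBlocks uses the same AES-NI dispatch with the
// decryption helpers.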
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86