#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static volatile bool s_TeFilled = false, s_TdFilled = false;

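// When unaligned loads are allowed, Te and Td hold one word64 per byte value:
// the byte-rotations of the combined SubBytes+MixColumns word, arranged so
// the TL_F/TL_M accessors below can extract any rotation as a word32 at a
// small byte offset (the raw S-box byte needed by the last round is reachable
// the same way). The QUARTER_ROUND macros consume a state word t one byte at
// a time, XORing one table lookup into each of the four output words.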
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = Sd[byte(t)]; t >>= 8;\
    tempBlock[b] = Sd[byte(t)]; t >>= 8;\
    tempBlock[c] = Sd[byte(t)]; t >>= 8;\
    tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
#define TL_M TL_F
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif

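// GF(2^8) multiplication by small constants, reduced modulo the AES
// polynomial 0x11b. f2(x) = {02}*x and f3(x) = {03}*x are used by
// MixColumns; f9/fb/fd/fe are the InvMixColumns coefficients
// {09}, {0b}, {0d}, {0e}.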
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))

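// Build the encryption table from the forward S-box Se: each entry combines
// SubBytes with the MixColumns coefficients {02},{01},{01},{03}, stored in
// the byte order expected by the TL_F/TL_M accessors above.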
void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}

void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}

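// Key schedule. With AES-NI available, _mm_aeskeygenassist_si128 supplies the
// RotWord/SubWord step; otherwise the same expansion is done with Se lookups
// further below. For decryption the round keys are additionally passed
// through InvMixColumns (aesimc, or the InverseMixColumn macro) and swapped
// into reverse order.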
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 *rk = m_key;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)

    if (HasAESNI())
    {
        static const word32 rcLE[] = {
            0x01, 0x02, 0x04, 0x08,
            0x10, 0x20, 0x40, 0x80,
            0x1B, 0x36,
        };
        const word32 *rc = rcLE;

        __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
        memcpy(rk, userKey, keylen);

        while (true)
        {
            rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
            rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
            rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
            rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

            if (rk + keylen/4 + 4 == m_key.end())
                break;

            if (keylen == 24)
            {
                rk[10] = rk[ 4] ^ rk[ 9];
                rk[11] = rk[ 5] ^ rk[10];
                temp = _mm_insert_epi32(temp, rk[11], 3);
            }
            else if (keylen == 32)
            {
                temp = _mm_insert_epi32(temp, rk[11], 3);
                rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
                rk[13] = rk[ 5] ^ rk[12];
                rk[14] = rk[ 6] ^ rk[13];
                rk[15] = rk[ 7] ^ rk[14];
                temp = _mm_insert_epi32(temp, rk[15], 3);
            }
            else
                temp = _mm_insert_epi32(temp, rk[7], 3);

            rk += keylen/4;
        }

        if (!IsForwardTransformation())
        {
            rk = m_key;
            unsigned int i, j;

            std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

            for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
            {
                temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
                *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
                *(__m128i *)(rk+j) = temp;
            }

            *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
        }

        return;
    }
#endif

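    // Portable key expansion used when AES-NI is unavailable: SubWord/RotWord
    // via the forward S-box Se, plus the rcon round constants.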
    GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
    const word32 *rc = rcon;
    word32 temp;

    while (true)
    {
        temp = rk[keylen/4-1];
        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
        rk[keylen/4] = rk[0] ^ x ^ *(rc++);
        rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
        rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
        rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

        if (rk + keylen/4 + 4 == m_key.end())
            break;

        if (keylen == 24)
        {
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
        }
        else if (keylen == 32)
        {
            temp = rk[11];
            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];
        }
        rk += keylen/4;
    }

    rk = m_key;

    if (IsForwardTransformation())
    {
        if (!s_TeFilled)
            FillEncTable();

        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
    }
    else
    {
        if (!s_TdFilled)
            FillDecTable();

        unsigned int i, j;

#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

        for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
        {
            temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
            temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
            temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
            temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
        }

        rk[i+0] = InverseMixColumn(rk[i+0]);
        rk[i+1] = InverseMixColumn(rk[i+1]);
        rk[i+2] = InverseMixColumn(rk[i+2]);
        rk[i+3] = InverseMixColumn(rk[i+3]);

        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
    }

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}

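// Encrypt one block. Hardware and assembly paths are dispatched through
// AdvancedProcessBlocks; otherwise the table-driven code below runs.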
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
#else
    if (HasAESNI())
#endif
    {
        Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

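    // Timing-attack countermeasure: touch one word from every cache line of
    // Te (and its last entry) before any key-dependent lookup, so the table
    // is uniformly cached. u is always 0, but the volatile read keeps the
    // compiler from proving it and eliding the loop.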
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

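    // Two rounds per iteration; together with the first round above and the
    // final round below this totals m_rounds rounds.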
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

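    // Final round: no MixColumns, so QUARTER_ROUND_LE takes the raw S-box
    // byte stored at offset 1 of each Te entry; the index arguments encode
    // ShiftRows.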
    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

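    // Same cache-priming countermeasure as in encryption, applied to Td.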
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

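    // Two inverse rounds per iteration, mirroring the encryption loop.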
    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

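    // Without unaligned access, the final round reads the 256-byte inverse
    // S-box Sd directly, so prime its cache lines as well.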
#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS

    u = _u;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(Sd+i);
    u &= *(const word32 *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

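    // Final round: QUARTER_ROUND_LD yields the InvSubBytes byte (from the low
    // byte of the Td entry, or directly from Sd) and the index arguments
    // implement InvShiftRows.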
    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

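// x86/x64 SSE2 assembly implementation of AdvancedProcessBlocks. The caller
// passes a 256-byte-aligned Locals block; the L_* macros define its layout,
// which differs between x86 (locals addressed off esp) and x64 (off r8).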
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG esp
#define L_INDEX(i) (L_REG+768+i)
#define L_INXORBLOCKS L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS L_INBLOCKS+12
#define L_INCREMENTS L_INDEX(16*15)
#define L_SP L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*16+4)
#define L_KEYS_BEGIN L_INDEX(16*16+8)

#define MOVD movd
#define MM(i) mm##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG r8
#define L_INDEX(i) (L_REG+i)
#define L_INXORBLOCKS L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS L_INBLOCKS+24
#define L_INCREMENTS L_INDEX(16*16)
#define L_LENGTH L_INDEX(16*18+8)
#define L_KEYS_BEGIN L_INDEX(16*19)

#define MOVD mov
#define MM_0 r9d
#define MM_1 r12d
#ifdef __GNUC__
#define MM_2 r11d
#else
#define MM_2 r10d
#endif
#define MM(i) MM_##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)

#define XOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
    Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    ".intel_syntax noprefix;"
#if CRYPTOPP_BOOL_X64
    AS2( mov L_REG, rcx)
#endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( mov AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( lea AS_REG_7, [Te])
    AS2( mov edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
    AS2( mov [ecx+16*12+16*4], esp)
    AS2( lea esp, [ecx-768])
#endif

    AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
    AS2( mov WORD_REG(ax), 16)
    AS2( and WORD_REG(ax), WORD_REG(si))
    AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
    AS2( movdqa [L_KEY12], xmm3)
    AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2( sub WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2( add WORD_REG(si), 16)
    AS2( cmp WORD_REG(si), 16*12)
    ASJ( jl, 0, b)

    AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa xmm1, [WORD_REG(dx)])
    AS2( MOVD MM(1), [WORD_REG(dx)+4*4])
    AS2( mov ebx, [WORD_REG(dx)+5*4])
    AS2( mov ecx, [WORD_REG(dx)+6*4])
    AS2( mov edx, [WORD_REG(dx)+7*4])

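    // Prime the Te table into cache (one read per cache line; edi holds
    // g_cacheLineSize), then fence, before any key-dependent loads.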
    AS2( xor WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( cmp WORD_REG(ax), 2048)
    ASJ( jl, 9, b)
    AS1( lfence)

    AS2( test DWORD PTR [L_LENGTH], 1)
    ASJ( jz, 8, f)

    AS2( mov WORD_REG(si), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( psrldq xmm1, 14)
    AS2( movd eax, xmm1)
    AS2( mov al, BYTE PTR [WORD_REG(si)+15])
    AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86
    AS2( mov eax, 1)
    AS2( movd mm3, eax)
#endif

    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0)
    XOR( edx, ah, 1)
    AS2( shr eax, 16)
    XOR( ecx, al, 2)
    XOR( ebx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0)
    MXOR( 1, ah, 1)
    AS2( shr eax, 16)
    XOR( edx, al, 2)
    XOR( ecx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0)
    XOR( ebx, ah, 1)
    AS2( shr eax, 16)
    MXOR( 1, al, 2)
    XOR( edx, ah, 3)
    AS2( mov eax, edi)
    XOR( edx, al, 0)
    XOR( ecx, ah, 1)
    AS2( shr eax, 16)
    XOR( ebx, al, 2)
    AS2( psrldq xmm2, 3)

    AS2( mov eax, [L_KEY12+0*4])
    AS2( mov edi, [L_KEY12+2*4])
    AS2( MOVD MM(0), [L_KEY12+3*4])
    MXOR( 0, cl, 3)
    XOR( edi, bl, 3)
    MXOR( 0, bh, 2)
    AS2( shr ebx, 16)
    XOR( eax, bl, 1)
    MOV( ebx, bh, 0)
    AS2( xor ebx, [L_KEY12+1*4])
    XOR( eax, ch, 2)
    AS2( shr ecx, 16)
    XOR( eax, dl, 3)
    XOR( ebx, dh, 2)
    AS2( shr edx, 16)
    XOR( edi, ch, 0)
    XOR( ebx, cl, 1)
    XOR( edi, dl, 1)
    MXOR( 0, dh, 0)

    AS2( movd ecx, xmm2)
    AS2( MOVD edx, MM(1))
    AS2( MOVD [L_SAVED_X+3*4], MM(0))
    AS2( mov [L_SAVED_X+0*4], eax)
    AS2( mov [L_SAVED_X+1*4], ebx)
    AS2( mov [L_SAVED_X+2*4], edi)
    ASJ( jmp, 5, f)

    ASL(3)

    AS2( MOVD MM(1), [L_KEY12+0*4])
    AS2( mov ebx, [L_KEY12+1*4])
    AS2( mov ecx, [L_KEY12+2*4])
    AS2( mov edx, [L_KEY12+3*4])
    ASL(8)
    AS2( mov WORD_REG(ax), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( mov WORD_REG(si), [L_INXORBLOCKS])
    AS2( movdqu xmm5, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( pxor xmm2, xmm5)

    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0)
    XOR( edx, ah, 1)
    AS2( shr eax, 16)
    XOR( ecx, al, 2)
    XOR( ebx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0)
    MXOR( 1, ah, 1)
    AS2( shr eax, 16)
    XOR( edx, al, 2)
    XOR( ecx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0)
    XOR( ebx, ah, 1)
    AS2( shr eax, 16)
    MXOR( 1, al, 2)
    XOR( edx, ah, 3)
    AS2( mov eax, edi)
    XOR( edx, al, 0)
    XOR( ecx, ah, 1)
    AS2( shr eax, 16)
    XOR( ebx, al, 2)
    MXOR( 1, ah, 3)
    AS2( MOVD eax, MM(1))

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 4*16)
    ASJ( jmp, 2, f)

    ASL(1)

    AS2( MOVD ecx, MM(2))
    AS2( MOVD edx, MM(1))
    AS2( mov eax, [L_SAVED_X+0*4])
    AS2( mov ebx, [L_SAVED_X+1*4])
    AS2( xor cl, ch)
    AS2( and WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86
    AS2( paddb MM(2), mm3)
#else
    AS2( add MM(2), 1)
#endif

    AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR( ebx, dl, 3)
    MOV( ecx, dh, 2)
    AS2( shr edx, 16)
    AS2( xor ecx, [L_SAVED_X+2*4])
    XOR( eax, dh, 0)
    MOV( edx, dl, 1)
    AS2( xor edx, [L_SAVED_X+3*4])

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 3*16)
    ASJ( jmp, 4, f)

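// One full table-lookup round over the four state words held in
// eax/ebx/ecx/edx (with edi and MM(0) as accumulators); esi is the
// byte-extraction scratch register, and MAP0TO4 selects the rotation within
// each 8-byte Te entry.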
#define ROUND() \
    MXOR( 0, cl, 3) \
    AS2( mov cl, al) \
    XOR( edi, ah, 2) \
    AS2( shr eax, 16) \
    XOR( edi, bl, 3) \
    MXOR( 0, bh, 2) \
    AS2( shr ebx, 16) \
    MXOR( 0, al, 1) \
    MOV( eax, ah, 0) \
    XOR( eax, bl, 1) \
    MOV( ebx, bh, 0) \
    XOR( eax, ch, 2) \
    XOR( ebx, cl, 3) \
    AS2( shr ecx, 16) \
    XOR( eax, dl, 3) \
    XOR( ebx, dh, 2) \
    AS2( shr edx, 16) \
    XOR( edi, ch, 0) \
    XOR( ebx, cl, 1) \
    XOR( edi, dl, 1) \
    MXOR( 0, dh, 0) \

    ASL(2)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
    AS2( mov edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+0*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
    AS2( MOVD edx, MM(0))

    ASL(4)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
    AS2( mov edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+4*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
    AS2( MOVD edx, MM(0))

    AS2( add L_REG, 32)
    AS2( test L_REG, 255)
    ASJ( jnz, 2, b)
    AS2( sub L_REG, 16*16)

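// Final round: each LAST() combines two S-box lookups (the raw S-box byte at
// offset 1 of a Te entry, plus a table word at offset 0) into a 16-bit store
// into L_LASTROUND, which is then XORed together with the last round key
// (xmm4) into the output block below.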
#define LAST(a, b, c) \
    AS2( movzx esi, a )\
    AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2( movzx esi, b )\
    AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2( mov WORD PTR [L_LASTROUND+c], di )\

    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2( shr edx, 16)
    LAST(ah, bl, 10)
    AS2( shr eax, 16)
    LAST(bh, cl, 14)
    AS2( shr ebx, 16)
    LAST(dh, al, 12)
    AS2( shr ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

    AS2( mov WORD_REG(cx), [L_LENGTH])
    AS2( sub WORD_REG(cx), 16)

    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( pxor xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddd xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#else
    AS2( movdqa xmm0, [L_INCREMENTS+16])
    AS2( paddq xmm0, [L_INBLOCKS+16])
    AS2( movdqa [L_INBLOCKS+16], xmm0)
#endif

    AS2( pxor xmm2, [L_LASTROUND])
    AS2( movdqu [WORD_REG(bx)], xmm2)

    ASJ( jle, 7, f)
    AS2( mov [L_LENGTH], WORD_REG(cx))
    AS2( test WORD_REG(cx), 1)
    ASJ( jnz, 1, b)
#if CRYPTOPP_BOOL_X64
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddq xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#endif
    ASJ( jmp, 3, b)

    ASL(7)

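    // Exit path: wipe the copied subkeys and round data from the stack
    // workspace, then restore the stack pointer and registers and return.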
    AS2( xorps xmm0, xmm0)
    AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2( movaps [WORD_REG(ax)-7*16], xmm0)
    AS2( movaps [WORD_REG(ax)-6*16], xmm0)
    AS2( movaps [WORD_REG(ax)-5*16], xmm0)
    AS2( movaps [WORD_REG(ax)-4*16], xmm0)
    AS2( movaps [WORD_REG(ax)-3*16], xmm0)
    AS2( movaps [WORD_REG(ax)-2*16], xmm0)
    AS2( movaps [WORD_REG(ax)-1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+0*16], xmm0)
    AS2( movaps [WORD_REG(ax)+1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+2*16], xmm0)
    AS2( movaps [WORD_REG(ax)+3*16], xmm0)
    AS2( movaps [WORD_REG(ax)+4*16], xmm0)
    AS2( movaps [WORD_REG(ax)+5*16], xmm0)
    AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
    AS2( mov esp, [L_SP])
    AS1( emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
    Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ".att_syntax prefix;"
    :
    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
#if CRYPTOPP_BOOL_X64
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
#endif
    );
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86

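// Returns true when [begin, end) may share a 4 KiB page offset with the Te
// table; the SSE2 path below relocates its stack locals until this is false,
// to avoid cache-line aliasing with table lookups.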
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
    size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

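// AES-NI block primitives: whitening XOR with subkeys[0], one aesenc (or
// aesdec) per middle round, and aesenclast/aesdeclast for the final round.
// The 4-block variants keep four blocks in flight to exploit pipelining.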
inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};

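// s_one adds 1 to the big-endian counter byte of a block (1<<24 in the last
// dword lands in byte 15 under _mm_add_epi32). AESNI_AdvancedProcessBlocks is
// the common driver: it honors the BT_* flags, processes four blocks at a
// time when BT_AllowParallel is set, and handles the tail one block at a
// time.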
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    size_t blockSize = 16;
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        assert(length % blockSize == 0);
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                const __m128i be1 = *(const __m128i *)s_one;
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
            }
            else
            {
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subkeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128((__m128i *)outBlocks, block0);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block1);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block2);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        _mm_storeu_si128((__m128i *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif

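// Dispatch order: AES-NI intrinsics first, then the SSE2 assembly routine
// (which needs an aligned locals block and at least one full block of input),
// then the generic base class implementation.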
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        struct Locals
        {
            word32 subkeys[4*12], workspace[8];
            const byte *inBlocks, *inXorBlocks, *outXorBlocks;
            byte *outBlocks;
            size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
            size_t regSpill, lengthAndCounterFlag, keysBegin;
        };

        size_t increment = BLOCKSIZE;
        const byte* zeros = (byte *)(Te+256);
        byte *space;

        do {
            space = (byte *)alloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
        }
        while (AliasedWithTable(space, space+sizeof(Locals)));

        if (flags & BT_ReverseDirection)
        {
            assert(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

01247
01248 #endif
01249
01250 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
01251
01252 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
01253 {
01254 if (HasAESNI())
01255 return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
01256
01257 return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
01258 }
01259
01260 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
01261
01262 NAMESPACE_END
01263
01264 #endif
01265 #endif