Crypto++ 8.2
Free C++ Class Library of Cryptographic Schemes
rijndael.cpp
1// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2// and Wei Dai from Paulo Barreto's Rijndael implementation
3// The original code and all modifications are in the public domain.
4
5// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6
7/*
8July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9 See the head notes in aes_armv4.S for copyright and license.
10*/
11
12/*
13September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14*/
15
16/*
17July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18*/
19
20/*
21July 2010: Added support for AES-NI instructions via compiler intrinsics.
22*/
23
24/*
25Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27and Peter Schwabe in their paper "New AES software speed records". The round
28function was also modified to include a trick similar to one in Brian Gladman's
29x86 assembly code, doing an 8-bit register move to minimize the number of
30register spills. Also switched to compressed tables and copying round keys to
31the stack.
32
33The C++ implementation uses compressed tables if
34CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
35It is defined by default on x86 platforms but not on others.
36*/
37
38/*
39July 2006: Defense against timing attacks was added by Wei Dai.
40
41The code now uses smaller tables in the first and last rounds,
42and preloads them into L1 cache before usage (by loading at least
43one element in each cache line).
44
45We try to delay subsequent accesses to each table (used in the first
46and last rounds) until all of the table has been preloaded. Hopefully
47the compiler isn't smart enough to optimize that code away.
48
49After preloading the table, we also try not to access any memory location
50other than the table and the stack, in order to prevent table entries from
51being unloaded from L1 cache, until that round is finished.
52(Some popular CPUs have 2-way associative caches.)
53*/
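// The countermeasure described above reduces to the following preload idiom
// (an illustrative sketch only; the real loops appear in ProcessAndXorBlock
// below): touch one word per cache line of the table and fold the loads into
// a value that is always zero, then OR that value into the state so the
// compiler cannot discard the loads.
//
//   volatile word32 _u = 0;       // volatile defeats constant propagation
//   word32 u = _u;
//   for (unsigned int i=0; i<sizeof(Te); i+=cacheLineSize)
//       u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
//   u &= Te[255];                 // touch the tail of the table
//   s0 |= u; s1 |= u; s2 |= u; s3 |= u;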
54
55// This is the original introductory comment:
56
57/**
58 * version 3.0 (December 2000)
59 *
60 * Optimised ANSI C code for the Rijndael cipher (now AES)
61 *
62 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64 * author Paulo Barreto <paulo.barreto@terra.com.br>
65 *
66 * This code is hereby placed in the public domain.
67 *
68 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79 */
80
81#include "pch.h"
82#include "config.h"
83
84#ifndef CRYPTOPP_IMPORTS
85#ifndef CRYPTOPP_GENERATE_X64_MASM
86
87#include "rijndael.h"
88#include "misc.h"
89#include "cpu.h"
90
91// VS2017 and global optimization bug. TODO, figure out when
92// we can re-enable full optimizations for VS2017. Also see
93// https://github.com/weidai11/cryptopp/issues/649
94#if (_MSC_VER >= 1910)
95# ifndef CRYPTOPP_DEBUG
96# pragma optimize("", off)
97# pragma optimize("ts", on)
98# endif
99#endif
100
101NAMESPACE_BEGIN(CryptoPP)
102
103// Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
104#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
105# define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
106#endif
107
108// Clang __m128i casts
109#define M128I_CAST(x) ((__m128i *)(void *)(x))
110#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
111
112#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
113# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
114namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
115using namespace rdtable;
116# else
117static word64 Te[256];
118# endif
119static word64 Td[256];
120#else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
121# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
122// Unused; avoids linker error on Microsoft X64 non-AESNI platforms
123namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
124# endif
125CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
126CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
127#endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
128
129static volatile bool s_TeFilled = false, s_TdFilled = false;
130
131ANONYMOUS_NAMESPACE_BEGIN
132
133#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
134
135// Determine whether the range between begin and end overlaps
136// with the same 4k block offsets as the Te table. Logically,
137// the code is trying to create the condition:
138//
139// Two separate memory pages:
140//
141// +-----+ +-----+
142// |XXXXX| |YYYYY|
143// |XXXXX| |YYYYY|
144// | | | |
145// | | | |
146// +-----+ +-----+
147// Te Table Locals
148//
149// Have a logical cache view of (X and Y may be inverted):
150//
151// +-----+
152// |XXXXX|
153// |XXXXX|
154// |YYYYY|
155// |YYYYY|
156// +-----+
157//
158static inline bool AliasedWithTable(const byte *begin, const byte *end)
159{
160 ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
161 ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
162 if (t1 > t0)
163 return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
164 else
165 return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
166}
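// Worked example with illustrative numbers: with compressed tables
// sizeof(Te) = (256+2)*8 = 2064 bytes. If Te happens to begin at page offset
// 0x400 then t0 = 0x400 and t1 = 0xC10, and any Locals block whose page
// offset falls inside [0x400, 0xC10) would compete for the same cache sets.
// AdvancedProcessBlocks below therefore slides its scratch space forward in
// 256-byte steps until AliasedWithTable() returns false.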
167
168struct Locals
169{
170 word32 subkeys[4*12], workspace[8];
171 const byte *inBlocks, *inXorBlocks, *outXorBlocks;
172 byte *outBlocks;
173 size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
174 size_t regSpill, lengthAndCounterFlag, keysBegin;
175};
176
177const size_t s_aliasPageSize = 4096;
178const size_t s_aliasBlockSize = 256;
179const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
180
181#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
182
183ANONYMOUS_NAMESPACE_END
184
185// ************************* Portable Code ************************************
186
187#define QUARTER_ROUND(L, T, t, a, b, c, d) \
188 a ^= L(T, 3, byte(t)); t >>= 8;\
189 b ^= L(T, 2, byte(t)); t >>= 8;\
190 c ^= L(T, 1, byte(t)); t >>= 8;\
191 d ^= L(T, 0, t);
192
193#define QUARTER_ROUND_LE(t, a, b, c, d) \
194 tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
195 tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
196 tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
197 tempBlock[d] = ((byte *)(Te+t))[1];
198
199#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
200 #define QUARTER_ROUND_LD(t, a, b, c, d) \
201 tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
202 tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
203 tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
204 tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
205#else
206 #define QUARTER_ROUND_LD(t, a, b, c, d) \
207 tempBlock[a] = Sd[byte(t)]; t >>= 8;\
208 tempBlock[b] = Sd[byte(t)]; t >>= 8;\
209 tempBlock[c] = Sd[byte(t)]; t >>= 8;\
210 tempBlock[d] = Sd[t];
211#endif
212
213#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
214#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
215
216#if (CRYPTOPP_LITTLE_ENDIAN)
217 #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
218 #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
219 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
220 #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
221 #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
222 #else
223 #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
224 #define TL_M(T, i, x) T[i*256 + x]
225 #endif
226#else
227 #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
228 #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
229 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
230 #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
231 #define TL_M TL_F
232 #else
233 #define TL_F(T, i, x) rotrFixed(T[x], i*8)
234 #define TL_M(T, i, x) T[i*256 + x]
235 #endif
236#endif
237
238
239#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
240#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
241#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
242
243#define f3(x) (f2(x) ^ x)
244#define f9(x) (f8(x) ^ x)
245#define fb(x) (f8(x) ^ f2(x) ^ x)
246#define fd(x) (f8(x) ^ f4(x) ^ x)
247#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
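// These macros multiply a byte by small constants in GF(2^8) modulo the AES
// polynomial x^8+x^4+x^3+x+1 (0x11b). Worked examples using the FIPS 197
// value {57}:
//   f2(0x57) = 0x57<<1         = 0xae   ({57}*{02}; high bit clear, no reduction)
//   f2(0x80) = 0x100 ^ 0x11b   = 0x1b   (high bit set, reduce by 0x11b)
//   f3(0x57) = f2(0x57) ^ 0x57 = 0xf9   ({57}*{03})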
248
249unsigned int Rijndael::Base::OptimalDataAlignment() const
250{
251#if (CRYPTOPP_AESNI_AVAILABLE)
252 if (HasAESNI())
253 return 1;
254#endif
255#if (CRYPTOPP_ARM_AES_AVAILABLE)
256 if (HasAES())
257 return 1;
258#endif
259#if (CRYPTOGAMS_ARM_AES)
260 if (HasARMv7())
261 return 1;
262#endif
263#if (CRYPTOPP_POWER8_AES_AVAILABLE)
264 if (HasAES())
265 return 1;
266#endif
267 return BlockTransformation::OptimalDataAlignment();
268}
269
270void Rijndael::Base::FillEncTable()
271{
272 for (int i=0; i<256; i++)
273 {
274 byte x = Se[i];
275#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
276 word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
277 Te[i] = word64(y | f3(x))<<32 | y;
278#else
279 word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
280 for (int j=0; j<4; j++)
281 {
282 Te[i+j*256] = y;
283 y = rotrConstant<8>(y);
284 }
285#endif
286 }
287#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
288 Te[256] = Te[257] = 0;
289#endif
290 s_TeFilled = true;
291}
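// In the compressed layout each 64-bit Te[i] entry holds the eight bytes
// { 00, s, s, 2s, 3s, s, s, 2s } (shown in little-endian byte order) with
// s = Se[i] and the products taken in GF(2^8). TL_M/TL_F recover the four
// cyclic byte-rotations of the MixColumns column (2s, s, s, 3s) by reading an
// unaligned word32 at byte offsets 1 through 4 of the entry, which is why
// this path is gated on CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS.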
292
293void Rijndael::Base::FillDecTable()
294{
295 for (int i=0; i<256; i++)
296 {
297 byte x = Sd[i];
298#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
299 word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
300 Td[i] = word64(y | fb(x))<<32 | y | x;
301#else
302 word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
303 for (int j=0; j<4; j++)
304 {
305 Td[i+j*256] = y;
306 y = rotrConstant<8>(y);
307 }
308#endif
309 }
310 s_TdFilled = true;
311}
312
313#if (CRYPTOPP_AESNI_AVAILABLE)
314extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
315extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
316
317extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
318 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
319extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
320 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
321#endif
322
323#if (CRYPTOPP_ARM_AES_AVAILABLE)
324extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
325 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
326extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
327 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
328#endif
329
330#if (CRYPTOGAMS_ARM_AES)
331extern "C" int AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
332extern "C" int AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
333extern "C" void AES_encrypt(const unsigned char in[16], unsigned char out[16], const word32 *rkey);
334extern "C" void AES_decrypt(const unsigned char in[16], unsigned char out[16], const word32 *rkey);
335#endif
336
337#if (CRYPTOPP_POWER8_AES_AVAILABLE)
338extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
339 word32* rk, const byte* Se);
340
341extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
342 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
343extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
344 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
345#endif
346
347#if (CRYPTOGAMS_ARM_AES)
348int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
349{
350 return AES_set_encrypt_key(userKey, bitLen, rkey);
351}
352int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
353{
354 return AES_set_decrypt_key(userKey, bitLen, rkey);
355}
356void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
357{
358 AES_encrypt(inBlock, outBlock, rkey);
359 if (xorBlock)
360 xorbuf (outBlock, xorBlock, 16);
361}
362void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
363{
364 AES_decrypt(inBlock, outBlock, rkey);
365 if (xorBlock)
366 xorbuf (outBlock, xorBlock, 16);
367}
368#endif
369
370std::string Rijndael::Base::AlgorithmProvider() const
371{
372#if (CRYPTOPP_AESNI_AVAILABLE)
373 if (HasAESNI())
374 return "AESNI";
375#endif
376#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
377 if (HasSSE2())
378 return "SSE2";
379#endif
380#if (CRYPTOPP_ARM_AES_AVAILABLE)
381 if (HasAES())
382 return "ARMv8";
383#endif
384#if (CRYPTOGAMS_ARM_AES)
385 if (HasARMv7())
386 return "ARMv7";
387#endif
388#if (CRYPTOPP_POWER8_AES_AVAILABLE)
389 if (HasAES())
390 return "Power8";
391#endif
392 return "C++";
393}
394
395void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
396{
397 AssertValidKeyLength(keyLen);
398
399#if (CRYPTOGAMS_ARM_AES)
400 if (HasARMv7())
401 {
402 m_rounds = keyLen/4 + 6;
403 m_key.New(4*(15+1)+4);
404
405 if (IsForwardTransformation())
406 CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
407 else
408 CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
409 return;
410 }
411#endif
412
413#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
414 m_aliasBlock.New(s_sizeToAllocate);
415 // The alias block is only used on IA-32 when unaligned data access is in effect.
416 // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
417 m_aliasBlock.SetMark(0);
418#endif
419
420 m_rounds = keyLen/4 + 6;
421 m_key.New(4*(m_rounds+1));
422 word32 *rk = m_key;
423
424#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
425 // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
426 if (HasAESNI() && HasSSE41())
427 {
428 // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
429 // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
430 Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
431 if (!IsForwardTransformation())
432 Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
433
434 return;
435 }
436#endif
437
438#if CRYPTOPP_POWER8_AES_AVAILABLE
439 if (HasAES())
440 {
441 // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
442 // The IBM docs on AES suck. Intel's docs on AESNI put IBM to shame.
443 Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
444 return;
445 }
446#endif
447
448 GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
449 const word32 *rc = rcon;
450 word32 temp;
451
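// The loop below is the FIPS 197 key expansion working on big-endian words.
// For the first word of each new group, x is SubWord(RotWord(w)) of the
// previous word; e.g. for AES-128 (keyLen/4 == 4):
//   rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ Rcon,
//   rk[5] = rk[1] ^ rk[4], rk[6] = rk[2] ^ rk[5], rk[7] = rk[3] ^ rk[6], ...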
452 while (true)
453 {
454 temp = rk[keyLen/4-1];
455 word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
456 (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
457 rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
458 rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
459 rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
460 rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
461
462 if (rk + keyLen/4 + 4 == m_key.end())
463 break;
464
465 if (keyLen == 24)
466 {
467 rk[10] = rk[ 4] ^ rk[ 9];
468 rk[11] = rk[ 5] ^ rk[10];
469 }
470 else if (keyLen == 32)
471 {
472 temp = rk[11];
473 rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
474 rk[13] = rk[ 5] ^ rk[12];
475 rk[14] = rk[ 6] ^ rk[13];
476 rk[15] = rk[ 7] ^ rk[14];
477 }
478 rk += keyLen/4;
479 }
480
481 rk = m_key;
482
483 if (IsForwardTransformation())
484 {
485 if (!s_TeFilled)
486 FillEncTable();
487
488 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
489 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
490 }
491 else
492 {
493 if (!s_TdFilled)
494 FillDecTable();
495
496 #define InverseMixColumn(x) \
497 TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
498 TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
499
500 unsigned int i, j;
501 for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
502 {
503 temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
504 temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
505 temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
506 temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
507 }
508
509 rk[i+0] = InverseMixColumn(rk[i+0]);
510 rk[i+1] = InverseMixColumn(rk[i+1]);
511 rk[i+2] = InverseMixColumn(rk[i+2]);
512 rk[i+3] = InverseMixColumn(rk[i+3]);
513
514 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
515 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
516 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
517 temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
518 }
519
520#if CRYPTOPP_AESNI_AVAILABLE
521 if (HasAESNI())
522 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
523#endif
524#if CRYPTOPP_ARM_AES_AVAILABLE
525 if (HasAES())
526 ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
527#endif
528}
529
530void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
531{
532#if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
533# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
534 if (HasSSE2())
535# else
536 if (HasAESNI())
537# endif
538 {
539 (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
540 return;
541 }
542#endif
543
544#if (CRYPTOPP_ARM_AES_AVAILABLE)
545 if (HasAES())
546 {
547 (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
548 return;
549 }
550#endif
551
552#if (CRYPTOGAMS_ARM_AES)
553 if (HasARMv7())
554 {
555 CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
556 return;
557 }
558#endif
559
560#if (CRYPTOPP_POWER8_AES_AVAILABLE)
561 if (HasAES())
562 {
563 (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
564 return;
565 }
566#endif
567
568 typedef BlockGetAndPut<word32, NativeByteOrder> Block;
569
570 word32 s0, s1, s2, s3, t0, t1, t2, t3;
571 Block::Get(inBlock)(s0)(s1)(s2)(s3);
572
573 const word32 *rk = m_key;
574 s0 ^= rk[0];
575 s1 ^= rk[1];
576 s2 ^= rk[2];
577 s3 ^= rk[3];
578 t0 = rk[4];
579 t1 = rk[5];
580 t2 = rk[6];
581 t3 = rk[7];
582 rk += 8;
583
584 // timing attack countermeasure. see comments at top for more details.
585 // also see http://github.com/weidai11/cryptopp/issues/146
586 const int cacheLineSize = GetCacheLineSize();
587 unsigned int i;
588 volatile word32 _u = 0;
589 word32 u = _u;
590#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
591 for (i=0; i<2048; i+=cacheLineSize)
592#else
593 for (i=0; i<1024; i+=cacheLineSize)
594#endif
595 u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
596 u &= Te[255];
597 s0 |= u; s1 |= u; s2 |= u; s3 |= u;
598
599 QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
600 QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
601 QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
602 QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
603
604 // Nr - 2 full rounds:
605 unsigned int r = m_rounds/2 - 1;
606 do
607 {
608 s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
609
610 QUARTER_ROUND_E(t3, s0, s1, s2, s3)
611 QUARTER_ROUND_E(t2, s3, s0, s1, s2)
612 QUARTER_ROUND_E(t1, s2, s3, s0, s1)
613 QUARTER_ROUND_E(t0, s1, s2, s3, s0)
614
615 t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
616
617 QUARTER_ROUND_E(s3, t0, t1, t2, t3)
618 QUARTER_ROUND_E(s2, t3, t0, t1, t2)
619 QUARTER_ROUND_E(s1, t2, t3, t0, t1)
620 QUARTER_ROUND_E(s0, t1, t2, t3, t0)
621
622 rk += 8;
623 } while (--r);
624
625 word32 tbw[4];
626 byte *const tempBlock = (byte *)tbw;
627
628 QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
629 QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
630 QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
631 QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
632
633 Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
634}
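// Usage sketch (not part of this file): callers normally reach this routine
// through the public BlockCipher interface, which selects the AES-NI, ARMv8,
// Power8, SSE2 or C++ path automatically. For example, with a hypothetical
// 16-byte key buffer:
//
//   CryptoPP::AES::Encryption aes(key, 16);
//   aes.ProcessBlock(in, out);    // encrypt one 16-byte block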
635
636void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
637{
638#if CRYPTOPP_AESNI_AVAILABLE
639 if (HasAESNI())
640 {
641 (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
642 return;
643 }
644#endif
645
646#if (CRYPTOPP_ARM_AES_AVAILABLE)
647 if (HasAES())
648 {
649 (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
650 return;
651 }
652#endif
653
654#if (CRYPTOGAMS_ARM_AES)
655 if (HasARMv7())
656 {
657 CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
658 return;
659 }
660#endif
661
662#if (CRYPTOPP_POWER8_AES_AVAILABLE)
663 if (HasAES())
664 {
665 (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
666 return;
667 }
668#endif
669
670 typedef BlockGetAndPut<word32, NativeByteOrder> Block;
671
672 word32 s0, s1, s2, s3, t0, t1, t2, t3;
673 Block::Get(inBlock)(s0)(s1)(s2)(s3);
674
675 const word32 *rk = m_key;
676 s0 ^= rk[0];
677 s1 ^= rk[1];
678 s2 ^= rk[2];
679 s3 ^= rk[3];
680 t0 = rk[4];
681 t1 = rk[5];
682 t2 = rk[6];
683 t3 = rk[7];
684 rk += 8;
685
686 // timing attack countermeasure. see comments at top for more details.
687 // also see http://github.com/weidai11/cryptopp/issues/146
688 const int cacheLineSize = GetCacheLineSize();
689 unsigned int i;
690 volatile word32 _u = 0;
691 word32 u = _u;
692#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
693 for (i=0; i<2048; i+=cacheLineSize)
694#else
695 for (i=0; i<1024; i+=cacheLineSize)
696#endif
697 u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
698 u &= Td[255];
699 s0 |= u; s1 |= u; s2 |= u; s3 |= u;
700
701 QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
702 QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
703 QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
704 QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
705
706 // Nr - 2 full rounds:
707 unsigned int r = m_rounds/2 - 1;
708 do
709 {
710 s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
711
712 QUARTER_ROUND_D(t3, s2, s1, s0, s3)
713 QUARTER_ROUND_D(t2, s1, s0, s3, s2)
714 QUARTER_ROUND_D(t1, s0, s3, s2, s1)
715 QUARTER_ROUND_D(t0, s3, s2, s1, s0)
716
717 t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
718
719 QUARTER_ROUND_D(s3, t2, t1, t0, t3)
720 QUARTER_ROUND_D(s2, t1, t0, t3, t2)
721 QUARTER_ROUND_D(s1, t0, t3, t2, t1)
722 QUARTER_ROUND_D(s0, t3, t2, t1, t0)
723
724 rk += 8;
725 } while (--r);
726
727#if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
728 // timing attack countermeasure. see comments at top for more details
729 // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
730 // QUARTER_ROUND_LD will use Td, which is already preloaded.
731 u = _u;
732 for (i=0; i<256; i+=cacheLineSize)
733 u &= *(const word32 *)(const void *)(Sd+i);
734 u &= *(const word32 *)(const void *)(Sd+252);
735 t0 |= u; t1 |= u; t2 |= u; t3 |= u;
736#endif
737
738 word32 tbw[4];
739 byte *const tempBlock = (byte *)tbw;
740
741 QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
742 QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
743 QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
744 QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
745
746 Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
747}
748
749// ************************* Assembly Code ************************************
750
751#if CRYPTOPP_MSC_VERSION
752# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
753#endif
754
755#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
756
757#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
758
759CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
760{
761 CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
762
763#if CRYPTOPP_BOOL_X86
764
765#define L_REG esp
766#define L_INDEX(i) (L_REG+768+i)
767#define L_INXORBLOCKS L_INBLOCKS+4
768#define L_OUTXORBLOCKS L_INBLOCKS+8
769#define L_OUTBLOCKS L_INBLOCKS+12
770#define L_INCREMENTS L_INDEX(16*15)
771#define L_SP L_INDEX(16*16)
772#define L_LENGTH L_INDEX(16*16+4)
773#define L_KEYS_BEGIN L_INDEX(16*16+8)
774
775#define MOVD movd
776#define MM(i) mm##i
777
778#define MXOR(a,b,c) \
779 AS2( movzx esi, b)\
780 AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
781 AS2( pxor MM(a), mm7)\
782
783#define MMOV(a,b,c) \
784 AS2( movzx esi, b)\
785 AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
786
787#else
788
789#define L_REG r8
790#define L_INDEX(i) (L_REG+i)
791#define L_INXORBLOCKS L_INBLOCKS+8
792#define L_OUTXORBLOCKS L_INBLOCKS+16
793#define L_OUTBLOCKS L_INBLOCKS+24
794#define L_INCREMENTS L_INDEX(16*16)
795#define L_LENGTH L_INDEX(16*18+8)
796#define L_KEYS_BEGIN L_INDEX(16*19)
797
798#define MOVD mov
799#define MM_0 r9d
800#define MM_1 r12d
801#ifdef __GNUC__
802#define MM_2 r11d
803#else
804#define MM_2 r10d
805#endif
806#define MM(i) MM_##i
807
808#define MXOR(a,b,c) \
809 AS2( movzx esi, b)\
810 AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
811
812#define MMOV(a,b,c) \
813 AS2( movzx esi, b)\
814 AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
815
816#endif
817
818#define L_SUBKEYS L_INDEX(0)
819#define L_SAVED_X L_SUBKEYS
820#define L_KEY12 L_INDEX(16*12)
821#define L_LASTROUND L_INDEX(16*13)
822#define L_INBLOCKS L_INDEX(16*14)
823#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
824
825#define XOR(a,b,c) \
826 AS2( movzx esi, b)\
827 AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
828
829#define MOV(a,b,c) \
830 AS2( movzx esi, b)\
831 AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
832
833#ifdef CRYPTOPP_GENERATE_X64_MASM
834 ALIGN 8
835 Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
836 rex_push_reg rsi
837 push_reg rdi
838 push_reg rbx
839 push_reg r12
840 .endprolog
841 mov L_REG, rcx
842 mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
843 mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
844#elif defined(__GNUC__)
845 __asm__ __volatile__
846 (
847 INTEL_NOPREFIX
848 #if CRYPTOPP_BOOL_X64
849 AS2( mov L_REG, rcx)
850 #endif
851 AS_PUSH_IF86(bx)
852 AS_PUSH_IF86(bp)
853 AS2( mov AS_REG_7, WORD_REG(si))
854#else
855 AS_PUSH_IF86(si)
856 AS_PUSH_IF86(di)
857 AS_PUSH_IF86(bx)
858 AS_PUSH_IF86(bp)
859 AS2( lea AS_REG_7, [Te])
860 AS2( mov edi, [g_cacheLineSize])
861#endif
862
863#if CRYPTOPP_BOOL_X86
864 AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
865 AS2( lea esp, [ecx-768])
866#endif
867
868 // copy subkeys to stack
869 AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
870 AS2( mov WORD_REG(ax), 16)
871 AS2( and WORD_REG(ax), WORD_REG(si))
872 AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
873 AS2( movdqa [L_KEY12], xmm3)
874 AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
875 AS2( sub WORD_REG(ax), WORD_REG(si))
876 ASL(0)
877 AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
878 AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
879 AS2( add WORD_REG(si), 16)
880 AS2( cmp WORD_REG(si), 16*12)
881 ATT_NOPREFIX
882 ASJ( jl, 0, b)
883 INTEL_NOPREFIX
884
885 // read subkeys 0, 1 and last
886 AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
887 AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
888 AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
889 AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
890 AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
891 AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
892
893 // load table into cache
894 AS2( xor WORD_REG(ax), WORD_REG(ax))
895 ASL(9)
896 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
897 AS2( add WORD_REG(ax), WORD_REG(di))
898 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
899 AS2( add WORD_REG(ax), WORD_REG(di))
900 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
901 AS2( add WORD_REG(ax), WORD_REG(di))
902 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
903 AS2( add WORD_REG(ax), WORD_REG(di))
904 AS2( cmp WORD_REG(ax), 2048)
905 ATT_NOPREFIX
906 ASJ( jl, 9, b)
907 INTEL_NOPREFIX
908 AS1( lfence)
909
910 AS2( test DWORD PTR [L_LENGTH], 1)
911 ATT_NOPREFIX
912 ASJ( jz, 8, f)
913 INTEL_NOPREFIX
914
915 // counter mode one-time setup
916 AS2( mov WORD_REG(si), [L_INBLOCKS])
917 AS2( movdqu xmm2, [WORD_REG(si)]) // counter
918 AS2( pxor xmm2, xmm1)
919 AS2( psrldq xmm1, 14)
920 AS2( movd eax, xmm1)
921 AS2( mov al, BYTE PTR [WORD_REG(si)+15])
922 AS2( MOVD MM(2), eax)
923#if CRYPTOPP_BOOL_X86
924 AS2( mov eax, 1)
925 AS2( movd mm3, eax)
926#endif
927
928 // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
929 AS2( movd eax, xmm2)
930 AS2( psrldq xmm2, 4)
931 AS2( movd edi, xmm2)
932 AS2( psrldq xmm2, 4)
933 MXOR( 1, al, 0) // 0
934 XOR( edx, ah, 1) // 1
935 AS2( shr eax, 16)
936 XOR( ecx, al, 2) // 2
937 XOR( ebx, ah, 3) // 3
938 AS2( mov eax, edi)
939 AS2( movd edi, xmm2)
940 AS2( psrldq xmm2, 4)
941 XOR( ebx, al, 0) // 4
942 MXOR( 1, ah, 1) // 5
943 AS2( shr eax, 16)
944 XOR( edx, al, 2) // 6
945 XOR( ecx, ah, 3) // 7
946 AS2( mov eax, edi)
947 AS2( movd edi, xmm2)
948 XOR( ecx, al, 0) // 8
949 XOR( ebx, ah, 1) // 9
950 AS2( shr eax, 16)
951 MXOR( 1, al, 2) // 10
952 XOR( edx, ah, 3) // 11
953 AS2( mov eax, edi)
954 XOR( edx, al, 0) // 12
955 XOR( ecx, ah, 1) // 13
956 AS2( shr eax, 16)
957 XOR( ebx, al, 2) // 14
958 AS2( psrldq xmm2, 3)
959
960 // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
961 AS2( mov eax, [L_KEY12+0*4])
962 AS2( mov edi, [L_KEY12+2*4])
963 AS2( MOVD MM(0), [L_KEY12+3*4])
964 MXOR( 0, cl, 3) /* 11 */
965 XOR( edi, bl, 3) /* 7 */
966 MXOR( 0, bh, 2) /* 6 */
967 AS2( shr ebx, 16) /* 4,5 */
968 XOR( eax, bl, 1) /* 5 */
969 MOV( ebx, bh, 0) /* 4 */
970 AS2( xor ebx, [L_KEY12+1*4])
971 XOR( eax, ch, 2) /* 10 */
972 AS2( shr ecx, 16) /* 8,9 */
973 XOR( eax, dl, 3) /* 15 */
974 XOR( ebx, dh, 2) /* 14 */
975 AS2( shr edx, 16) /* 12,13 */
976 XOR( edi, ch, 0) /* 8 */
977 XOR( ebx, cl, 1) /* 9 */
978 XOR( edi, dl, 1) /* 13 */
979 MXOR( 0, dh, 0) /* 12 */
980
981 AS2( movd ecx, xmm2)
982 AS2( MOVD edx, MM(1))
983 AS2( MOVD [L_SAVED_X+3*4], MM(0))
984 AS2( mov [L_SAVED_X+0*4], eax)
985 AS2( mov [L_SAVED_X+1*4], ebx)
986 AS2( mov [L_SAVED_X+2*4], edi)
987 ATT_NOPREFIX
988 ASJ( jmp, 5, f)
989 INTEL_NOPREFIX
990 ASL(3)
991 // non-counter mode per-block setup
992 AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
993 AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
994 AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
995 AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
996 ASL(8)
997 AS2( mov WORD_REG(ax), [L_INBLOCKS])
998 AS2( movdqu xmm2, [WORD_REG(ax)])
999 AS2( mov WORD_REG(si), [L_INXORBLOCKS])
1000 AS2( movdqu xmm5, [WORD_REG(si)])
1001 AS2( pxor xmm2, xmm1)
1002 AS2( pxor xmm2, xmm5)
1003
1004 // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1005 AS2( movd eax, xmm2)
1006 AS2( psrldq xmm2, 4)
1007 AS2( movd edi, xmm2)
1008 AS2( psrldq xmm2, 4)
1009 MXOR( 1, al, 0) // 0
1010 XOR( edx, ah, 1) // 1
1011 AS2( shr eax, 16)
1012 XOR( ecx, al, 2) // 2
1013 XOR( ebx, ah, 3) // 3
1014 AS2( mov eax, edi)
1015 AS2( movd edi, xmm2)
1016 AS2( psrldq xmm2, 4)
1017 XOR( ebx, al, 0) // 4
1018 MXOR( 1, ah, 1) // 5
1019 AS2( shr eax, 16)
1020 XOR( edx, al, 2) // 6
1021 XOR( ecx, ah, 3) // 7
1022 AS2( mov eax, edi)
1023 AS2( movd edi, xmm2)
1024 XOR( ecx, al, 0) // 8
1025 XOR( ebx, ah, 1) // 9
1026 AS2( shr eax, 16)
1027 MXOR( 1, al, 2) // 10
1028 XOR( edx, ah, 3) // 11
1029 AS2( mov eax, edi)
1030 XOR( edx, al, 0) // 12
1031 XOR( ecx, ah, 1) // 13
1032 AS2( shr eax, 16)
1033 XOR( ebx, al, 2) // 14
1034 MXOR( 1, ah, 3) // 15
1035 AS2( MOVD eax, MM(1))
1036
1037 AS2( add L_REG, [L_KEYS_BEGIN])
1038 AS2( add L_REG, 4*16)
1039 ATT_NOPREFIX
1040 ASJ( jmp, 2, f)
1041 INTEL_NOPREFIX
1042 ASL(1)
1043 // counter-mode per-block setup
1044 AS2( MOVD ecx, MM(2))
1045 AS2( MOVD edx, MM(1))
1046 AS2( mov eax, [L_SAVED_X+0*4])
1047 AS2( mov ebx, [L_SAVED_X+1*4])
1048 AS2( xor cl, ch)
1049 AS2( and WORD_REG(cx), 255)
1050 ASL(5)
1051#if CRYPTOPP_BOOL_X86
1052 AS2( paddb MM(2), mm3)
1053#else
1054 AS2( add MM(2), 1)
1055#endif
1056 // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1057 AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1058 XOR( ebx, dl, 3)
1059 MOV( ecx, dh, 2)
1060 AS2( shr edx, 16)
1061 AS2( xor ecx, [L_SAVED_X+2*4])
1062 XOR( eax, dh, 0)
1063 MOV( edx, dl, 1)
1064 AS2( xor edx, [L_SAVED_X+3*4])
1065
1066 AS2( add L_REG, [L_KEYS_BEGIN])
1067 AS2( add L_REG, 3*16)
1068 ATT_NOPREFIX
1069 ASJ( jmp, 4, f)
1070 INTEL_NOPREFIX
1071
1072// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1073// out: eax, ebx, edi, mm0
1074#define ROUND() \
1075 MXOR( 0, cl, 3) /* 11 */\
1076 AS2( mov cl, al) /* 8,9,10,3 */\
1077 XOR( edi, ah, 2) /* 2 */\
1078 AS2( shr eax, 16) /* 0,1 */\
1079 XOR( edi, bl, 3) /* 7 */\
1080 MXOR( 0, bh, 2) /* 6 */\
1081 AS2( shr ebx, 16) /* 4,5 */\
1082 MXOR( 0, al, 1) /* 1 */\
1083 MOV( eax, ah, 0) /* 0 */\
1084 XOR( eax, bl, 1) /* 5 */\
1085 MOV( ebx, bh, 0) /* 4 */\
1086 XOR( eax, ch, 2) /* 10 */\
1087 XOR( ebx, cl, 3) /* 3 */\
1088 AS2( shr ecx, 16) /* 8,9 */\
1089 XOR( eax, dl, 3) /* 15 */\
1090 XOR( ebx, dh, 2) /* 14 */\
1091 AS2( shr edx, 16) /* 12,13 */\
1092 XOR( edi, ch, 0) /* 8 */\
1093 XOR( ebx, cl, 1) /* 9 */\
1094 XOR( edi, dl, 1) /* 13 */\
1095 MXOR( 0, dh, 0) /* 12 */\
1096
1097 ASL(2) // 2-round loop
1098 AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
1099 AS2( mov edi, [L_SUBKEYS-4*16+2*4])
1100 ROUND()
1101 AS2( mov ecx, edi)
1102 AS2( xor eax, [L_SUBKEYS-4*16+0*4])
1103 AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
1104 AS2( MOVD edx, MM(0))
1105
1106 ASL(4)
1107 AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
1108 AS2( mov edi, [L_SUBKEYS-4*16+6*4])
1109 ROUND()
1110 AS2( mov ecx, edi)
1111 AS2( xor eax, [L_SUBKEYS-4*16+4*4])
1112 AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
1113 AS2( MOVD edx, MM(0))
1114
1115 AS2( add L_REG, 32)
1116 AS2( test L_REG, 255)
1117 ATT_NOPREFIX
1118 ASJ( jnz, 2, b)
1119 INTEL_NOPREFIX
1120 AS2( sub L_REG, 16*16)
1121
1122#define LAST(a, b, c) \
1123 AS2( movzx esi, a )\
1124 AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1125 AS2( movzx esi, b )\
1126 AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
1127 AS2( mov WORD PTR [L_LASTROUND+c], di )\
1128
1129 // last round
1130 LAST(ch, dl, 2)
1131 LAST(dh, al, 6)
1132 AS2( shr edx, 16)
1133 LAST(ah, bl, 10)
1134 AS2( shr eax, 16)
1135 LAST(bh, cl, 14)
1136 AS2( shr ebx, 16)
1137 LAST(dh, al, 12)
1138 AS2( shr ecx, 16)
1139 LAST(ah, bl, 0)
1140 LAST(bh, cl, 4)
1141 LAST(ch, dl, 8)
1142
1143 AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
1144 AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
1145
1146 AS2( mov WORD_REG(cx), [L_LENGTH])
1147 AS2( sub WORD_REG(cx), 16)
1148
1149 AS2( movdqu xmm2, [WORD_REG(ax)])
1150 AS2( pxor xmm2, xmm4)
1151
1152#if CRYPTOPP_BOOL_X86
1153 AS2( movdqa xmm0, [L_INCREMENTS])
1154 AS2( paddd xmm0, [L_INBLOCKS])
1155 AS2( movdqa [L_INBLOCKS], xmm0)
1156#else
1157 AS2( movdqa xmm0, [L_INCREMENTS+16])
1158 AS2( paddq xmm0, [L_INBLOCKS+16])
1159 AS2( movdqa [L_INBLOCKS+16], xmm0)
1160#endif
1161
1162 AS2( pxor xmm2, [L_LASTROUND])
1163 AS2( movdqu [WORD_REG(bx)], xmm2)
1164
1165 ATT_NOPREFIX
1166 ASJ( jle, 7, f)
1167 INTEL_NOPREFIX
1168 AS2( mov [L_LENGTH], WORD_REG(cx))
1169 AS2( test WORD_REG(cx), 1)
1170 ATT_NOPREFIX
1171 ASJ( jnz, 1, b)
1172 INTEL_NOPREFIX
1173#if CRYPTOPP_BOOL_X64
1174 AS2( movdqa xmm0, [L_INCREMENTS])
1175 AS2( paddq xmm0, [L_INBLOCKS])
1176 AS2( movdqa [L_INBLOCKS], xmm0)
1177#endif
1178 ATT_NOPREFIX
1179 ASJ( jmp, 3, b)
1180 INTEL_NOPREFIX
1181
1182 ASL(7)
1183 // erase keys on stack
1184 AS2( xorps xmm0, xmm0)
1185 AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
1186 AS2( movaps [WORD_REG(ax)-7*16], xmm0)
1187 AS2( movaps [WORD_REG(ax)-6*16], xmm0)
1188 AS2( movaps [WORD_REG(ax)-5*16], xmm0)
1189 AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1190 AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1191 AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1192 AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1193 AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1194 AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1195 AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1196 AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1197 AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1198 AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1199 AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1200#if CRYPTOPP_BOOL_X86
1201 AS2( mov esp, [L_SP])
1202 AS1( emms)
1203#endif
1204 AS_POP_IF86(bp)
1205 AS_POP_IF86(bx)
1206#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1207 AS_POP_IF86(di)
1208 AS_POP_IF86(si)
1209 AS1(ret)
1210#endif
1211#ifdef CRYPTOPP_GENERATE_X64_MASM
1212 pop r12
1213 pop rbx
1214 pop rdi
1215 pop rsi
1216 ret
1217 Rijndael_Enc_AdvancedProcessBlocks ENDP
1218#endif
1219#ifdef __GNUC__
1220 ATT_PREFIX
1221 :
1222 : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1223 : "memory", "cc", "%eax"
1224 #if CRYPTOPP_BOOL_X64
1225 , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1226 #endif
1227 );
1228#endif
1229}
1230
1231#endif
1232
1233#ifndef CRYPTOPP_GENERATE_X64_MASM
1234
1235#ifdef CRYPTOPP_X64_MASM_AVAILABLE
1236extern "C" {
1237void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1238}
1239#endif
1240
1241#if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1242size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1243{
1244#if CRYPTOPP_AESNI_AVAILABLE
1245 if (HasAESNI())
1246 return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1247#endif
1248#if CRYPTOPP_ARM_AES_AVAILABLE
1249 if (HasAES())
1250 return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1251#endif
1252#if CRYPTOPP_POWER8_AES_AVAILABLE
1253 if (HasAES())
1254 return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1255#endif
1256
1257#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1258 if (HasSSE2())
1259 {
1260 if (length < BLOCKSIZE)
1261 return length;
1262
1263 static const byte *zeros = (const byte*)(Te+256);
1264 m_aliasBlock.SetMark(m_aliasBlock.size());
1265 byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1266
1267 // round up to nearest 256 byte boundary
1268 space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1269 while (AliasedWithTable(space, space + sizeof(Locals)))
1270 {
1271 space += 256;
1272 CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1273 }
1274
1275 size_t increment = BLOCKSIZE;
1276 if (flags & BT_ReverseDirection)
1277 {
1278 CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1279 inBlocks += length - BLOCKSIZE;
1280 xorBlocks += length - BLOCKSIZE;
1281 outBlocks += length - BLOCKSIZE;
1282 increment = 0-increment;
1283 }
1284
1285 Locals &locals = *(Locals *)(void *)space;
1286
1287 locals.inBlocks = inBlocks;
1288 locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1289 locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1290 locals.outBlocks = outBlocks;
1291
1292 locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1293 locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1294 locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1295 locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1296
1297 locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1298 int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1299 locals.keysBegin = (12-keysToCopy)*16;
1300
1301 Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);
1302
1303 return length % BLOCKSIZE;
1304 }
1305#endif
1306
1307 return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1308}
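// Calling convention sketch (illustrative): for bulk processing the caller
// passes whole 16-byte blocks and receives the length of the unprocessed
// tail back:
//
//   size_t leftover = enc.AdvancedProcessBlocks(in, NULLPTR, out, length, 0);
//   // leftover == length % 16; the full blocks were encrypted from in to out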
1309
1310size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1311{
1312#if CRYPTOPP_AESNI_AVAILABLE
1313 if (HasAESNI())
1314 return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1315#endif
1316#if CRYPTOPP_ARM_AES_AVAILABLE
1317 if (HasAES())
1318 return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1319#endif
1320#if CRYPTOPP_POWER8_AES_AVAILABLE
1321 if (HasAES())
1322 return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1323#endif
1324
1325 return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1326}
1327#endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1328
1329NAMESPACE_END
1330
1331#endif
1332#endif