Crypto++ 8.2
Free C&
lea_simd.cpp
1// lea_simd.cpp - written and placed in the public domain by Jeffrey Walton
2//
3// This source file uses intrinsics and built-ins to gain access to
4// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
7
8#include "pch.h"
9#include "config.h"
10
11#include "lea.h"
12#include "misc.h"
13
14// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both lea.cpp and lea_simd.cpp.
16// #undef CRYPTOPP_SSSE3_AVAILABLE
17// #undef CRYPTOPP_ARM_NEON_AVAILABLE
18
19#if (CRYPTOPP_SSSE3_AVAILABLE)
20# include "adv_simd.h"
21# include <pmmintrin.h>
22# include <tmmintrin.h>
23#endif
24
25#if defined(__XOP__)
26# include <ammintrin.h>
27#endif
28
29#if defined(__AVX512F__)
30# define CRYPTOPP_AVX512_ROTATE 1
31# include <immintrin.h>
32#endif
33
34// C1189: error: This header is specific to ARM targets
35#if (CRYPTOPP_ARM_NEON_AVAILABLE)
36# include "adv_simd.h"
37# ifndef _M_ARM64
38# include <arm_neon.h>
39# endif
40#endif
41
42#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
43# include <stdint.h>
44# include <arm_acle.h>
45#endif
46
47// Do not port this to POWER architecture. Naively we hoped
48// for a 2x to 3x speedup. The result was a 5x slow down.
49// The table below shows MiB/s and cpb.
50//
51// C++:
52// <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64
53// <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48
54// <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2
55//
56// Power8:
57// <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7
58// <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1
59// <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0
60
61#undef CRYPTOPP_POWER8_AVAILABLE
62#if defined(CRYPTOPP_POWER8_AVAILABLE)
63# include "adv_simd.h"
64# include "ppc_simd.h"
65#endif
66
67// Squash MS LNK4221 and libtool warnings
68extern const char LEA_SIMD_FNAME[] = __FILE__;
69
70ANONYMOUS_NAMESPACE_BEGIN
71
72using CryptoPP::word32;
73
74// *************************** ARM NEON ***************************//
75
76#if (CRYPTOPP_ARM_NEON_AVAILABLE)
77
78inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
79{
80 return veorq_u32(a, b);
81}
82
83inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
84{
85 return vaddq_u32(a, b);
86}
87
88inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
89{
90 return vsubq_u32(a, b);
91}
92
93template <unsigned int R>
94inline uint32x4_t RotateLeft(const uint32x4_t& val)
95{
96 const uint32x4_t a(vshlq_n_u32(val, R));
97 const uint32x4_t b(vshrq_n_u32(val, 32 - R));
98 return vorrq_u32(a, b);
99}
100
101template <unsigned int R>
102inline uint32x4_t RotateRight(const uint32x4_t& val)
103{
104 const uint32x4_t a(vshlq_n_u32(val, 32 - R));
105 const uint32x4_t b(vshrq_n_u32(val, R));
106 return vorrq_u32(a, b);
107}
108
109#if defined(__aarch32__) || defined(__aarch64__)
110template <>
111inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
112{
113#if (CRYPTOPP_BIG_ENDIAN)
114 const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
115 const uint8x16_t mask = vld1q_u8(maskb);
116#else
117 const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
118 const uint8x16_t mask = vld1q_u8(maskb);
119#endif
120
121 return vreinterpretq_u32_u8(
122 vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
123}
124
125template <>
126inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
127{
128#if (CRYPTOPP_BIG_ENDIAN)
129 const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
130 const uint8x16_t mask = vld1q_u8(maskb);
131#else
132 const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
133 const uint8x16_t mask = vld1q_u8(maskb);
134#endif
135
136 return vreinterpretq_u32_u8(
137 vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
138}
139#endif
140
141uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
142{
143 uint32x2_t a1 = vget_low_u32(a);
144 uint32x2_t b1 = vget_low_u32(b);
145 uint32x2x2_t result = vzip_u32(a1, b1);
146 return vcombine_u32(result.val[0], result.val[1]);
147}
148
149uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
150{
151 uint32x2_t a1 = vget_high_u32(a);
152 uint32x2_t b1 = vget_high_u32(b);
153 uint32x2x2_t result = vzip_u32(a1, b1);
154 return vcombine_u32(result.val[0], result.val[1]);
155}
156
157uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
158{
159 uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
160 uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
161 return (uint32x4_t)vcombine_u64(a1, b1);
162}
163
164uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
165{
166 uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
167 uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
168 return (uint32x4_t)vcombine_u64(a1, b1);
169}
170
171template <unsigned int IDX>
172inline uint32x4_t LoadKey(const word32 rkey[])
173{
174 return vdupq_n_u32(rkey[IDX]);
175}
176
177template <unsigned int IDX>
178inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
179{
180 // Should not be instantiated
181 CRYPTOPP_ASSERT(0);;
182 return vmovq_n_u32(0);
183}
184
185template <>
186inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
187{
188 const uint32x4_t r1 = UnpackLow32(a, b);
189 const uint32x4_t r2 = UnpackLow32(c, d);
190 return UnpackLow64(r1, r2);
191}
192
193template <>
194inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
195{
196 const uint32x4_t r1 = UnpackLow32(a, b);
197 const uint32x4_t r2 = UnpackLow32(c, d);
198 return UnpackHigh64(r1, r2);
199}
200
201template <>
202inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
203{
204 const uint32x4_t r1 = UnpackHigh32(a, b);
205 const uint32x4_t r2 = UnpackHigh32(c, d);
206 return UnpackLow64(r1, r2);
207}
208
209template <>
210inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
211{
212 const uint32x4_t r1 = UnpackHigh32(a, b);
213 const uint32x4_t r2 = UnpackHigh32(c, d);
214 return UnpackHigh64(r1, r2);
215}
216
217template <unsigned int IDX>
218inline uint32x4_t UnpackNEON(const uint32x4_t& v)
219{
220 // Should not be instantiated
221 CRYPTOPP_ASSERT(0);;
222 return vmovq_n_u32(0);
223}
224
225template <>
226inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
227{
228 // Splat to all lanes
229 return vdupq_n_u32(vgetq_lane_u32(v, 0));
230}
231
232template <>
233inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
234{
235 // Splat to all lanes
236 return vdupq_n_u32(vgetq_lane_u32(v, 1));
237}
238
239template <>
240inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
241{
242 // Splat to all lanes
243 return vdupq_n_u32(vgetq_lane_u32(v, 2));
244}
245
246template <>
247inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
248{
249 // Splat to all lanes
250 return vdupq_n_u32(vgetq_lane_u32(v, 3));
251}
252
253template <unsigned int IDX>
254inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
255{
256 return UnpackNEON<IDX>(a, b, c, d);
257}
258
259template <unsigned int IDX>
260inline uint32x4_t RepackNEON(const uint32x4_t& v)
261{
262 return UnpackNEON<IDX>(v);
263}
264
265#endif // CRYPTOPP_ARM_NEON_AVAILABLE
266
267// *************************** IA-32 ***************************//
268
269#if (CRYPTOPP_SSSE3_AVAILABLE)
270
271inline __m128i Xor(const __m128i& a, const __m128i& b)
272{
273 return _mm_xor_si128(a, b);
274}
275
276inline __m128i Add(const __m128i& a, const __m128i& b)
277{
278 return _mm_add_epi32(a, b);
279}
280
281inline __m128i Sub(const __m128i& a, const __m128i& b)
282{
283 return _mm_sub_epi32(a, b);
284}
285
286template <unsigned int R>
287inline __m128i RotateLeft(const __m128i& val)
288{
289#if defined(__XOP__)
290 return _mm_roti_epi32(val, R);
291#else
292 return _mm_or_si128(
293 _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
294#endif
295}
296
297template <unsigned int R>
298inline __m128i RotateRight(const __m128i& val)
299{
300#if defined(__XOP__)
301 return _mm_roti_epi32(val, 32-R);
302#else
303 return _mm_or_si128(
304 _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
305#endif
306}
307
308// Faster than two Shifts and an Or.
309template <>
310inline __m128i RotateLeft<8>(const __m128i& val)
311{
312#if defined(__XOP__)
313 return _mm_roti_epi32(val, 8);
314#else
315 const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
316 return _mm_shuffle_epi8(val, mask);
317#endif
318}
319
320// Faster than two Shifts and an Or.
321template <>
322inline __m128i RotateRight<8>(const __m128i& val)
323{
324#if defined(__XOP__)
325 return _mm_roti_epi32(val, 32-8);
326#else
327 const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
328 return _mm_shuffle_epi8(val, mask);
329#endif
330}
331
332template <unsigned int IDX>
333inline __m128i LoadKey(const word32 rkey[])
334{
335 float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
336 return _mm_castps_si128(_mm_load_ps1(&rk));
337}
338
339template <unsigned int IDX>
340inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
341{
342 // Should not be instantiated
343 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
344 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
346 return _mm_setzero_si128();
347}
348
349template <>
350inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
351{
352 // LEA is little-endian oriented, so there is no need for a separate shuffle.
353 const __m128i r1 = _mm_unpacklo_epi32(a, b);
354 const __m128i r2 = _mm_unpacklo_epi32(c, d);
355 return _mm_unpacklo_epi64(r1, r2);
356}
357
358template <>
359inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
360{
361 // LEA is little-endian oriented, so there is no need for a separate shuffle.
362 const __m128i r1 = _mm_unpacklo_epi32(a, b);
363 const __m128i r2 = _mm_unpacklo_epi32(c, d);
364 return _mm_unpackhi_epi64(r1, r2);
365}
366
367template <>
368inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
369{
370 // LEA is little-endian oriented, so there is no need for a separate shuffle.
371 const __m128i r1 = _mm_unpackhi_epi32(a, b);
372 const __m128i r2 = _mm_unpackhi_epi32(c, d);
373 return _mm_unpacklo_epi64(r1, r2);
374}
375
376template <>
377inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
378{
379 // LEA is little-endian oriented, so there is no need for a separate shuffle.
380 const __m128i r1 = _mm_unpackhi_epi32(a, b);
381 const __m128i r2 = _mm_unpackhi_epi32(c, d);
382 return _mm_unpackhi_epi64(r1, r2);
383}
384
385template <unsigned int IDX>
386inline __m128i UnpackXMM(const __m128i& v)
387{
388 // Should not be instantiated
389 CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
390 return _mm_setzero_si128();
391}
392
393template <>
394inline __m128i UnpackXMM<0>(const __m128i& v)
395{
396 // Splat to all lanes
397 return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
398}
399
400template <>
401inline __m128i UnpackXMM<1>(const __m128i& v)
402{
403 // Splat to all lanes
404 return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
405}
406
407template <>
408inline __m128i UnpackXMM<2>(const __m128i& v)
409{
410 // Splat to all lanes
411 return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
412}
413
414template <>
415inline __m128i UnpackXMM<3>(const __m128i& v)
416{
417 // Splat to all lanes
418 return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
419}
420
421template <unsigned int IDX>
422inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
423{
424 return UnpackXMM<IDX>(a, b, c, d);
425}
426
427template <unsigned int IDX>
428inline __m128i RepackXMM(const __m128i& v)
429{
430 return UnpackXMM<IDX>(v);
431}
432
433#endif // CRYPTOPP_SSSE3_AVAILABLE
434
435// *************************** Power8 ***************************//
436
437#if (CRYPTOPP_POWER8_AVAILABLE)
438
439using CryptoPP::uint8x16_p;
440using CryptoPP::uint32x4_p;
441using CryptoPP::uint64x2_p;
442
443inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
444{
445 return VecXor(a, b);
446}
447
448inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
449{
450 return VecAdd(a, b);
451}
452
453inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
454{
455 return VecSub(a, b);
456}
457
458template <unsigned int R>
459inline uint32x4_p RotateLeft(const uint32x4_p& val)
460{
461 const uint32x4_p m = {R, R, R, R};
462 return vec_rl(val, m);
463}
464
465template <unsigned int R>
466inline uint32x4_p RotateRight(const uint32x4_p& val)
467{
468 const uint32x4_p m = {32-R, 32-R, 32-R, 32-R};
469 return vec_rl(val, m);
470}
471
472template <unsigned int IDX>
473inline uint32x4_p LoadKey(const word32 rkey[])
474{
475 return vec_splats(rkey[IDX]);
476}
477
478template <unsigned int IDX>
479inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
480{
481 // Should not be instantiated
482 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
483 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
485 return VecXor(a, a);
486}
487
488template <>
489inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
490{
491 const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
492 const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
493 return (uint32x4_p)vec_mergel(r1, r2);
494}
495
496template <>
497inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
498{
499 const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
500 const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
501 return (uint32x4_p)vec_mergeh(r1, r2);
502}
503
504template <>
505inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
506{
507 const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
508 const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
509 return (uint32x4_p)vec_mergel(r1, r2);
510}
511
512template <>
513inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
514{
515 const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
516 const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
517 return (uint32x4_p)vec_mergeh(r1, r2);
518}
519
520template <unsigned int IDX>
521inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
522{
523 // Should not be instantiated
525 return VecXor(v, v);
526}
527
528template <>
529inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
530{
531 // Splat to all lanes
532 const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
533 return (uint32x4_p)VecPermute(v, v, m);
534}
535
536template <>
537inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
538{
539 // Splat to all lanes
540 const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
541 return (uint32x4_p)VecPermute(v, v, m);
542}
543
544template <>
545inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
546{
547 // Splat to all lanes
548 const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
549 return (uint32x4_p)VecPermute(v, v, m);
550}
551
552template <>
553inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
554{
555 // Splat to all lanes
556 const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
557 return (uint32x4_p)VecPermute(v, v, m);
558}
559
560template <unsigned int IDX>
561inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
562{
563 return UnpackSIMD<IDX>(a, b, c, d);
564}
565
566template <unsigned int IDX>
567inline uint32x4_p RepackSIMD(const uint32x4_p& v)
568{
569 return UnpackSIMD<IDX>(v);
570}
571
572#endif // CRYPTOPP_POWER8_AVAILABLE
573
574// *************************** LEA Encryption ***************************//
575
576#if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE)
577
578template <class W>
579inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds)
580{
581 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
582 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
583 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
584 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
585 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
586 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
587 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
588 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
589 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
590 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
591 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
592 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
593
594 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
595 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
596 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
597 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
598 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
599 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
600 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
601 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
602 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
603 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
604 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
605 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
606
607 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
608 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
609 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
610 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
611 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
612 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
613 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
614 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
615 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
616 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
617 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
618 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
619
620 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
621 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
622 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
623 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
624 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
625 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
626 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
627 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
628 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
629 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
630 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
631 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
632
633 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
634 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
635 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
636 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
637 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
638 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
639 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
640 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
641 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
642 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
643 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
644 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
645
646 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
647 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
648 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
649 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
650 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
651 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
652 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
653 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
654 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
655 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
656 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
657 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
658
659 if(rounds > 24)
660 {
661 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
662 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
663 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
664 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
665 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
666 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
667 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
668 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
669 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
670 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
671 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
672 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
673 }
674
675 if(rounds > 28)
676 {
677 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
678 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
679 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
680 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
681 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
682 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
683 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
684 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
685 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
686 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
687 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
688 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
689 }
690}
691
692// *************************** LEA Decryption ***************************//
693
694template <class W>
695inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds)
696{
697 if(rounds > 28)
698 {
699 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
700 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
701 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
702 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
703 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
704 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
705 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
706 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
707 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
708 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
709 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
710 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
711 }
712
713 if(rounds > 24)
714 {
715 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
716 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
717 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
718 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
719 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
720 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
721 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
722 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
723 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
724 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
725 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
726 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
727 }
728
729 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
730 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
731 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
732 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
733 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
734 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
735 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
736 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
737 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
738 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
739 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
740 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
741
742 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
743 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
744 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
745 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
746 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
747 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
748 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
749 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
750 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
751 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
752 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
753 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
754
755 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
756 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
757 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
758 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
759 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
760 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
761 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
762 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
763 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
764 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
765 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
766 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
767
768 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
769 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
770 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
771 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
772 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
773 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
774 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
775 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
776 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
777 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
778 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
779 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
780
781 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
782 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
783 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
784 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
785 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
786 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
787 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
788 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
789 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
790 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
791 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
792 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
793
794 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
795 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
796 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
797 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
798 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
799 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
800 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
801 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
802 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
803 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
804 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
805 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
806}
807
808#endif // LEA Encryption and Decryption
809
810// *************************** ARM NEON ***************************//
811
812#if (CRYPTOPP_ARM_NEON_AVAILABLE)
813
814inline void LEA_Enc_Block(uint32x4_t &block0,
815 const word32 *subkeys, unsigned int rounds)
816{
817 uint32x4_t temp[4];
818 temp[0] = UnpackNEON<0>(block0);
819 temp[1] = UnpackNEON<1>(block0);
820 temp[2] = UnpackNEON<2>(block0);
821 temp[3] = UnpackNEON<3>(block0);
822
823 LEA_Encryption(temp, subkeys, rounds);
824
825 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
826}
827
828inline void LEA_Dec_Block(uint32x4_t &block0,
829 const word32 *subkeys, unsigned int rounds)
830{
831 uint32x4_t temp[4];
832 temp[0] = UnpackNEON<0>(block0);
833 temp[1] = UnpackNEON<1>(block0);
834 temp[2] = UnpackNEON<2>(block0);
835 temp[3] = UnpackNEON<3>(block0);
836
837 LEA_Decryption(temp, subkeys, rounds);
838
839 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
840}
841
842inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
843 uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
844{
845 uint32x4_t temp[4];
846 temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
847 temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
848 temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
849 temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
850
851 LEA_Encryption(temp, subkeys, rounds);
852
853 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
854 block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
855 block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
856 block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
857}
858
859inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
860 uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
861{
862 uint32x4_t temp[4];
863 temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
864 temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
865 temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
866 temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
867
868 LEA_Decryption(temp, subkeys, rounds);
869
870 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
871 block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
872 block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
873 block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
874}
875
876#endif // CRYPTOPP_ARM_NEON_AVAILABLE
877
878// *************************** IA-32 ***************************//
879
880#if (CRYPTOPP_SSSE3_AVAILABLE)
881
882inline void LEA_Enc_Block(__m128i &block0,
883 const word32 *subkeys, unsigned int rounds)
884{
885 __m128i temp[4];
886 temp[0] = UnpackXMM<0>(block0);
887 temp[1] = UnpackXMM<1>(block0);
888 temp[2] = UnpackXMM<2>(block0);
889 temp[3] = UnpackXMM<3>(block0);
890
891 LEA_Encryption(temp, subkeys, rounds);
892
893 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
894}
895
896inline void LEA_Dec_Block(__m128i &block0,
897 const word32 *subkeys, unsigned int rounds)
898{
899 __m128i temp[4];
900 temp[0] = UnpackXMM<0>(block0);
901 temp[1] = UnpackXMM<1>(block0);
902 temp[2] = UnpackXMM<2>(block0);
903 temp[3] = UnpackXMM<3>(block0);
904
905 LEA_Decryption(temp, subkeys, rounds);
906
907 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
908}
909
910inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
911 __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
912{
913 __m128i temp[4];
914 temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
915 temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
916 temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
917 temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
918
919 LEA_Encryption(temp, subkeys, rounds);
920
921 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
922 block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
923 block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
924 block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
925}
926
927inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
928 __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
929{
930 __m128i temp[4];
931 temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
932 temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
933 temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
934 temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
935
936 LEA_Decryption(temp, subkeys, rounds);
937
938 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
939 block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
940 block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
941 block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
942}
943
944#endif // CRYPTOPP_SSSE3_AVAILABLE
945
946// *************************** Power8 ***************************//
947
948#if (CRYPTOPP_POWER8_AVAILABLE)
949
950inline void LEA_Enc_Block(uint32x4_p &block0,
951 const word32 *subkeys, unsigned int rounds)
952{
953 uint32x4_p temp[4];
954 temp[0] = UnpackSIMD<0>(block0);
955 temp[1] = UnpackSIMD<1>(block0);
956 temp[2] = UnpackSIMD<2>(block0);
957 temp[3] = UnpackSIMD<3>(block0);
958
959 LEA_Encryption(temp, subkeys, rounds);
960
961 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
962}
963
964inline void LEA_Dec_Block(uint32x4_p &block0,
965 const word32 *subkeys, unsigned int rounds)
966{
967 uint32x4_p temp[4];
968 temp[0] = UnpackSIMD<0>(block0);
969 temp[1] = UnpackSIMD<1>(block0);
970 temp[2] = UnpackSIMD<2>(block0);
971 temp[3] = UnpackSIMD<3>(block0);
972
973 LEA_Decryption(temp, subkeys, rounds);
974
975 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
976}
977
978inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
979 uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
980{
981 uint32x4_p temp[4];
982 temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
983 temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
984 temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
985 temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
986
987 LEA_Encryption(temp, subkeys, rounds);
988
989 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
990 block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
991 block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
992 block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
993}
994
995inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
996 uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
997{
998 uint32x4_p temp[4];
999 temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
1000 temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
1001 temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
1002 temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
1003
1004 LEA_Decryption(temp, subkeys, rounds);
1005
1006 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
1007 block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1008 block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1009 block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1010}
1011
1012#endif // CRYPTOPP_POWER8_AVAILABLE
1013
1014ANONYMOUS_NAMESPACE_END
1015
1016// *************************** SIMD Templates ***************************//
1017
1018NAMESPACE_BEGIN(CryptoPP)
1019
1020#if defined(CRYPTOPP_SSSE3_AVAILABLE)
1021size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1022 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1023{
1024 return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
1025 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1026}
1027
1028size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1029 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1030{
1031 return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
1032 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1033}
1034#endif // CRYPTOPP_SSSE3_AVAILABLE
1035
1036#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
1037size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1038 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1039{
1040 return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
1041 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1042}
1043
1044size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1045 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1046{
1047 return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
1048 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1049}
1050#endif // CRYPTOPP_ARM_NEON_AVAILABLE
1051
1052#if defined(CRYPTOPP_POWER8_AVAILABLE)
1053size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1054 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1055{
1056 return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks,
1057 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1058}
1059
1060size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1061 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1062{
1063 return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks,
1064 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1065}
1066#endif // CRYPTOPP_POWER8_AVAILABLE
1067
1068NAMESPACE_END
Template for AdvancedProcessBlocks and SIMD processing.
Library configuration file.
Classes for the LEA block cipher.
Utility functions for the Crypto++ library.
Crypto++ library namespace.
Precompiled header file.
Support functions for PowerPC and vector operations.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:129
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1010
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:119
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:916
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:139
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:956
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:939
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69