Crypto++ 8.2
Free C++ class library of cryptographic schemes
simon64_simd.cpp
// simon64_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "simon.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon64_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
#  include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON64_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap; // SunCC

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif
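// Within a little-endian 32-bit lane a rotation by 8 bits is a pure byte
// permutation, so a single table lookup (vqtbl1q_u8) replaces the two shifts
// and an OR used by the generic templates. For the left rotation, output
// bytes {0,1,2,3} of each lane come from input bytes {3,0,1,2}; for the
// right rotation, from input bytes {1,2,3,0}.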

inline uint32x4_t SIMON64_f(const uint32x4_t& val)
{
    return veorq_u32(RotateLeft32<2>(val),
        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
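// SIMON64_f is the SIMON round function f(x) = ROL2(x) ^ (ROL1(x) & ROL8(x)),
// evaluated on four 32-bit words at once. Each pair of Feistel rounds below
// computes y ^= f(x) ^ k and then x ^= f(y) ^ k', alternating which half the
// subkey is folded into instead of physically swapping the halves.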

inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
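// Note: block0 and block1 each carry four 32-bit words (two 64-bit SIMON64
// blocks' worth of data when the adv_simd.h template packs a full batch).
// vuzpq_u32 gathers the odd-indexed words into x1 and the even-indexed words
// into y1 so the rounds advance all packed blocks in parallel; UnpackLow32
// and UnpackHigh32 restore the original interleaving afterwards.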

inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
        y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)

inline void Swap128(__m128i& a, __m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
    // SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
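// The _mm_shuffle_epi8 masks encode the same per-lane byte rotations as the
// NEON tables above: _mm_set_epi8 lists bytes from position 15 down to 0, so
// in memory order the masks read {3,0,1,2, ...} for the left rotation and
// {1,2,3,0, ...} for the right rotation. When XOP is available,
// _mm_roti_epi32 rotates directly and no shuffle is needed.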

inline __m128i SIMON64_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft32<2>(v),
        _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
}

inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
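// Casting to __m128 lets _mm_shuffle_ps perform the de-interleave:
// _MM_SHUFFLE(3,1,3,1) gathers the odd-indexed 32-bit words of both inputs
// into x1 and _MM_SHUFFLE(2,0,2,0) gathers the even-indexed words into y1,
// mirroring the NEON vuzpq_u32 step. _mm_unpacklo_epi32 and
// _mm_unpackhi_epi32 re-interleave the words on output.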

inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadBE;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}
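// Altivec exposes only a rotate-left instruction (vec_rl), so a right
// rotation by C bits is performed as a left rotation by 32-C bits.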

inline uint32x4_p SIMON64_f(const uint32x4_p val)
{
    return VecXor(RotateLeft32<2>(val),
        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}

inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
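        // Broadcast the 32-bit subkeys to all four lanes. POWER8 can splat the
        // scalar directly; earlier targets load the key words with VecLoad and
        // replicate one word across the vector with a byte permute.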
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i);
        uint32x4_p rk2 = VecLoad(subkeys+i+1);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif
        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i+1);
        uint32x4_p rk2 = VecLoad(subkeys+i);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif
        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i);
        uint32x4_p rk2 = VecLoad(subkeys+i+1);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);

#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+rounds-1);
        rk = VecPermute(rk, rk, m);
#endif
        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk1 = VecLoad(subkeys+i+1);
        uint32x4_p rk2 = VecLoad(subkeys+i);
        rk1 = VecPermute(rk1, rk1, m);
        rk2 = VecPermute(rk2, rk2, m);
#endif
        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

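// The exported functions below are thin adapters: they bind the block kernels
// defined above to the AdvancedProcessBlocks64_6x2_* dispatch templates from
// adv_simd.h, which handle the block loading and storing, the xorBlocks input,
// and batches that do not fill a full set of six blocks.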
// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

NAMESPACE_END