Crypto++ 8.2
speck64_simd.cpp
// speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck64_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
#  include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK64_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

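// All of the kernels below implement the same SPECK-64 round on packed
// 32-bit words. With (x, y) the two words of a block and k the round key:
//
//   Encrypt: x = (RotateRight(x, 8) + y) ^ k;  y = RotateLeft(y, 3) ^ x
//   Decrypt: y = RotateRight(y ^ x, 3);        x = RotateLeft((x ^ k) - y, 8)
//
// Each 128-bit register holds the x (or y) words of four blocks, so one pass
// of the loop advances four blocks by one round.
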
// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

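// UnpackLow32/UnpackHigh32 mimic the 32-bit SSE unpack operations
// (_mm_unpacklo_epi32/_mm_unpackhi_epi32) using vget_low/vget_high plus vzip.
// The block functions use them to re-interleave the x and y words back into
// whole blocks once the rounds are finished.
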
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64

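// Rotating a 32-bit lane by 8 bits only moves whole bytes, so a single table
// lookup (vqtbl1q_u8) with a fixed byte mask replaces the shift/shift/OR
// sequence. vqtbl1q_u8 needs Aarch32 or Aarch64, hence the guard above; other
// ARM targets fall back to the generic templates.
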
inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = vaddq_u32(x1, y1);
        x1 = veorq_u32(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = veorq_u32(y1, x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = veorq_u32(x1, rk);
        x1 = vsubq_u32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}

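// The 6-block variants below run the same rounds on three registers (six
// blocks) at a time; the extra independent work helps hide the latency of the
// add/xor/shuffle chains.
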
inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = vaddq_u32(x1, y1);
        x2 = vaddq_u32(x2, y2);
        x3 = vaddq_u32(x3, y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        x1 = vsubq_u32(x1, y1);
        x2 = vsubq_u32(x2, y2);
        x3 = vsubq_u32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

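// Notes on the SSE code paths:
//  - The rotate-by-8 specializations use the SSSE3 byte shuffle
//    (_mm_shuffle_epi8); _mm_set_epi8 lists bytes from the most significant
//    position down to byte 0.
//  - The block functions below cast to __m128 so _mm_shuffle_ps can gather
//    words from two registers at once. That gives the same de-interleave as
//    NEON's vuzp, and _mm_unpacklo_epi32/_mm_unpackhi_epi32 undo it at the end.
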
inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = _mm_add_epi32(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}

inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = _mm_add_epi32(x1, y1);
        x2 = _mm_add_epi32(x2, y2);
        x3 = _mm_add_epi32(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x2 = _mm_sub_epi32(x2, y2);
        x3 = _mm_sub_epi32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    // This is roughly the SSE equivalent to ARM vzip32
    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}

#endif // CRYPTOPP_SSE41_AVAILABLE

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

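// Altivec only provides a rotate-left (vec_rl), so a right rotate by C bits is
// expressed as a left rotate by 32-C. Either template should compile to a
// single vrlw with a splatted rotate count.
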
void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        x1 = RotateRight32<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft32<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y1 = RotateRight32<3>(y1);

        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
}

void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);

        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
    uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
    uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
    uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
    uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
    uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
    uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
#if CRYPTOPP_POWER8_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[i]);
#else
        // subkeys has extra elements so memory backs the last subkey
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        uint32x4_p rk = VecLoad(subkeys+i);
        rk = VecPermute(rk, rk, m);
#endif

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);

        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);

        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

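// The wrappers below are the entry points used by speck.cpp. Each one hands
// the 1-block and 6-block kernels defined above to the matching
// AdvancedProcessBlocks64_6x2_* adapter from adv_simd.h, which (broadly
// speaking) loads and stores the blocks, applies xorBlocks when the cipher
// mode asks for it, and falls back to single blocks for any tail shorter
// than six blocks.
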
// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

// ***************************** Altivec ***************************** //

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif

NAMESPACE_END
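
// Example (illustrative sketch only, not part of the library): one way an
// application can reach these kernels through the public API. A 16-byte key
// selects SPECK64/128, and handing ProcessData several blocks at once lets
// AdvancedProcessBlocks dispatch to the SIMD code when the CPU supports it.
//
//   #include "speck.h"
//   #include "modes.h"
//   #include "osrng.h"
//   #include "secblock.h"
//
//   CryptoPP::AutoSeededRandomPool prng;
//   CryptoPP::SecByteBlock key(16);
//   prng.GenerateBlock(key, key.size());
//
//   CryptoPP::ECB_Mode<CryptoPP::SPECK64>::Encryption enc;
//   enc.SetKey(key, key.size());
//
//   CryptoPP::byte buf[48] = {0};            // six 8-byte blocks
//   enc.ProcessData(buf, buf, sizeof(buf));  // encrypt in place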