Crypto++ 8.2
Free C++ class library of cryptographic schemes
chacha_simd.cpp
1// chacha_simd.cpp - written and placed in the public domain by
2// Jack Lloyd and Jeffrey Walton
3//
4// This source file uses intrinsics and built-ins to gain access to
5// SSE2, ARM NEON and ARMv8a, Power7 and Altivec instructions. A separate
6// source file is needed because additional CXXFLAGS are required to enable
7// the appropriate instruction sets in some build configurations.
8//
9// SSE2 implementation based on Botan's chacha_sse2.cpp. Many thanks
10// to Jack Lloyd and the Botan team for allowing us to use it.
11//
12// The SSE2 implementation is kind of unusual among Crypto++ algorithms.
13// We guard on CRYPTOPP_SSE2_INTRIN_AVAILABLE and use HasSSE2() at runtime. However,
14// if the compiler says a target machine has SSSE3 or XOP available (say, by
15// way of -march=native), then we can pull another 150 to 800 MB/s out of
16// ChaCha. To capture SSSE3 and XOP we use the compiler defines __SSSE3__ and
17// __XOP__ and forgo runtime tests.
18//
19// Runtime tests for HasSSSE3() and HasXop() are too expensive to make a
20// sub-case of SSE2. The rotates are on a critical path and the runtime tests
21// crush performance.
22//
23// Here are some relative numbers for ChaCha8:
24// * Intel Skylake, 3.0 GHz: SSE2 at 2160 MB/s; SSSE3 at 2310 MB/s.
25// * AMD Bulldozer, 3.3 GHz: SSE2 at 1680 MB/s; XOP at 2510 MB/s.
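// The gain from SSSE3 and XOP comes from the rotates: with SSSE3 the rotates
// by 8 and 16 become a single byte shuffle (pshufb), and with XOP every rotate
// becomes a single instruction (vprotd), instead of the three-instruction
// shift/shift/or sequence used by plain SSE2.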
26
27#include "pch.h"
28#include "config.h"
29
30#include "chacha.h"
31#include "misc.h"
32
33// Internal compiler error in GCC 3.3 and below
34#if defined(__GNUC__) && (__GNUC__ < 4)
35# undef CRYPTOPP_SSE2_INTRIN_AVAILABLE
36#endif
37
38#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
39# include <xmmintrin.h>
40# include <emmintrin.h>
41#endif
42
43#if defined(__SSSE3__)
44# include <tmmintrin.h>
45#endif
46
47#if defined(__XOP__)
48# include <ammintrin.h>
49#endif
50
51// C1189: error: This header is specific to ARM targets
52#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
53# include <arm_neon.h>
54#endif
55
56#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
57# include <stdint.h>
58# include <arm_acle.h>
59#endif
60
61#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
62# include "ppc_simd.h"
63#endif
64
65// Squash MS LNK4221 and libtool warnings
66extern const char CHACHA_SIMD_FNAME[] = __FILE__;
67
68ANONYMOUS_NAMESPACE_BEGIN
69
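// For reference, this is the scalar ChaCha quarter round and double round that
// the vector code below parallelizes. It is an illustrative sketch only; these
// helpers are not part of the library and are not called anywhere in this file.
// The rotation counts 16, 12, 8 and 7 are the ones implemented by the
// RotateLeft<>/VecRotateLeft<> specializations that follow.
inline void ChaCha_QuarterRound_Illustration(CryptoPP::word32& a, CryptoPP::word32& b,
                                             CryptoPP::word32& c, CryptoPP::word32& d)
{
    a += b; d ^= a; d = (d << 16) | (d >> 16);  // RotateLeft<16>
    c += d; b ^= c; b = (b << 12) | (b >> 20);  // RotateLeft<12>
    a += b; d ^= a; d = (d <<  8) | (d >> 24);  // RotateLeft<8>
    c += d; b ^= c; b = (b <<  7) | (b >> 25);  // RotateLeft<7>
}

inline void ChaCha_DoubleRound_Illustration(CryptoPP::word32 x[16])
{
    // Column rounds: the vector code computes the four columns in parallel,
    // with r*_0..r*_3 holding rows 0..3 of the 4x4 state.
    ChaCha_QuarterRound_Illustration(x[0], x[4], x[ 8], x[12]);
    ChaCha_QuarterRound_Illustration(x[1], x[5], x[ 9], x[13]);
    ChaCha_QuarterRound_Illustration(x[2], x[6], x[10], x[14]);
    ChaCha_QuarterRound_Illustration(x[3], x[7], x[11], x[15]);
    // Diagonal rounds: the vector code reaches these by rotating rows 1..3 with
    // Extract<1..3> (NEON), _mm_shuffle_epi32 (SSE2) or Shuffle<1..3> (Altivec),
    // then undoing the rotation afterwards.
    ChaCha_QuarterRound_Illustration(x[0], x[5], x[10], x[15]);
    ChaCha_QuarterRound_Illustration(x[1], x[6], x[11], x[12]);
    ChaCha_QuarterRound_Illustration(x[2], x[7], x[ 8], x[13]);
    ChaCha_QuarterRound_Illustration(x[3], x[4], x[ 9], x[14]);
}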
70// ***************************** NEON ***************************** //
71
72#if (CRYPTOPP_ARM_NEON_AVAILABLE)
73
74template <unsigned int R>
75inline uint32x4_t RotateLeft(const uint32x4_t& val)
76{
77 return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R));
78}
79
80template <unsigned int R>
81inline uint32x4_t RotateRight(const uint32x4_t& val)
82{
83 return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R));
84}
85
86template <>
87inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
88{
89#if defined(__aarch32__) || defined(__aarch64__)
90 const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
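// vqtbl1q_u8 builds each destination byte from the source byte named by the
// mask. Within the first little-endian 32-bit lane the indices {3,0,1,2} place
// source byte 3 at position 0 and shift bytes 0..2 up by one, which is exactly
// (x << 8) | (x >> 24) for that lane; the other index groups do the same for
// the remaining lanes.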
91 const uint8x16_t mask = vld1q_u8(maskb);
92
93 return vreinterpretq_u32_u8(
94 vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
95#else
96 // fallback to slower C++ rotation.
97 return vorrq_u32(vshlq_n_u32(val, 8),
98 vshrq_n_u32(val, 32 - 8));
99#endif
100}
101
102template <>
103inline uint32x4_t RotateLeft<16>(const uint32x4_t& val)
104{
105#if defined(__aarch32__) || defined(__aarch64__)
106 return vreinterpretq_u32_u16(
107 vrev32q_u16(vreinterpretq_u16_u32(val)));
108#else
109 // fallback to slower C++ rotation.
110 return vorrq_u32(vshlq_n_u32(val, 16),
111 vshrq_n_u32(val, 32 - 16));
112#endif
113}
114
115template <>
116inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
117{
118#if defined(__aarch32__) || defined(__aarch64__)
119 const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
120 const uint8x16_t mask = vld1q_u8(maskb);
121
122 return vreinterpretq_u32_u8(
123 vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
124#else
125 // fallback to slower C++ rotation.
126 return vorrq_u32(vshrq_n_u32(val, 8),
127 vshlq_n_u32(val, 32 - 8));
128#endif
129}
130
131template <>
132inline uint32x4_t RotateRight<16>(const uint32x4_t& val)
133{
134#if defined(__aarch32__) || defined(__aarch64__)
135 return vreinterpretq_u32_u16(
136 vrev32q_u16(vreinterpretq_u16_u32(val)));
137#else
138 // fallback to slower C++ rotation.
139 return vorrq_u32(vshrq_n_u32(val, 16),
140 vshlq_n_u32(val, 32 - 16));
141#endif
142}
143
144// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
145// rotation on the 128-bit vector word:
146// * [3,2,1,0] => [0,3,2,1] is Extract<1>(x)
147// * [3,2,1,0] => [1,0,3,2] is Extract<2>(x)
148// * [3,2,1,0] => [2,1,0,3] is Extract<3>(x)
149template <unsigned int S>
150inline uint32x4_t Extract(const uint32x4_t& val)
151{
152 return vextq_u32(val, val, S);
153}
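// Worked example: if val holds the lanes {a0,a1,a2,a3} (lane 0 first), then
// Extract<1>(val) yields {a1,a2,a3,a0}. The round function below uses this to
// rotate rows 1..3 of the state by one, two and three lanes, turning the
// column quarter rounds into diagonal quarter rounds and back again.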
154
155// Helper to perform 64-bit addition across two elements of 32-bit vectors
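// In ChaCha the 64-bit block counter occupies state words 12 and 13, that is,
// the 64-bit lane formed by the first two words of the fourth row. Adding
// {1,0,0,0}, {2,0,0,0} or {3,0,0,0} as a 64-bit element therefore applies the
// per-block offset with the carry from word 12 into word 13 handled for free.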
156inline uint32x4_t Add64(const uint32x4_t& a, const uint32x4_t& b)
157{
158 return vreinterpretq_u32_u64(
159 vaddq_u64(
160 vreinterpretq_u64_u32(a),
161 vreinterpretq_u64_u32(b)));
162}
163
164#endif // CRYPTOPP_ARM_NEON_AVAILABLE
165
166// ***************************** SSE2 ***************************** //
167
168#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
169
170template <unsigned int R>
171inline __m128i RotateLeft(const __m128i val)
172{
173#ifdef __XOP__
174 return _mm_roti_epi32(val, R);
175#else
176 return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
177#endif
178}
179
180template <>
181inline __m128i RotateLeft<8>(const __m128i val)
182{
183#if defined(__XOP__)
184 return _mm_roti_epi32(val, 8);
185#elif defined(__SSSE3__)
186 const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
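// Note on ordering: _mm_set_epi8 lists bytes from the most significant
// (byte 15) down to byte 0, so this is the same per-lane mapping as the NEON
// mask above: each 32-bit lane becomes (x << 8) | (x >> 24).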
187 return _mm_shuffle_epi8(val, mask);
188#else
189 return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32-8));
190#endif
191}
192
193template <>
194inline __m128i RotateLeft<16>(const __m128i val)
195{
196#if defined(__XOP__)
197 return _mm_roti_epi32(val, 16);
198#elif defined(__SSSE3__)
199 const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
200 return _mm_shuffle_epi8(val, mask);
201#else
202 return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32-16));
203#endif
204}
205
206#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE
207
208// **************************** Altivec **************************** //
209
210#if (CRYPTOPP_ALTIVEC_AVAILABLE)
211
212// ChaCha_OperateKeystream_POWER8 is optimized for POWER7. However, Altivec
213// is supported by using vec_ld and vec_st, and using a composite VecAdd
214// that supports 64-bit element adds. vec_ld and vec_st add significant
215// overhead when memory is not aligned. Despite the drawbacks Altivec
216// is profitable. The numbers for ChaCha8 are:
217//
218// PowerMac, C++, 2.0 GHz: 205 MB/s, 9.29 cpb
219// PowerMac, Altivec, 2.0 GHz: 471 MB/s, 4.09 cpb
220
221using CryptoPP::uint8x16_p;
222using CryptoPP::uint32x4_p;
223using CryptoPP::VecLoad;
224using CryptoPP::VecStore;
225using CryptoPP::VecPermute;
226
227// Permutes bytes in packed 32-bit words to little endian.
228// State is already in proper endian order. Input and
229// output must be permuted during load and save.
230inline uint32x4_p VecLoad32LE(const uint8_t src[16])
231{
232#if (CRYPTOPP_BIG_ENDIAN)
233 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
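// On big-endian targets the permute reverses the four bytes of each 32-bit
// word so the words hold the same values a little-endian load would produce;
// on little-endian targets the bytes are already in place and no permute is
// needed.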
234 const uint32x4_p val = VecLoad(src);
235 return VecPermute(val, val, mask);
236#else
237 return VecLoad(src);
238#endif
239}
240
241// Permutes bytes in packed 32-bit words to little endian.
242// State is already in proper endian order. Input and
243// output must be permuted during load and save.
244inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)
245{
246#if (CRYPTOPP_BIG_ENDIAN)
247 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
248 VecStore(VecPermute(val, val, mask), dest);
249#else
250 return VecStore(val, dest);
251#endif
252}
253
254// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
255// rotation on the 128-bit vector word:
256// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x)
257// * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x)
258// * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x)
259template <unsigned int S>
260inline uint32x4_p Shuffle(const uint32x4_p& val)
261{
262 CRYPTOPP_ASSERT(0);
263 return val;
264}
265
266template <>
267inline uint32x4_p Shuffle<1>(const uint32x4_p& val)
268{
269 const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
270 return VecPermute(val, val, mask);
271}
272
273template <>
274inline uint32x4_p Shuffle<2>(const uint32x4_p& val)
275{
276 const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
277 return VecPermute(val, val, mask);
278}
279
280template <>
281inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
282{
283 const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
284 return VecPermute(val, val, mask);
285}
286
287#endif // CRYPTOPP_ALTIVEC_AVAILABLE
288
289ANONYMOUS_NAMESPACE_END
290
291NAMESPACE_BEGIN(CryptoPP)
292
293// ***************************** NEON ***************************** //
294
295#if (CRYPTOPP_ARM_NEON_AVAILABLE)
296
297void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *output, unsigned int rounds)
298{
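// Each invocation produces four 64-byte keystream blocks (256 bytes).
// r<i>_<j> is row j of the 4x4 state for block i: row 0 holds the constants,
// rows 1 and 2 the key, and row 3 the block counter and IV. Blocks 1..3
// differ from block 0 only by the counter offsets in CTRS.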
299 const uint32x4_t state0 = vld1q_u32(state + 0*4);
300 const uint32x4_t state1 = vld1q_u32(state + 1*4);
301 const uint32x4_t state2 = vld1q_u32(state + 2*4);
302 const uint32x4_t state3 = vld1q_u32(state + 3*4);
303
304 const unsigned int w[] = {1,0,0,0, 2,0,0,0, 3,0,0,0};
305 const uint32x4_t CTRS[3] = {
306 vld1q_u32(w+0), vld1q_u32(w+4), vld1q_u32(w+8)
307 };
308
309 uint32x4_t r0_0 = state0;
310 uint32x4_t r0_1 = state1;
311 uint32x4_t r0_2 = state2;
312 uint32x4_t r0_3 = state3;
313
314 uint32x4_t r1_0 = state0;
315 uint32x4_t r1_1 = state1;
316 uint32x4_t r1_2 = state2;
317 uint32x4_t r1_3 = Add64(r0_3, CTRS[0]);
318
319 uint32x4_t r2_0 = state0;
320 uint32x4_t r2_1 = state1;
321 uint32x4_t r2_2 = state2;
322 uint32x4_t r2_3 = Add64(r0_3, CTRS[1]);
323
324 uint32x4_t r3_0 = state0;
325 uint32x4_t r3_1 = state1;
326 uint32x4_t r3_2 = state2;
327 uint32x4_t r3_3 = Add64(r0_3, CTRS[2]);
328
329 for (int i = static_cast<int>(rounds); i > 0; i -= 2)
330 {
331 r0_0 = vaddq_u32(r0_0, r0_1);
332 r1_0 = vaddq_u32(r1_0, r1_1);
333 r2_0 = vaddq_u32(r2_0, r2_1);
334 r3_0 = vaddq_u32(r3_0, r3_1);
335
336 r0_3 = veorq_u32(r0_3, r0_0);
337 r1_3 = veorq_u32(r1_3, r1_0);
338 r2_3 = veorq_u32(r2_3, r2_0);
339 r3_3 = veorq_u32(r3_3, r3_0);
340
341 r0_3 = RotateLeft<16>(r0_3);
342 r1_3 = RotateLeft<16>(r1_3);
343 r2_3 = RotateLeft<16>(r2_3);
344 r3_3 = RotateLeft<16>(r3_3);
345
346 r0_2 = vaddq_u32(r0_2, r0_3);
347 r1_2 = vaddq_u32(r1_2, r1_3);
348 r2_2 = vaddq_u32(r2_2, r2_3);
349 r3_2 = vaddq_u32(r3_2, r3_3);
350
351 r0_1 = veorq_u32(r0_1, r0_2);
352 r1_1 = veorq_u32(r1_1, r1_2);
353 r2_1 = veorq_u32(r2_1, r2_2);
354 r3_1 = veorq_u32(r3_1, r3_2);
355
356 r0_1 = RotateLeft<12>(r0_1);
357 r1_1 = RotateLeft<12>(r1_1);
358 r2_1 = RotateLeft<12>(r2_1);
359 r3_1 = RotateLeft<12>(r3_1);
360
361 r0_0 = vaddq_u32(r0_0, r0_1);
362 r1_0 = vaddq_u32(r1_0, r1_1);
363 r2_0 = vaddq_u32(r2_0, r2_1);
364 r3_0 = vaddq_u32(r3_0, r3_1);
365
366 r0_3 = veorq_u32(r0_3, r0_0);
367 r1_3 = veorq_u32(r1_3, r1_0);
368 r2_3 = veorq_u32(r2_3, r2_0);
369 r3_3 = veorq_u32(r3_3, r3_0);
370
371 r0_3 = RotateLeft<8>(r0_3);
372 r1_3 = RotateLeft<8>(r1_3);
373 r2_3 = RotateLeft<8>(r2_3);
374 r3_3 = RotateLeft<8>(r3_3);
375
376 r0_2 = vaddq_u32(r0_2, r0_3);
377 r1_2 = vaddq_u32(r1_2, r1_3);
378 r2_2 = vaddq_u32(r2_2, r2_3);
379 r3_2 = vaddq_u32(r3_2, r3_3);
380
381 r0_1 = veorq_u32(r0_1, r0_2);
382 r1_1 = veorq_u32(r1_1, r1_2);
383 r2_1 = veorq_u32(r2_1, r2_2);
384 r3_1 = veorq_u32(r3_1, r3_2);
385
386 r0_1 = RotateLeft<7>(r0_1);
387 r1_1 = RotateLeft<7>(r1_1);
388 r2_1 = RotateLeft<7>(r2_1);
389 r3_1 = RotateLeft<7>(r3_1);
390
391 r0_1 = Extract<1>(r0_1);
392 r0_2 = Extract<2>(r0_2);
393 r0_3 = Extract<3>(r0_3);
394
395 r1_1 = Extract<1>(r1_1);
396 r1_2 = Extract<2>(r1_2);
397 r1_3 = Extract<3>(r1_3);
398
399 r2_1 = Extract<1>(r2_1);
400 r2_2 = Extract<2>(r2_2);
401 r2_3 = Extract<3>(r2_3);
402
403 r3_1 = Extract<1>(r3_1);
404 r3_2 = Extract<2>(r3_2);
405 r3_3 = Extract<3>(r3_3);
406
407 r0_0 = vaddq_u32(r0_0, r0_1);
408 r1_0 = vaddq_u32(r1_0, r1_1);
409 r2_0 = vaddq_u32(r2_0, r2_1);
410 r3_0 = vaddq_u32(r3_0, r3_1);
411
412 r0_3 = veorq_u32(r0_3, r0_0);
413 r1_3 = veorq_u32(r1_3, r1_0);
414 r2_3 = veorq_u32(r2_3, r2_0);
415 r3_3 = veorq_u32(r3_3, r3_0);
416
417 r0_3 = RotateLeft<16>(r0_3);
418 r1_3 = RotateLeft<16>(r1_3);
419 r2_3 = RotateLeft<16>(r2_3);
420 r3_3 = RotateLeft<16>(r3_3);
421
422 r0_2 = vaddq_u32(r0_2, r0_3);
423 r1_2 = vaddq_u32(r1_2, r1_3);
424 r2_2 = vaddq_u32(r2_2, r2_3);
425 r3_2 = vaddq_u32(r3_2, r3_3);
426
427 r0_1 = veorq_u32(r0_1, r0_2);
428 r1_1 = veorq_u32(r1_1, r1_2);
429 r2_1 = veorq_u32(r2_1, r2_2);
430 r3_1 = veorq_u32(r3_1, r3_2);
431
432 r0_1 = RotateLeft<12>(r0_1);
433 r1_1 = RotateLeft<12>(r1_1);
434 r2_1 = RotateLeft<12>(r2_1);
435 r3_1 = RotateLeft<12>(r3_1);
436
437 r0_0 = vaddq_u32(r0_0, r0_1);
438 r1_0 = vaddq_u32(r1_0, r1_1);
439 r2_0 = vaddq_u32(r2_0, r2_1);
440 r3_0 = vaddq_u32(r3_0, r3_1);
441
442 r0_3 = veorq_u32(r0_3, r0_0);
443 r1_3 = veorq_u32(r1_3, r1_0);
444 r2_3 = veorq_u32(r2_3, r2_0);
445 r3_3 = veorq_u32(r3_3, r3_0);
446
447 r0_3 = RotateLeft<8>(r0_3);
448 r1_3 = RotateLeft<8>(r1_3);
449 r2_3 = RotateLeft<8>(r2_3);
450 r3_3 = RotateLeft<8>(r3_3);
451
452 r0_2 = vaddq_u32(r0_2, r0_3);
453 r1_2 = vaddq_u32(r1_2, r1_3);
454 r2_2 = vaddq_u32(r2_2, r2_3);
455 r3_2 = vaddq_u32(r3_2, r3_3);
456
457 r0_1 = veorq_u32(r0_1, r0_2);
458 r1_1 = veorq_u32(r1_1, r1_2);
459 r2_1 = veorq_u32(r2_1, r2_2);
460 r3_1 = veorq_u32(r3_1, r3_2);
461
462 r0_1 = RotateLeft<7>(r0_1);
463 r1_1 = RotateLeft<7>(r1_1);
464 r2_1 = RotateLeft<7>(r2_1);
465 r3_1 = RotateLeft<7>(r3_1);
466
467 r0_1 = Extract<3>(r0_1);
468 r0_2 = Extract<2>(r0_2);
469 r0_3 = Extract<1>(r0_3);
470
471 r1_1 = Extract<3>(r1_1);
472 r1_2 = Extract<2>(r1_2);
473 r1_3 = Extract<1>(r1_3);
474
475 r2_1 = Extract<3>(r2_1);
476 r2_2 = Extract<2>(r2_2);
477 r2_3 = Extract<1>(r2_3);
478
479 r3_1 = Extract<3>(r3_1);
480 r3_2 = Extract<2>(r3_2);
481 r3_3 = Extract<1>(r3_3);
482 }
483
484 r0_0 = vaddq_u32(r0_0, state0);
485 r0_1 = vaddq_u32(r0_1, state1);
486 r0_2 = vaddq_u32(r0_2, state2);
487 r0_3 = vaddq_u32(r0_3, state3);
488
489 r1_0 = vaddq_u32(r1_0, state0);
490 r1_1 = vaddq_u32(r1_1, state1);
491 r1_2 = vaddq_u32(r1_2, state2);
492 r1_3 = vaddq_u32(r1_3, state3);
493 r1_3 = Add64(r1_3, CTRS[0]);
494
495 r2_0 = vaddq_u32(r2_0, state0);
496 r2_1 = vaddq_u32(r2_1, state1);
497 r2_2 = vaddq_u32(r2_2, state2);
498 r2_3 = vaddq_u32(r2_3, state3);
499 r2_3 = Add64(r2_3, CTRS[1]);
500
501 r3_0 = vaddq_u32(r3_0, state0);
502 r3_1 = vaddq_u32(r3_1, state1);
503 r3_2 = vaddq_u32(r3_2, state2);
504 r3_3 = vaddq_u32(r3_3, state3);
505 r3_3 = Add64(r3_3, CTRS[2]);
506
507 if (input)
508 {
509 r0_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 0*16)), r0_0);
510 r0_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 1*16)), r0_1);
511 r0_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 2*16)), r0_2);
512 r0_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 3*16)), r0_3);
513 }
514
515 vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0));
516 vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1));
517 vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2));
518 vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3));
519
520 if (input)
521 {
522 r1_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 4*16)), r1_0);
523 r1_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 5*16)), r1_1);
524 r1_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 6*16)), r1_2);
525 r1_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 7*16)), r1_3);
526 }
527
528 vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0));
529 vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1));
530 vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2));
531 vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3));
532
533 if (input)
534 {
535 r2_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 8*16)), r2_0);
536 r2_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 9*16)), r2_1);
537 r2_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 10*16)), r2_2);
538 r2_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 11*16)), r2_3);
539 }
540
541 vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0));
542 vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1));
543 vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2));
544 vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3));
545
546 if (input)
547 {
548 r3_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 12*16)), r3_0);
549 r3_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 13*16)), r3_1);
550 r3_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 14*16)), r3_2);
551 r3_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 15*16)), r3_3);
552 }
553
554 vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0));
555 vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1));
556 vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2));
557 vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3));
558}
559
560#endif // CRYPTOPP_ARM_NEON_AVAILABLE
561
562// ***************************** SSE2 ***************************** //
563
564#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
565
566void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
567{
568 const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
569 const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
570 __m128i* output_mm = reinterpret_cast<__m128i*>(output);
571
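// The state is read with aligned loads (_mm_load_si128), so the caller must
// supply a 16-byte aligned state array; input and output go through unaligned
// loads and stores and may have any alignment.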
572 const __m128i state0 = _mm_load_si128(state_mm + 0);
573 const __m128i state1 = _mm_load_si128(state_mm + 1);
574 const __m128i state2 = _mm_load_si128(state_mm + 2);
575 const __m128i state3 = _mm_load_si128(state_mm + 3);
576
577 __m128i r0_0 = state0;
578 __m128i r0_1 = state1;
579 __m128i r0_2 = state2;
580 __m128i r0_3 = state3;
581
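// As in the NEON code, the per-block counter offsets are added as 64-bit
// quantities (_mm_add_epi64) so a carry propagates from state word 12 into
// word 13.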
582 __m128i r1_0 = state0;
583 __m128i r1_1 = state1;
584 __m128i r1_2 = state2;
585 __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));
586
587 __m128i r2_0 = state0;
588 __m128i r2_1 = state1;
589 __m128i r2_2 = state2;
590 __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));
591
592 __m128i r3_0 = state0;
593 __m128i r3_1 = state1;
594 __m128i r3_2 = state2;
595 __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
596
597 for (int i = static_cast<int>(rounds); i > 0; i -= 2)
598 {
599 r0_0 = _mm_add_epi32(r0_0, r0_1);
600 r1_0 = _mm_add_epi32(r1_0, r1_1);
601 r2_0 = _mm_add_epi32(r2_0, r2_1);
602 r3_0 = _mm_add_epi32(r3_0, r3_1);
603
604 r0_3 = _mm_xor_si128(r0_3, r0_0);
605 r1_3 = _mm_xor_si128(r1_3, r1_0);
606 r2_3 = _mm_xor_si128(r2_3, r2_0);
607 r3_3 = _mm_xor_si128(r3_3, r3_0);
608
609 r0_3 = RotateLeft<16>(r0_3);
610 r1_3 = RotateLeft<16>(r1_3);
611 r2_3 = RotateLeft<16>(r2_3);
612 r3_3 = RotateLeft<16>(r3_3);
613
614 r0_2 = _mm_add_epi32(r0_2, r0_3);
615 r1_2 = _mm_add_epi32(r1_2, r1_3);
616 r2_2 = _mm_add_epi32(r2_2, r2_3);
617 r3_2 = _mm_add_epi32(r3_2, r3_3);
618
619 r0_1 = _mm_xor_si128(r0_1, r0_2);
620 r1_1 = _mm_xor_si128(r1_1, r1_2);
621 r2_1 = _mm_xor_si128(r2_1, r2_2);
622 r3_1 = _mm_xor_si128(r3_1, r3_2);
623
624 r0_1 = RotateLeft<12>(r0_1);
625 r1_1 = RotateLeft<12>(r1_1);
626 r2_1 = RotateLeft<12>(r2_1);
627 r3_1 = RotateLeft<12>(r3_1);
628
629 r0_0 = _mm_add_epi32(r0_0, r0_1);
630 r1_0 = _mm_add_epi32(r1_0, r1_1);
631 r2_0 = _mm_add_epi32(r2_0, r2_1);
632 r3_0 = _mm_add_epi32(r3_0, r3_1);
633
634 r0_3 = _mm_xor_si128(r0_3, r0_0);
635 r1_3 = _mm_xor_si128(r1_3, r1_0);
636 r2_3 = _mm_xor_si128(r2_3, r2_0);
637 r3_3 = _mm_xor_si128(r3_3, r3_0);
638
639 r0_3 = RotateLeft<8>(r0_3);
640 r1_3 = RotateLeft<8>(r1_3);
641 r2_3 = RotateLeft<8>(r2_3);
642 r3_3 = RotateLeft<8>(r3_3);
643
644 r0_2 = _mm_add_epi32(r0_2, r0_3);
645 r1_2 = _mm_add_epi32(r1_2, r1_3);
646 r2_2 = _mm_add_epi32(r2_2, r2_3);
647 r3_2 = _mm_add_epi32(r3_2, r3_3);
648
649 r0_1 = _mm_xor_si128(r0_1, r0_2);
650 r1_1 = _mm_xor_si128(r1_1, r1_2);
651 r2_1 = _mm_xor_si128(r2_1, r2_2);
652 r3_1 = _mm_xor_si128(r3_1, r3_2);
653
654 r0_1 = RotateLeft<7>(r0_1);
655 r1_1 = RotateLeft<7>(r1_1);
656 r2_1 = RotateLeft<7>(r2_1);
657 r3_1 = RotateLeft<7>(r3_1);
658
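// _MM_SHUFFLE(0,3,2,1) sends source lane 1 to lane 0, lane 2 to lane 1, and so
// on -- the SSE2 analogue of Extract<1> in the NEON code and Shuffle<1> in the
// Altivec code. The other two masks rotate by two and three lanes, moving the
// state from column order into diagonal order.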
659 r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
660 r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
661 r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
662
663 r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
664 r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
665 r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
666
667 r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
668 r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
669 r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
670
671 r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
672 r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
673 r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
674
675 r0_0 = _mm_add_epi32(r0_0, r0_1);
676 r1_0 = _mm_add_epi32(r1_0, r1_1);
677 r2_0 = _mm_add_epi32(r2_0, r2_1);
678 r3_0 = _mm_add_epi32(r3_0, r3_1);
679
680 r0_3 = _mm_xor_si128(r0_3, r0_0);
681 r1_3 = _mm_xor_si128(r1_3, r1_0);
682 r2_3 = _mm_xor_si128(r2_3, r2_0);
683 r3_3 = _mm_xor_si128(r3_3, r3_0);
684
685 r0_3 = RotateLeft<16>(r0_3);
686 r1_3 = RotateLeft<16>(r1_3);
687 r2_3 = RotateLeft<16>(r2_3);
688 r3_3 = RotateLeft<16>(r3_3);
689
690 r0_2 = _mm_add_epi32(r0_2, r0_3);
691 r1_2 = _mm_add_epi32(r1_2, r1_3);
692 r2_2 = _mm_add_epi32(r2_2, r2_3);
693 r3_2 = _mm_add_epi32(r3_2, r3_3);
694
695 r0_1 = _mm_xor_si128(r0_1, r0_2);
696 r1_1 = _mm_xor_si128(r1_1, r1_2);
697 r2_1 = _mm_xor_si128(r2_1, r2_2);
698 r3_1 = _mm_xor_si128(r3_1, r3_2);
699
700 r0_1 = RotateLeft<12>(r0_1);
701 r1_1 = RotateLeft<12>(r1_1);
702 r2_1 = RotateLeft<12>(r2_1);
703 r3_1 = RotateLeft<12>(r3_1);
704
705 r0_0 = _mm_add_epi32(r0_0, r0_1);
706 r1_0 = _mm_add_epi32(r1_0, r1_1);
707 r2_0 = _mm_add_epi32(r2_0, r2_1);
708 r3_0 = _mm_add_epi32(r3_0, r3_1);
709
710 r0_3 = _mm_xor_si128(r0_3, r0_0);
711 r1_3 = _mm_xor_si128(r1_3, r1_0);
712 r2_3 = _mm_xor_si128(r2_3, r2_0);
713 r3_3 = _mm_xor_si128(r3_3, r3_0);
714
715 r0_3 = RotateLeft<8>(r0_3);
716 r1_3 = RotateLeft<8>(r1_3);
717 r2_3 = RotateLeft<8>(r2_3);
718 r3_3 = RotateLeft<8>(r3_3);
719
720 r0_2 = _mm_add_epi32(r0_2, r0_3);
721 r1_2 = _mm_add_epi32(r1_2, r1_3);
722 r2_2 = _mm_add_epi32(r2_2, r2_3);
723 r3_2 = _mm_add_epi32(r3_2, r3_3);
724
725 r0_1 = _mm_xor_si128(r0_1, r0_2);
726 r1_1 = _mm_xor_si128(r1_1, r1_2);
727 r2_1 = _mm_xor_si128(r2_1, r2_2);
728 r3_1 = _mm_xor_si128(r3_1, r3_2);
729
730 r0_1 = RotateLeft<7>(r0_1);
731 r1_1 = RotateLeft<7>(r1_1);
732 r2_1 = RotateLeft<7>(r2_1);
733 r3_1 = RotateLeft<7>(r3_1);
734
735 r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
736 r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
737 r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
738
739 r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
740 r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
741 r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
742
743 r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
744 r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
745 r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
746
747 r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
748 r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
749 r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
750 }
751
752 r0_0 = _mm_add_epi32(r0_0, state0);
753 r0_1 = _mm_add_epi32(r0_1, state1);
754 r0_2 = _mm_add_epi32(r0_2, state2);
755 r0_3 = _mm_add_epi32(r0_3, state3);
756
757 r1_0 = _mm_add_epi32(r1_0, state0);
758 r1_1 = _mm_add_epi32(r1_1, state1);
759 r1_2 = _mm_add_epi32(r1_2, state2);
760 r1_3 = _mm_add_epi32(r1_3, state3);
761 r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));
762
763 r2_0 = _mm_add_epi32(r2_0, state0);
764 r2_1 = _mm_add_epi32(r2_1, state1);
765 r2_2 = _mm_add_epi32(r2_2, state2);
766 r2_3 = _mm_add_epi32(r2_3, state3);
767 r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));
768
769 r3_0 = _mm_add_epi32(r3_0, state0);
770 r3_1 = _mm_add_epi32(r3_1, state1);
771 r3_2 = _mm_add_epi32(r3_2, state2);
772 r3_3 = _mm_add_epi32(r3_3, state3);
773 r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
774
775 if (input_mm)
776 {
777 r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
778 r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
779 r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
780 r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
781 }
782
783 _mm_storeu_si128(output_mm + 0, r0_0);
784 _mm_storeu_si128(output_mm + 1, r0_1);
785 _mm_storeu_si128(output_mm + 2, r0_2);
786 _mm_storeu_si128(output_mm + 3, r0_3);
787
788 if (input_mm)
789 {
790 r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
791 r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
792 r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
793 r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
794 }
795
796 _mm_storeu_si128(output_mm + 4, r1_0);
797 _mm_storeu_si128(output_mm + 5, r1_1);
798 _mm_storeu_si128(output_mm + 6, r1_2);
799 _mm_storeu_si128(output_mm + 7, r1_3);
800
801 if (input_mm)
802 {
803 r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
804 r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
805 r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
806 r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
807 }
808
809 _mm_storeu_si128(output_mm + 8, r2_0);
810 _mm_storeu_si128(output_mm + 9, r2_1);
811 _mm_storeu_si128(output_mm + 10, r2_2);
812 _mm_storeu_si128(output_mm + 11, r2_3);
813
814 if (input_mm)
815 {
816 r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
817 r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
818 r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
819 r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
820 }
821
822 _mm_storeu_si128(output_mm + 12, r3_0);
823 _mm_storeu_si128(output_mm + 13, r3_1);
824 _mm_storeu_si128(output_mm + 14, r3_2);
825 _mm_storeu_si128(output_mm + 15, r3_3);
826}
827
828#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE
829
830#if (CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
831
832// ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
833// depending on the flags used to compile this source file. The
834// abstractions are handled in VecLoad, VecStore and friends. In
835// the future we may provide both POWER7 and ALTIVEC at the same
836// time to better support distros.
837inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
838{
839 const uint32x4_p state0 = VecLoad(state + 0*4);
840 const uint32x4_p state1 = VecLoad(state + 1*4);
841 const uint32x4_p state2 = VecLoad(state + 2*4);
842 const uint32x4_p state3 = VecLoad(state + 3*4);
843
844 const uint32x4_p CTRS[3] = {
845 {1,0,0,0}, {2,0,0,0}, {3,0,0,0}
846 };
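// As in the NEON and SSE2 code, these are the per-block counter offsets;
// VecAdd64 adds them to the 64-bit block counter held in state words 12
// and 13.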
847
848 uint32x4_p r0_0 = state0;
849 uint32x4_p r0_1 = state1;
850 uint32x4_p r0_2 = state2;
851 uint32x4_p r0_3 = state3;
852
853 uint32x4_p r1_0 = state0;
854 uint32x4_p r1_1 = state1;
855 uint32x4_p r1_2 = state2;
856 uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);
857
858 uint32x4_p r2_0 = state0;
859 uint32x4_p r2_1 = state1;
860 uint32x4_p r2_2 = state2;
861 uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);
862
863 uint32x4_p r3_0 = state0;
864 uint32x4_p r3_1 = state1;
865 uint32x4_p r3_2 = state2;
866 uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);
867
868 for (int i = static_cast<int>(rounds); i > 0; i -= 2)
869 {
870 r0_0 = VecAdd(r0_0, r0_1);
871 r1_0 = VecAdd(r1_0, r1_1);
872 r2_0 = VecAdd(r2_0, r2_1);
873 r3_0 = VecAdd(r3_0, r3_1);
874
875 r0_3 = VecXor(r0_3, r0_0);
876 r1_3 = VecXor(r1_3, r1_0);
877 r2_3 = VecXor(r2_3, r2_0);
878 r3_3 = VecXor(r3_3, r3_0);
879
880 r0_3 = VecRotateLeft<16>(r0_3);
881 r1_3 = VecRotateLeft<16>(r1_3);
882 r2_3 = VecRotateLeft<16>(r2_3);
883 r3_3 = VecRotateLeft<16>(r3_3);
884
885 r0_2 = VecAdd(r0_2, r0_3);
886 r1_2 = VecAdd(r1_2, r1_3);
887 r2_2 = VecAdd(r2_2, r2_3);
888 r3_2 = VecAdd(r3_2, r3_3);
889
890 r0_1 = VecXor(r0_1, r0_2);
891 r1_1 = VecXor(r1_1, r1_2);
892 r2_1 = VecXor(r2_1, r2_2);
893 r3_1 = VecXor(r3_1, r3_2);
894
895 r0_1 = VecRotateLeft<12>(r0_1);
896 r1_1 = VecRotateLeft<12>(r1_1);
897 r2_1 = VecRotateLeft<12>(r2_1);
898 r3_1 = VecRotateLeft<12>(r3_1);
899
900 r0_0 = VecAdd(r0_0, r0_1);
901 r1_0 = VecAdd(r1_0, r1_1);
902 r2_0 = VecAdd(r2_0, r2_1);
903 r3_0 = VecAdd(r3_0, r3_1);
904
905 r0_3 = VecXor(r0_3, r0_0);
906 r1_3 = VecXor(r1_3, r1_0);
907 r2_3 = VecXor(r2_3, r2_0);
908 r3_3 = VecXor(r3_3, r3_0);
909
910 r0_3 = VecRotateLeft<8>(r0_3);
911 r1_3 = VecRotateLeft<8>(r1_3);
912 r2_3 = VecRotateLeft<8>(r2_3);
913 r3_3 = VecRotateLeft<8>(r3_3);
914
915 r0_2 = VecAdd(r0_2, r0_3);
916 r1_2 = VecAdd(r1_2, r1_3);
917 r2_2 = VecAdd(r2_2, r2_3);
918 r3_2 = VecAdd(r3_2, r3_3);
919
920 r0_1 = VecXor(r0_1, r0_2);
921 r1_1 = VecXor(r1_1, r1_2);
922 r2_1 = VecXor(r2_1, r2_2);
923 r3_1 = VecXor(r3_1, r3_2);
924
925 r0_1 = VecRotateLeft<7>(r0_1);
926 r1_1 = VecRotateLeft<7>(r1_1);
927 r2_1 = VecRotateLeft<7>(r2_1);
928 r3_1 = VecRotateLeft<7>(r3_1);
929
930 r0_1 = Shuffle<1>(r0_1);
931 r0_2 = Shuffle<2>(r0_2);
932 r0_3 = Shuffle<3>(r0_3);
933
934 r1_1 = Shuffle<1>(r1_1);
935 r1_2 = Shuffle<2>(r1_2);
936 r1_3 = Shuffle<3>(r1_3);
937
938 r2_1 = Shuffle<1>(r2_1);
939 r2_2 = Shuffle<2>(r2_2);
940 r2_3 = Shuffle<3>(r2_3);
941
942 r3_1 = Shuffle<1>(r3_1);
943 r3_2 = Shuffle<2>(r3_2);
944 r3_3 = Shuffle<3>(r3_3);
945
946 r0_0 = VecAdd(r0_0, r0_1);
947 r1_0 = VecAdd(r1_0, r1_1);
948 r2_0 = VecAdd(r2_0, r2_1);
949 r3_0 = VecAdd(r3_0, r3_1);
950
951 r0_3 = VecXor(r0_3, r0_0);
952 r1_3 = VecXor(r1_3, r1_0);
953 r2_3 = VecXor(r2_3, r2_0);
954 r3_3 = VecXor(r3_3, r3_0);
955
956 r0_3 = VecRotateLeft<16>(r0_3);
957 r1_3 = VecRotateLeft<16>(r1_3);
958 r2_3 = VecRotateLeft<16>(r2_3);
959 r3_3 = VecRotateLeft<16>(r3_3);
960
961 r0_2 = VecAdd(r0_2, r0_3);
962 r1_2 = VecAdd(r1_2, r1_3);
963 r2_2 = VecAdd(r2_2, r2_3);
964 r3_2 = VecAdd(r3_2, r3_3);
965
966 r0_1 = VecXor(r0_1, r0_2);
967 r1_1 = VecXor(r1_1, r1_2);
968 r2_1 = VecXor(r2_1, r2_2);
969 r3_1 = VecXor(r3_1, r3_2);
970
971 r0_1 = VecRotateLeft<12>(r0_1);
972 r1_1 = VecRotateLeft<12>(r1_1);
973 r2_1 = VecRotateLeft<12>(r2_1);
974 r3_1 = VecRotateLeft<12>(r3_1);
975
976 r0_0 = VecAdd(r0_0, r0_1);
977 r1_0 = VecAdd(r1_0, r1_1);
978 r2_0 = VecAdd(r2_0, r2_1);
979 r3_0 = VecAdd(r3_0, r3_1);
980
981 r0_3 = VecXor(r0_3, r0_0);
982 r1_3 = VecXor(r1_3, r1_0);
983 r2_3 = VecXor(r2_3, r2_0);
984 r3_3 = VecXor(r3_3, r3_0);
985
986 r0_3 = VecRotateLeft<8>(r0_3);
987 r1_3 = VecRotateLeft<8>(r1_3);
988 r2_3 = VecRotateLeft<8>(r2_3);
989 r3_3 = VecRotateLeft<8>(r3_3);
990
991 r0_2 = VecAdd(r0_2, r0_3);
992 r1_2 = VecAdd(r1_2, r1_3);
993 r2_2 = VecAdd(r2_2, r2_3);
994 r3_2 = VecAdd(r3_2, r3_3);
995
996 r0_1 = VecXor(r0_1, r0_2);
997 r1_1 = VecXor(r1_1, r1_2);
998 r2_1 = VecXor(r2_1, r2_2);
999 r3_1 = VecXor(r3_1, r3_2);
1000
1001 r0_1 = VecRotateLeft<7>(r0_1);
1002 r1_1 = VecRotateLeft<7>(r1_1);
1003 r2_1 = VecRotateLeft<7>(r2_1);
1004 r3_1 = VecRotateLeft<7>(r3_1);
1005
1006 r0_1 = Shuffle<3>(r0_1);
1007 r0_2 = Shuffle<2>(r0_2);
1008 r0_3 = Shuffle<1>(r0_3);
1009
1010 r1_1 = Shuffle<3>(r1_1);
1011 r1_2 = Shuffle<2>(r1_2);
1012 r1_3 = Shuffle<1>(r1_3);
1013
1014 r2_1 = Shuffle<3>(r2_1);
1015 r2_2 = Shuffle<2>(r2_2);
1016 r2_3 = Shuffle<1>(r2_3);
1017
1018 r3_1 = Shuffle<3>(r3_1);
1019 r3_2 = Shuffle<2>(r3_2);
1020 r3_3 = Shuffle<1>(r3_3);
1021 }
1022
1023 r0_0 = VecAdd(r0_0, state0);
1024 r0_1 = VecAdd(r0_1, state1);
1025 r0_2 = VecAdd(r0_2, state2);
1026 r0_3 = VecAdd(r0_3, state3);
1027
1028 r1_0 = VecAdd(r1_0, state0);
1029 r1_1 = VecAdd(r1_1, state1);
1030 r1_2 = VecAdd(r1_2, state2);
1031 r1_3 = VecAdd(r1_3, state3);
1032 r1_3 = VecAdd64(r1_3, CTRS[0]);
1033
1034 r2_0 = VecAdd(r2_0, state0);
1035 r2_1 = VecAdd(r2_1, state1);
1036 r2_2 = VecAdd(r2_2, state2);
1037 r2_3 = VecAdd(r2_3, state3);
1038 r2_3 = VecAdd64(r2_3, CTRS[1]);
1039
1040 r3_0 = VecAdd(r3_0, state0);
1041 r3_1 = VecAdd(r3_1, state1);
1042 r3_2 = VecAdd(r3_2, state2);
1043 r3_3 = VecAdd(r3_3, state3);
1044 r3_3 = VecAdd64(r3_3, CTRS[2]);
1045
1046 if (input)
1047 {
1048 r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);
1049 r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);
1050 r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);
1051 r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);
1052 }
1053
1054 VecStore32LE(output + 0*16, r0_0);
1055 VecStore32LE(output + 1*16, r0_1);
1056 VecStore32LE(output + 2*16, r0_2);
1057 VecStore32LE(output + 3*16, r0_3);
1058
1059 if (input)
1060 {
1061 r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);
1062 r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);
1063 r1_2 = VecXor(VecLoad32LE(input + 6*16), r1_2);
1064 r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);
1065 }
1066
1067 VecStore32LE(output + 4*16, r1_0);
1068 VecStore32LE(output + 5*16, r1_1);
1069 VecStore32LE(output + 6*16, r1_2);
1070 VecStore32LE(output + 7*16, r1_3);
1071
1072 if (input)
1073 {
1074 r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);
1075 r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);
1076 r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);
1077 r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);
1078 }
1079
1080 VecStore32LE(output + 8*16, r2_0);
1081 VecStore32LE(output + 9*16, r2_1);
1082 VecStore32LE(output + 10*16, r2_2);
1083 VecStore32LE(output + 11*16, r2_3);
1084
1085 if (input)
1086 {
1087 r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);
1088 r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);
1089 r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);
1090 r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);
1091 }
1092
1093 VecStore32LE(output + 12*16, r3_0);
1094 VecStore32LE(output + 13*16, r3_1);
1095 VecStore32LE(output + 14*16, r3_2);
1096 VecStore32LE(output + 15*16, r3_3);
1097}
1098
1099#endif // CRYPTOPP_POWER8_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
1100
1101#if (CRYPTOPP_POWER8_AVAILABLE)
1102
1103void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
1104{
1105 ChaCha_OperateKeystream_CORE(state, input, output, rounds);
1106}
1107
1108#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
1109
1110void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)
1111{
1112 ChaCha_OperateKeystream_CORE(state, input, output, rounds);
1113}
1114
1115#endif
1116
1117NAMESPACE_END
Cross references (from the Doxygen listing):

Includes:
  chacha.h             - Classes for ChaCha8, ChaCha12 and ChaCha20 stream ciphers.
  config.h             - Library configuration file.
  misc.h               - Utility functions for the Crypto++ library.
  pch.h                - Precompiled header file.
  ppc_simd.h           - Support functions for PowerPC and vector operations.
  CryptoPP (namespace) - Crypto++ library namespace.

Referenced symbols:
  uint8x16_p      - __vector unsigned char; vector of 8-bit elements (ppc_simd.h:119).
  uint32x4_p      - __vector unsigned int; vector of 32-bit elements (ppc_simd.h:129).
  VecLoad         - uint32x4_p VecLoad(const byte src[16]); loads a vector from a byte array (ppc_simd.h:253).
  VecStore        - void VecStore(const T data, byte dest[16]); stores a vector to a byte array (ppc_simd.h:605).
  VecXor          - T1 VecXor(const T1 vec1, const T2 vec2); XORs two vectors (ppc_simd.h:916).
  VecAdd          - T1 VecAdd(const T1 vec1, const T2 vec2); adds two vectors (ppc_simd.h:939).
  VecAdd64        - uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2); adds two vectors (ppc_simd.h:974).
  VecPermute      - T1 VecPermute(const T1 vec, const T2 mask); permutes a vector (ppc_simd.h:1010).
  CRYPTOPP_ASSERT - #define CRYPTOPP_ASSERT(exp); debugging and diagnostic assertion (trap.h:69).