Crypto++ 8.2
Free C&
ppc_simd.h
Go to the documentation of this file.
1// ppc_simd.h - written and placed in public domain by Jeffrey Walton
2
3/// \file ppc_simd.h
4/// \brief Support functions for PowerPC and vector operations
5/// \details This header provides an agnostic interface into Clang, GCC
6/// and IBM XL C/C++ compilers modulo their different built-in functions
7/// for accessing vector instructions.
8/// \details The abstractions are necessary to support back to GCC 4.8 and
9/// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10/// default compiler for GCC112, GCC118 and others on the compile farm.
11/// Older IBM XL C/C++ compilers also experience it due to lack of
12/// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13/// compilers provide best support and don't need many of the hacks
14/// below.
15/// \details The library is tested with the following PowerPC machines and
16/// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17/// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18/// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19/// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20/// - GCC110, Linux, POWER7, GCC 4.8.5
21/// - GCC110, Linux, POWER7, XLC 12.01
22/// - GCC111, AIX, POWER7, GCC 4.8.1
23/// - GCC111, AIX, POWER7, XLC 12.01
24/// - GCC112, Linux, POWER8, GCC 4.8.5
25/// - GCC112, Linux, POWER8, XLC 13.01
26/// - GCC112, Linux, POWER8, Clang 7.0
27/// - GCC119, AIX, POWER8, GCC 7.2.0
28/// - GCC119, AIX, POWER8, XLC 13.01
29/// - GCC135, Linux, POWER9, GCC 7.0
30/// \details 12 machines are used for testing because the three compilers form
31/// five profiles. The profiles are listed below.
32/// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33/// - XLC 13.0 and earlier (all IBM components)
34/// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35/// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36/// - LLVM Clang (traditional Clang compiler)
37/// \details The LLVM front-end makes it tricky to write portable code because
38/// LLVM pretends to be other compilers but cannot consume other compilers'
39/// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
40/// be GCC, Clang and XLC all at once but it can only consume its own variety
41/// of builtins.
42/// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
43/// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
44/// changed to <tt>VecAnd</tt>. The name change helped consolidate two
45/// slightly different implementations.
46/// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
47
48// Use __ALTIVEC__, _ARCH_PWR7 and _ARCH_PWR8 when detecting actual
49// availability of the feature for the source file being compiled. The
50// preprocessor macros depend on compiler options like -maltivec; and
51// not compiler versions.
52
53// DO NOT USE this pattern in VecLoad and VecStore. We have to use the
54// spaghetti code tangled in preprocessor macros because XLC 12 generates
55// bad code in some places. To verify the bad code generation test on
56// GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
57//
58// inline uint32x4_p VecLoad(const byte src[16])
59// {
60// #if defined(_ARCH_PWR8)
61// return (uint32x4_p) *(uint8x16_p*)((byte*)src);
62// #else
63// return VecLoad_ALTIVEC(src);
64// #endif
65// }
66
67#ifndef CRYPTOPP_PPC_CRYPTO_H
68#define CRYPTOPP_PPC_CRYPTO_H
69
70#include "config.h"
71#include "misc.h"
72
73#if defined(__ALTIVEC__)
74# include <altivec.h>
75# undef vector
76# undef pixel
77# undef bool
78#endif
79
80// IBM XLC on AIX does not define __CRYPTO__ like it should with -qarch=pwr8.
81// Crypto is available in XLC 13.1 and above. More LLVM front-end goodness.
82#if defined(_AIX) && defined(_ARCH_PWR8) && (__xlC__ >= 0xd01)
83# undef __CRYPTO__
84# define __CRYPTO__ 1
85#endif
86
87// Hack to detect early XLC compilers. XLC compilers for POWER7 use
88// vec_xlw4 and vec_xstw4 (and ld2 variants); not vec_xl and vec_xst.
89// Some XLC compilers for POWER7 and above use vec_xl and vec_xst.
90// The way to tell the difference is, XLC compilers version 13.0 and
91// earlier use vec_xlw4 and vec_xstw4. XLC compilers 13.1 and later
92// use vec_xl and vec_xst. The open question is, how to handle
93// early Clang compilers for POWER7. We know the latest Clang
94// compilers support vec_xl and vec_xst. Also see
95// https://www-01.ibm.com/support/docview.wss?uid=swg21683541.
96
// __early_xlc__ is defined when __xlc__ is present and reports a value
// below 0x0d01. Later code in this header tests it to pick vec_xlw4 and
// vec_xstw4 instead of vec_xl and vec_xst.
97#if defined(__xlc__) && (__xlc__ < 0x0d01)
98# define __early_xlc__ 1
99#endif
// Same test against the __xlC__ spelling of the version macro.
100#if defined(__xlC__) && (__xlC__ < 0x0d01)
101# define __early_xlC__ 1
102#endif
103
104// VecLoad_ALTIVEC and VecStore_ALTIVEC are
105// too noisy on modern compilers
106#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
107# pragma GCC diagnostic push
108# pragma GCC diagnostic ignored "-Wdeprecated"
109#endif
110
111NAMESPACE_BEGIN(CryptoPP)
112
113#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
114
115/// \brief Vector of 8-bit elements
/// \details Other vector types in this header are cast to uint8x16_p
///  before byte-wise work such as VecReverse() and the VecStore() family.
116/// \par Wraps
117/// __vector unsigned char
118/// \since Crypto++ 6.0
119typedef __vector unsigned char uint8x16_p;
120/// \brief Vector of 16-bit elements
/// \details VecStore_ALTIVEC() uses this type for its halfword
///  <tt>vec_ste</tt> stores.
121/// \par Wraps
122/// __vector unsigned short
123/// \since Crypto++ 6.0
124typedef __vector unsigned short uint16x8_p;
125/// \brief Vector of 32-bit elements
/// \details This is the return type of the byte and word32 VecLoad(),
///  VecLoadAligned() and VecLoadBE() overloads.
126/// \par Wraps
127/// __vector unsigned int
128/// \since Crypto++ 6.0
129typedef __vector unsigned int uint32x4_p;
130
131#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
132/// \brief Vector of 64-bit elements
133/// \details uint64x2_p is available on POWER7 and above. Some supporting
134/// functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>), did not
135/// arrive until POWER8.
/// \details Note that this header only declares uint64x2_p when
///  <tt>_ARCH_PWR8</tt> is defined (or when Doxygen is processing the file),
///  so code built for POWER7 does not see the type.
136/// \par Wraps
137/// __vector unsigned long long
138/// \since Crypto++ 6.0
139typedef __vector unsigned long long uint64x2_p;
140#endif // _ARCH_PWR8
141
142/// \brief The 0 vector
143/// \returns a 32-bit vector of 0's
144/// \since Crypto++ 8.0
146{
147 const uint32x4_p v = {0,0,0,0};
148 return v;
149}
150
151/// \brief The 1 vector
152/// \returns a 32-bit vector of 1's
153/// \since Crypto++ 8.0
155{
156 const uint32x4_p v = {1,1,1,1};
157 return v;
158}
159
160/// \brief Reverse bytes in a vector
161/// \tparam T vector type
162/// \param data the vector
163/// \returns vector
164/// \details VecReverse() reverses the bytes in a vector
165/// \par Wraps
166/// vec_perm
167/// \since Crypto++ 6.0
168template <class T>
169inline T VecReverse(const T data)
170{
171#if (_ARCH_PWR9)
172 return (T)vec_revb((uint8x16_p)data);
173#else
174 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
175 return (T)vec_perm(data, data, mask);
176#endif
177}
178
179/// \name LOAD OPERATIONS
180//@{
181
182/// \brief Loads a vector from a byte array
183/// \param src the byte array
184/// \details Loads a vector in native endian format from a byte array.
185/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
186/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
187/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
188/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
189/// you should provide aligned memory adresses.
190/// \par Wraps
191/// vec_ld, vec_lvsl, vec_perm
192/// \since Crypto++ 6.0
193inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
194{
195 // Avoid IsAlignedOn for convenience.
196 uintptr_t eff = reinterpret_cast<uintptr_t>(src)+0;
197 if (eff % 16 == 0)
198 {
199 return (uint32x4_p)vec_ld(0, src);
200 }
201 else
202 {
203 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
204 const uint8x16_p perm = vec_lvsl(0, src);
205 const uint8x16_p low = vec_ld(0, src);
206 const uint8x16_p high = vec_ld(15, src);
207 return (uint32x4_p)vec_perm(low, high, perm);
208 }
209}
210
211/// \brief Loads a vector from a byte array
212/// \param src the byte array
213/// \param off offset into the src byte array
214/// \details Loads a vector in native endian format from a byte array.
215/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
216/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
217/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
218/// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
219/// relatively expensive so you should provide aligned memory adresses.
220/// \par Wraps
221/// vec_ld, vec_lvsl, vec_perm
222/// \since Crypto++ 6.0
223inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
224{
225 // Avoid IsAlignedOn for convenience.
226 uintptr_t eff = reinterpret_cast<uintptr_t>(src)+off;
227 if (eff % 16 == 0)
228 {
229 return (uint32x4_p)vec_ld(off, src);
230 }
231 else
232 {
233 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
234 const uint8x16_p perm = vec_lvsl(off, src);
235 const uint8x16_p low = vec_ld(off, src);
236 const uint8x16_p high = vec_ld(15, src);
237 return (uint32x4_p)vec_perm(low, high, perm);
238 }
239}
240
241/// \brief Loads a vector from a byte array
242/// \param src the byte array
243/// \details VecLoad() loads a vector in from a byte array.
244/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
245/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
246/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
247/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
248/// extra instructions are required to fix up unaligned memory
249/// addresses.
250/// \par Wraps
251/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
252/// \since Crypto++ 6.0
253inline uint32x4_p VecLoad(const byte src[16])
254{
255#if defined(_ARCH_PWR8)
256# if defined(__early_xlc__) || defined(__early_xlC__)
257 return (uint32x4_p)vec_xlw4(0, (byte*)src);
258# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
259 return (uint32x4_p)vec_xl(0, (byte*)src);
260# else
261 return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
262# endif
263#else
264 return VecLoad_ALTIVEC(src);
265#endif
266}
267
268/// \brief Loads a vector from a byte array
269/// \param src the byte array
270/// \param off offset into the byte array
271/// \details VecLoad() loads a vector in from a byte array.
272/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
273/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
274/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
275/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
276/// extra instructions are required to fix up unaligned memory
277/// addresses.
278/// \par Wraps
279/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
280/// \since Crypto++ 6.0
281inline uint32x4_p VecLoad(int off, const byte src[16])
282{
283#if defined(_ARCH_PWR8)
284# if defined(__early_xlc__) || defined(__early_xlC__)
285 return (uint32x4_p)vec_xlw4(off, (byte*)src);
286# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
287 return (uint32x4_p)vec_xl(off, (byte*)src);
288# else
289 return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
290# endif
291#else
292 return VecLoad_ALTIVEC(off, src);
293#endif
294}
295
296/// \brief Loads a vector from a word array
297/// \param src the word array
298/// \details VecLoad() loads a vector in from a word array.
299/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
300/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
301/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
302/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
303/// extra instructions are required to fix up unaligned memory
304/// addresses.
305/// \par Wraps
306/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
307/// \since Crypto++ 8.0
308inline uint32x4_p VecLoad(const word32 src[4])
309{
310 return VecLoad((const byte*)src);
311}
312
313/// \brief Loads a vector from a word array
314/// \param src the word array
315/// \param off offset into the word array
316/// \details VecLoad() loads a vector in from a word array.
317/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
318/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
319/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
320/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
321/// extra instructions are required to fix up unaligned memory
322/// addresses.
323/// \par Wraps
324/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
325/// \since Crypto++ 8.0
326inline uint32x4_p VecLoad(int off, const word32 src[4])
327{
328 return VecLoad(off, (const byte*)src);
329}
330
331#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
332
333/// \brief Loads a vector from a word array
334/// \param src the word array
335/// \details VecLoad() loads a vector in from a word array.
336/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
337/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
338/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
339/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
340/// extra instructions are required to fix up unaligned memory
341/// addresses.
342/// \details VecLoad() with 64-bit elements is available on POWER7 and above.
343/// \par Wraps
344/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
345/// \since Crypto++ 8.0
346inline uint64x2_p VecLoad(const word64 src[2])
347{
348 return (uint64x2_p)VecLoad((const byte*)src);
349}
350
351/// \brief Loads a vector from a word array
352/// \param src the word array
353/// \param off offset into the word array
354/// \details VecLoad() loads a vector in from a word array.
355/// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
356/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
357/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
358/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
359/// extra instructions are required to fix up unaligned memory
360/// addresses.
361/// \details VecLoad() with 64-bit elements is available on POWER8 and above.
362/// \par Wraps
363/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
364/// \since Crypto++ 8.0
365inline uint64x2_p VecLoad(int off, const word64 src[2])
366{
367 return (uint64x2_p)VecLoad(off, (const byte*)src);
368}
369
370#endif // _ARCH_PWR8
371
372/// \brief Loads a vector from an aligned byte array
373/// \param src the byte array
374/// \details VecLoadAligned() loads a vector in from an aligned byte array.
375/// \details VecLoadAligned() uses POWER7's <tt>vec_xl</tt> or
376/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
377/// aligned effective memory addresses. Altivec's <tt>vec_ld</tt> is used
378/// if POWER7 is not available. The effective address of <tt>src</tt> must
379/// be aligned.
380/// \par Wraps
381/// vec_ld, vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld
382/// \since Crypto++ 8.0
383inline uint32x4_p VecLoadAligned(const byte src[16])
384{
385#if defined(_ARCH_PWR8)
386# if defined(__early_xlc__) || defined(__early_xlC__)
387 return (uint32x4_p)vec_xlw4(0, (byte*)src);
388# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
389 return (uint32x4_p)vec_xl(0, (byte*)src);
390# else
391 return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
392# endif
393#else // _ARCH_PWR8
394 CRYPTOPP_ASSERT(((uintptr_t)src) % 16 == 0);
395 return (uint32x4_p)vec_ld(0, (byte*)src);
396#endif // _ARCH_PWR8
397}
398
399/// \brief Loads a vector from an aligned byte array
400/// \param src the byte array
401/// \param off offset into the byte array
402/// \details VecLoadAligned() loads a vector in from an aligned byte array.
403/// \details VecLoadAligned() uses POWER7's <tt>vec_xl</tt> or
404/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
405/// aligned effective memory addresses. Altivec's <tt>vec_ld</tt> is used
406/// if POWER7 is not available. The effective address of <tt>src</tt> must
407/// be aligned.
408/// \par Wraps
409/// vec_ld, vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld
410/// \since Crypto++ 8.0
411inline uint32x4_p VecLoadAligned(int off, const byte src[16])
412{
413#if defined(_ARCH_PWR8)
414# if defined(__early_xlc__) || defined(__early_xlC__)
415 return (uint32x4_p)vec_xlw4(off, (byte*)src);
416# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
417 return (uint32x4_p)vec_xl(off, (byte*)src);
418# else
419 return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
420# endif
421#else // _ARCH_PWR8
422 CRYPTOPP_ASSERT((((uintptr_t)src)+off) % 16 == 0);
423 return (uint32x4_p)vec_ld(off, (byte*)src);
424#endif // _ARCH_PWR8
425}
426
427/// \brief Loads a vector from a byte array
428/// \param src the byte array
429/// \details VecLoadBE() loads a vector in from a byte array. VecLoadBE
430/// will reverse all bytes in the array on a little endian system.
431/// \details VecLoadBE() uses POWER7's <tt>vec_xl</tt> or
432/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
433/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
434/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
435/// extra instructions are required to fix up unaligned memory
436/// addresses.
437/// \par Wraps
438/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
439/// \since Crypto++ 6.0
440inline uint32x4_p VecLoadBE(const byte src[16])
441{
442#if defined(_ARCH_PWR8)
443# if defined(__early_xlc__) || defined(__early_xlC__)
444# if (CRYPTOPP_BIG_ENDIAN)
445 return (uint32x4_p)vec_xlw4(0, (byte*)src);
446# else
447 return (uint32x4_p)VecReverse(vec_xlw4(0, (byte*)src));
448# endif
449# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
450 return (uint32x4_p)vec_xl_be(0, (byte*)src);
451# else
452# if (CRYPTOPP_BIG_ENDIAN)
453 return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
454# else
455 return (uint32x4_p)VecReverse(vec_vsx_ld(0, (byte*)src));
456# endif
457# endif
458#else // _ARCH_PWR8
459# if (CRYPTOPP_BIG_ENDIAN)
460 return (uint32x4_p)VecLoad((const byte*)src);
461# else
462 return (uint32x4_p)VecReverse(VecLoad((const byte*)src));
463# endif
464#endif // _ARCH_PWR8
465}
466
467/// \brief Loads a vector from a byte array
468/// \param src the byte array
469/// \param off offset into the src byte array
470/// \details VecLoadBE() loads a vector in from a byte array. VecLoadBE
471/// will reverse all bytes in the array on a little endian system.
472/// \details VecLoadBE() uses POWER7's <tt>vec_xl</tt> or
473/// <tt>vec_vsx_ld</tt> if available. The instructions do not require
474/// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
475/// is not available. VecLoad_ALTIVEC() can be relatively expensive if
476/// extra instructions are required to fix up unaligned memory
477/// addresses.
478/// \par Wraps
479/// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
480/// \since Crypto++ 6.0
481inline uint32x4_p VecLoadBE(int off, const byte src[16])
482{
483#if defined(_ARCH_PWR8)
484# if defined(__early_xlc__) || defined(__early_xlC__)
485# if (CRYPTOPP_BIG_ENDIAN)
486 return (uint32x4_p)vec_xlw4(off, (byte*)src);
487# else
488 return (uint32x4_p)VecReverse(vec_xlw4(off, (byte*)src));
489# endif
490# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
491 return (uint32x4_p)vec_xl_be(off, (byte*)src);
492# else
493# if (CRYPTOPP_BIG_ENDIAN)
494 return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
495# else
496 return (uint32x4_p)VecReverse(vec_vsx_ld(off, (byte*)src));
497# endif
498# endif
499#else // _ARCH_PWR8
500# if (CRYPTOPP_BIG_ENDIAN)
501 return (uint32x4_p)VecLoad(off, (const byte*)src);
502# else
503 return (uint32x4_p)VecReverse(VecLoad(off, (const byte*)src));
504# endif
505#endif // _ARCH_PWR8
506}
507
508//@}
509
510/// \name STORE OPERATIONS
511//@{
512
513/// \brief Stores a vector to a byte array
514/// \tparam T vector type
515/// \param data the vector
516/// \param dest the byte array
517/// \details VecStore_ALTIVEC() stores a vector to a byte array.
518/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
519/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
520/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
521/// memory adresses.
522/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
523/// and unaligned loads is not available.
524/// \par Wraps
525/// vec_st, vec_ste, vec_lvsr, vec_perm
526/// \since Crypto++ 8.0
527template<class T>
528inline void VecStore_ALTIVEC(const T data, byte dest[16])
529{
530 // Avoid IsAlignedOn for convenience.
531 uintptr_t eff = reinterpret_cast<uintptr_t>(dest)+0;
532 if (eff % 16 == 0)
533 {
534 vec_st((uint8x16_p)data, 0, dest);
535 }
536 else
537 {
538 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
539 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
540 vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
541 vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
542 vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
543 vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
544 vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
545 vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
546 vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
547 vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
548 }
549}
550
551/// \brief Stores a vector to a byte array
552/// \tparam T vector type
553/// \param data the vector
554/// \param off the byte offset into the array
555/// \param dest the byte array
556/// \details VecStore_ALTIVEC() stores a vector to a byte array.
557/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
558/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
559/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
560/// memory adresses.
561/// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
562/// and unaligned loads is not available.
563/// \par Wraps
564/// vec_st, vec_ste, vec_lvsr, vec_perm
565/// \since Crypto++ 8.0
566template<class T>
567inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
568{
569 // Avoid IsAlignedOn for convenience.
570 uintptr_t eff = reinterpret_cast<uintptr_t>(dest)+off;
571 if (eff % 16 == 0)
572 {
573 vec_st((uint8x16_p)data, off, dest);
574 }
575 else
576 {
577 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
578 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(off, dest));
579 vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
580 vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
581 vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
582 vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
583 vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
584 vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
585 vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
586 vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
587 }
588}
589
590/// \brief Stores a vector to a byte array
591/// \tparam T vector type
592/// \param data the vector
593/// \param dest the byte array
594/// \details VecStore() stores a vector to a byte array.
595/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
596/// <tt>vec_vsx_st</tt> if available. The instructions do not require
597/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
598/// is not available. VecStore_ALTIVEC() can be relatively expensive if
599/// extra instructions are required to fix up unaligned memory
600/// addresses.
601/// \par Wraps
602/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
603/// \since Crypto++ 6.0
604template<class T>
605inline void VecStore(const T data, byte dest[16])
606{
607#if defined(_ARCH_PWR8)
608# if defined(__early_xlc__) || defined(__early_xlC__)
609 vec_xstw4((uint8x16_p)data, 0, (byte*)dest);
610# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
611 vec_xst((uint8x16_p)data, 0, (byte*)dest);
612# else
613 vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
614# endif
615#else
616 VecStore_ALTIVEC((uint8x16_p)data, 0, (byte*)dest);
617#endif
618}
619
620/// \brief Stores a vector to a byte array
621/// \tparam T vector type
622/// \param data the vector
623/// \param off the byte offset into the array
624/// \param dest the byte array
625/// \details VecStore() stores a vector to a byte array.
626/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
627/// <tt>vec_vsx_st</tt> if available. The instructions do not require
628/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
629/// is not available. VecStore_ALTIVEC() can be relatively expensive if
630/// extra instructions are required to fix up unaligned memory
631/// addresses.
632/// \par Wraps
633/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
634/// \since Crypto++ 6.0
635template<class T>
636inline void VecStore(const T data, int off, byte dest[16])
637{
638#if defined(_ARCH_PWR8)
639# if defined(__early_xlc__) || defined(__early_xlC__)
640 vec_xstw4((uint8x16_p)data, off, (byte*)dest);
641# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
642 vec_xst((uint8x16_p)data, off, (byte*)dest);
643# else
644 vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
645# endif
646#else
647 VecStore_ALTIVEC((uint8x16_p)data, off, (byte*)dest);
648#endif
649}
650
651/// \brief Stores a vector to a word array
652/// \tparam T vector type
653/// \param data the vector
654/// \param dest the word array
655/// \details VecStore() stores a vector to a word array.
656/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
657/// <tt>vec_vsx_st</tt> if available. The instructions do not require
658/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
659/// is not available. VecStore_ALTIVEC() can be relatively expensive if
660/// extra instructions are required to fix up unaligned memory
661/// addresses.
662/// \par Wraps
663/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
664/// \since Crypto++ 8.0
665template<class T>
666inline void VecStore(const T data, word32 dest[4])
667{
668 VecStore((uint8x16_p)data, 0, (byte*)dest);
669}
670
671/// \brief Stores a vector to a word array
672/// \tparam T vector type
673/// \param data the vector
674/// \param off the byte offset into the array
675/// \param dest the word array
676/// \details VecStore() stores a vector to a word array.
677/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
678/// <tt>vec_vsx_st</tt> if available. The instructions do not require
679/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
680/// is not available. VecStore_ALTIVEC() can be relatively expensive if
681/// extra instructions are required to fix up unaligned memory
682/// addresses.
683/// \par Wraps
684/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
685/// \since Crypto++ 8.0
686template<class T>
687inline void VecStore(const T data, int off, word32 dest[4])
688{
689 VecStore((uint8x16_p)data, off, (byte*)dest);
690}
691
692/// \brief Stores a vector to a word array
693/// \tparam T vector type
694/// \param data the vector
695/// \param dest the word array
696/// \details VecStore() stores a vector to a word array.
697/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
698/// <tt>vec_vsx_st</tt> if available. The instructions do not require
699/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
700/// is not available. VecStore_ALTIVEC() can be relatively expensive if
701/// extra instructions are required to fix up unaligned memory
702/// addresses.
703/// \details VecStore() with 64-bit elements is available on POWER8 and above.
704/// \par Wraps
705/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
706/// \since Crypto++ 8.0
707template<class T>
708inline void VecStore(const T data, word64 dest[2])
709{
710 VecStore((uint8x16_p)data, 0, (byte*)dest);
711}
712
713/// \brief Stores a vector to a word array
714/// \tparam T vector type
715/// \param data the vector
716/// \param off the byte offset into the array
717/// \param dest the word array
718/// \details VecStore() stores a vector to a word array.
719/// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
720/// <tt>vec_vsx_st</tt> if available. The instructions do not require
721/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
722/// is not available. VecStore_ALTIVEC() can be relatively expensive if
723/// extra instructions are required to fix up unaligned memory
724/// addresses.
725/// \details VecStore() with 64-bit elements is available on POWER8 and above.
726/// \par Wraps
727/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
728/// \since Crypto++ 8.0
729template<class T>
730inline void VecStore(const T data, int off, word64 dest[2])
731{
732 VecStore((uint8x16_p)data, off, (byte*)dest);
733}
734
735/// \brief Stores a vector to a byte array
736/// \tparam T vector type
737/// \param data the vector
738/// \param dest the byte array
739/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
740/// will reverse all bytes in the array on a little endian system.
741/// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
742/// <tt>vec_vsx_st</tt> if available. The instructions do not require
743/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
744/// is not available. VecStore_ALTIVEC() can be relatively expensive if
745/// extra instructions are required to fix up unaligned memory
746/// addresses.
747/// \par Wraps
748/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
749/// \since Crypto++ 6.0
750template <class T>
751inline void VecStoreBE(const T data, byte dest[16])
752{
753#if defined(_ARCH_PWR8)
754# if defined(__early_xlc__) || defined(__early_xlC__)
755# if (CRYPTOPP_BIG_ENDIAN)
756 vec_xstw4((uint8x16_p)data, 0, (byte*)dest);
757# else
758 vec_xstw4((uint8x16_p)VecReverse(data), 0, (byte*)dest);
759# endif
760# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
761 vec_xst_be((uint8x16_p)data, 0, (byte*)dest);
762# else
763# if (CRYPTOPP_BIG_ENDIAN)
764 vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
765# else
766 vec_vsx_st((uint8x16_p)VecReverse(data), 0, (byte*)dest);
767# endif
768# endif
769#else // _ARCH_PWR8
770# if (CRYPTOPP_BIG_ENDIAN)
771 VecStore_ALTIVEC((uint8x16_p)data, 0, (byte*)dest);
772# else
773 VecStore_ALTIVEC((uint8x16_p)VecReverse(data), 0, (byte*)dest);
774# endif
775#endif // _ARCH_PWR8
776}
777
778/// \brief Stores a vector to a byte array
779/// \tparam T vector type
780/// \param data the vector
781/// \param off offset into the dest byte array
782/// \param dest the byte array
783/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
784/// will reverse all bytes in the array on a little endian system.
785/// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
786/// <tt>vec_vsx_st</tt> if available. The instructions do not require
787/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
788/// is not available. VecStore_ALTIVEC() can be relatively expensive if
789/// extra instructions are required to fix up unaligned memory
790/// addresses.
791/// \par Wraps
792/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
793/// \since Crypto++ 6.0
794template <class T>
795inline void VecStoreBE(const T data, int off, byte dest[16])
796{
797#if defined(_ARCH_PWR8)
798# if defined(__early_xlc__) || defined(__early_xlC__)
799# if (CRYPTOPP_BIG_ENDIAN)
800 vec_xstw4((uint8x16_p)data, off, (byte*)dest);
801# else
802 vec_xstw4((uint8x16_p)VecReverse(data), off, (byte*)dest);
803# endif
804# elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
805 vec_xst_be((uint8x16_p)data, off, (byte*)dest);
806# else
807# if (CRYPTOPP_BIG_ENDIAN)
808 vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
809# else
810 vec_vsx_st((uint8x16_p)VecReverse(data), off, (byte*)dest);
811# endif
812# endif
813#else // _ARCH_PWR8
814# if (CRYPTOPP_BIG_ENDIAN)
815 VecStore_ALTIVEC((uint8x16_p)data, off, (byte*)dest);
816# else
817 VecStore_ALTIVEC((uint8x16_p)VecReverse(data), off, (byte*)dest);
818# endif
819#endif // _ARCH_PWR8
820}
821
822/// \brief Stores a vector to a word array
823/// \tparam T vector type
824/// \param data the vector
825/// \param dest the word array
826/// \details VecStoreBE() stores a vector to a word array. VecStoreBE
827/// will reverse all bytes in the array on a little endian system.
828/// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
829/// <tt>vec_vsx_st</tt> if available. The instructions do not require
830/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
831/// is not available. VecStore_ALTIVEC() can be relatively expensive if
832/// extra instructions are required to fix up unaligned memory
833/// addresses.
834/// \par Wraps
835/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
836/// \since Crypto++ 8.0
837template <class T>
838inline void VecStoreBE(const T data, word32 dest[4])
839{
840 return VecStoreBE((uint8x16_p)data, (byte*)dest);
841}
842
843/// \brief Stores a vector to a word array
844/// \tparam T vector type
845/// \param data the vector
846/// \param off offset into the dest word array
847/// \param dest the word array
848/// \details VecStoreBE() stores a vector to a word array. VecStoreBE
849/// will reverse all words in the array on a little endian system.
850/// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
851/// <tt>vec_vsx_st</tt> if available. The instructions do not require
852/// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
853/// is not available. VecStore_ALTIVEC() can be relatively expensive if
854/// extra instructions are required to fix up unaligned memory
855/// addresses.
856/// \par Wraps
857/// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
858/// \since Crypto++ 8.0
859template <class T>
860inline void VecStoreBE(const T data, int off, word32 dest[4])
861{
862 return VecStoreBE((uint8x16_p)data, off, (byte*)dest);
863}
864
865//@}
866
867/// \name LOGICAL OPERATIONS
868//@{
869
/// \brief AND two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecAnd() returns the bitwise AND of vec1 and vec2. vec2 is
///  cast to the type of vec1 first. The return vector is the same type
///  as vec1.
/// \par Wraps
///  vec_and
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
886
/// \brief OR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecOr() returns the bitwise OR of vec1 and vec2. vec2 is
///  cast to the type of vec1 first. The return vector is the same type
///  as vec1.
/// \par Wraps
///  vec_or
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
903
/// \brief XOR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecXor() returns the bitwise XOR of vec1 and vec2. vec2 is
///  cast to the type of vec1 first. The return vector is the same type
///  as vec1.
/// \par Wraps
///  vec_xor
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
920
921//@}
922
923/// \name ARITHMETIC OPERATIONS
924//@{
925
/// \brief Add two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecAdd() returns the element-wise sum of vec1 and vec2 as
///  computed by <tt>vec_add</tt>. vec2 is cast to the same type as vec1.
///  The return vector is the same type as vec1.
/// \par Wraps
///  vec_add
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
943
/// \brief Subtract two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecSub() returns the element-wise difference of vec1 and vec2
///  as computed by <tt>vec_sub</tt>. vec2 is cast to the same type as vec1.
///  The return vector is the same type as vec1.
/// \par Wraps
///  vec_sub
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
960
961/// \brief Add two vectors
962/// \tparam T1 vector type
963/// \tparam T2 vector type
964/// \param vec1 the first vector
965/// \param vec2 the second vector
966/// \returns vector
967/// \details VecAdd64() returns a new vector from vec1 and vec2.
968/// vec1 and vec2 are added as if uint64x2_p vectors. On POWER7
969/// and below VecAdd64() manages the carries from two elements in
970/// a uint32x4_p vector.
971/// \par Wraps
972/// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
973/// \since Crypto++ 8.0
974inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
975{
976 // 64-bit elements available at POWER7, but addudm requires POWER8
977#if defined(_ARCH_PWR8)
978 return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
979#else
980 // The carry mask selects carries from elements 1 and 3 and sets remaining
981 // elements to 0. The mask also shifts the carried values left by 4 bytes
982 // so the carries are added to elements 0 and 2.
983 const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
984 const uint32x4_p zero = {0, 0, 0, 0};
985
986 uint32x4_p cy = vec_addc(vec1, vec2);
987 cy = vec_perm(cy, zero, cmask);
988 return vec_add(vec_add(vec1, vec2), cy);
989#endif
990}
991
992//@}
993
994/// \name OTHER OPERATIONS
995//@{
996
997/// \brief Permutes a vector
998/// \tparam T1 vector type
999/// \tparam T2 vector type
1000/// \param vec the vector
1001/// \param mask vector mask
1002/// \returns vector
1003/// \details VecPermute() returns a new vector from vec based on
1004/// mask. mask is an uint8x16_p type vector. The return
1005/// vector is the same type as vec.
1006/// \par Wraps
1007/// vec_perm
1008/// \since Crypto++ 6.0
1009template <class T1, class T2>
1010inline T1 VecPermute(const T1 vec, const T2 mask)
1011{
1012 return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1013}
1014
1015/// \brief Permutes two vectors
1016/// \tparam T1 vector type
1017/// \tparam T2 vector type
1018/// \param vec1 the first vector
1019/// \param vec2 the second vector
1020/// \param mask vector mask
1021/// \returns vector
1022/// \details VecPermute() returns a new vector from vec1 and vec2
1023/// based on mask. mask is an uint8x16_p type vector. The return
1024/// vector is the same type as vec1.
1025/// \par Wraps
1026/// vec_perm
1027/// \since Crypto++ 6.0
1028template <class T1, class T2>
1029inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1030{
1031 return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1032}
1033
1034/// \brief Shift a vector left
1035/// \tparam C shift byte count
1036/// \tparam T vector type
1037/// \param vec the vector
1038/// \returns vector
1039/// \details VecShiftLeftOctet() returns a new vector after shifting the
1040/// concatenation of the zero vector and the source vector by the specified
1041/// number of bytes. The return vector is the same type as vec.
1042/// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1043/// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1044/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1045/// if on a big endian machine as shown below.
1046/// <pre>
1047/// uint8x16_p x = VecLoad(ptr);
1048/// uint8x16_p y = VecShiftLeftOctet<12>(x);
1049/// </pre>
1050/// \par Wraps
1051/// vec_sld
1052/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1053/// endian sensitive?</A> on Stack Overflow
1054/// \since Crypto++ 6.0
1055template <unsigned int C, class T>
1056inline T VecShiftLeftOctet(const T vec)
1057{
1058 const T zero = {0};
1059 if (C >= 16)
1060 {
1061 // Out of range
1062 return zero;
1063 }
1064 else if (C == 0)
1065 {
1066 // Noop
1067 return vec;
1068 }
1069 else
1070 {
1071#if (CRYPTOPP_BIG_ENDIAN)
1072 enum { R=C&0xf };
1073 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1074#else
1075 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1076 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1077#endif
1078 }
1079}
1080
1081/// \brief Shift a vector right
1082/// \tparam C shift byte count
1083/// \tparam T vector type
1084/// \param vec the vector
1085/// \returns vector
1086/// \details VecShiftRightOctet() returns a new vector after shifting the
1087/// concatenation of the zero vector and the source vector by the specified
1088/// number of bytes. The return vector is the same type as vec.
1089/// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1090/// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1091/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1092/// if on a big endian machine as shown below.
1093/// <pre>
1094/// uint8x16_p x = VecLoad(ptr);
1095/// uint8x16_p y = VecShiftRightOctet<12>(y);
1096/// </pre>
1097/// \par Wraps
1098/// vec_sld
1099/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1100/// endian sensitive?</A> on Stack Overflow
1101/// \since Crypto++ 6.0
1102template <unsigned int C, class T>
1103inline T VecShiftRightOctet(const T vec)
1104{
1105 const T zero = {0};
1106 if (C >= 16)
1107 {
1108 // Out of range
1109 return zero;
1110 }
1111 else if (C == 0)
1112 {
1113 // Noop
1114 return vec;
1115 }
1116 else
1117 {
1118#if (CRYPTOPP_BIG_ENDIAN)
1119 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1120 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1121#else
1122 enum { R=C&0xf };
1123 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1124#endif
1125 }
1126}
1127
1128/// \brief Rotate a vector left
1129/// \tparam C shift byte count
1130/// \tparam T vector type
1131/// \param vec the vector
1132/// \returns vector
1133/// \details VecRotateLeftOctet() returns a new vector after rotating the
1134/// concatenation of the source vector with itself by the specified
1135/// number of bytes. The return vector is the same type as vec.
1136/// \par Wraps
1137/// vec_sld
1138/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1139/// endian sensitive?</A> on Stack Overflow
1140/// \since Crypto++ 6.0
1141template <unsigned int C, class T>
1142inline T VecRotateLeftOctet(const T vec)
1143{
1144#if (CRYPTOPP_BIG_ENDIAN)
1145 enum { R = C&0xf };
1146 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1147#else
1148 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1149 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1150#endif
1151}
1152
1153/// \brief Rotate a vector right
1154/// \tparam C shift byte count
1155/// \tparam T vector type
1156/// \param vec the vector
1157/// \returns vector
1158/// \details VecRotateRightOctet() returns a new vector after rotating the
1159/// concatenation of the source vector with itself by the specified
1160/// number of bytes. The return vector is the same type as vec.
1161/// \par Wraps
1162/// vec_sld
1163/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1164/// endian sensitive?</A> on Stack Overflow
1165/// \since Crypto++ 6.0
1166template <unsigned int C, class T>
1167inline T VecRotateRightOctet(const T vec)
1168{
1169#if (CRYPTOPP_BIG_ENDIAN)
1170 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1171 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1172#else
1173 enum { R = C&0xf };
1174 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1175#endif
1176}
1177
1178/// \brief Rotate a packed vector left
1179/// \tparam C shift bit count
1180/// \param vec the vector
1181/// \returns vector
1182/// \details VecRotateLeft() rotates each element in a packed vector by bit count.
1183/// \par Wraps
1184/// vec_rl
1185/// \since Crypto++ 7.0
1186template<unsigned int C>
1188{
1189 const uint32x4_p m = {C, C, C, C};
1190 return vec_rl(vec, m);
1191}
1192
1193/// \brief Shift a packed vector left
1194/// \tparam C shift bit count
1195/// \param vec the vector
1196/// \returns vector
1197/// \details VecShiftLeft() shifts each element in a packed vector by bit count.
1198/// \par Wraps
1199/// vec_sl
1200/// \since Crypto++ 8.1
1201template<unsigned int C>
1203{
1204 const uint32x4_p m = {C, C, C, C};
1205 return vec_sl(vec, m);
1206}
1207
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecMergeHigh() returns the result of <tt>vec_mergeh</tt>
///  applied to vec1 and vec2.
/// \par Wraps
///  vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    const T merged = vec_mergeh(vec1, vec2);
    return merged;
}
1221
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns vector
/// \details VecMergeLow() returns the result of <tt>vec_mergel</tt>
///  applied to vec1 and vec2.
/// \par Wraps
///  vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    const T merged = vec_mergel(vec1, vec2);
    return merged;
}
1235
1236#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1237
1238/// \brief Rotate a packed vector left
1239/// \tparam C shift bit count
1240/// \param vec the vector
1241/// \returns vector
1242/// \details VecRotateLeft() rotates each element in a packed vector by bit count.
1243/// \details VecRotateLeft() with 64-bit elements is available on POWER8 and above.
1244/// \par Wraps
1245/// vec_rl
1246/// \since Crypto++ 8.0
1247template<unsigned int C>
1249{
1250 const uint64x2_p m = {C, C};
1251 return vec_rl(vec, m);
1252}
1253
1254/// \brief Shift a packed vector left
1255/// \tparam C shift bit count
1256/// \param vec the vector
1257/// \returns vector
1258/// \details VecShiftLeft() shifts each element in a packed vector by bit count.
1259/// \details VecShiftLeft() with 64-bit elements is available on POWER8 and above.
1260/// \par Wraps
1261/// vec_sl
1262/// \since Crypto++ 8.1
1263template<unsigned int C>
1265{
1266 const uint64x2_p m = {C, C};
1267 return vec_sl(vec, m);
1268}
1269
1270#endif
1271
1272/// \brief Rotate a packed vector right
1273/// \tparam C shift bit count
1274/// \param vec the vector
1275/// \returns vector
1276/// \details VecRotateRight() rotates each element in a packed vector by bit count.
1277/// \par Wraps
1278/// vec_rl
1279/// \since Crypto++ 7.0
1280template<unsigned int C>
1282{
1283 const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1284 return vec_rl(vec, m);
1285}
1286
1287/// \brief Shift a packed vector right
1288/// \tparam C shift bit count
1289/// \param vec the vector
1290/// \returns vector
1291/// \details VecShiftRight() shifts each element in a packed vector by bit count.
1292/// \par Wraps
1293/// vec_sr
1294/// \since Crypto++ 8.1
1295template<unsigned int C>
1297{
1298 const uint32x4_p m = {C, C, C, C};
1299 return vec_sr(vec, m);
1300}
1301
1302#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1303
1304/// \brief Rotate a packed vector right
1305/// \tparam C shift bit count
1306/// \param vec the vector
1307/// \returns vector
1308/// \details VecRotateRight() rotates each element in a packed vector by bit count.
1309/// \details VecRotateRight() with 64-bit elements is available on POWER8 and above.
1310/// \par Wraps
1311/// vec_rl
1312/// \since Crypto++ 8.0
1313template<unsigned int C>
1315{
1316 const uint64x2_p m = {64-C, 64-C};
1317 return vec_rl(vec, m);
1318}
1319
1320/// \brief Shift a packed vector right
1321/// \tparam C shift bit count
1322/// \param vec the vector
1323/// \returns vector
1324/// \details VecShiftRight() shifts each element in a packed vector by bit count.
1325/// \details VecShiftRight() with 64-bit elements is available on POWER8 and above.
1326/// \par Wraps
1327/// vec_sr
1328/// \since Crypto++ 8.1
1329template<unsigned int C>
1331{
1332 const uint64x2_p m = {C, C};
1333 return vec_sr(vec, m);
1334}
1335
1336#endif
1337
1338/// \brief Exchange high and low double words
1339/// \tparam T vector type
1340/// \param vec the vector
1341/// \returns vector
1342/// \par Wraps
1343/// vec_sld
1344/// \since Crypto++ 7.0
1345template <class T>
1346inline T VecSwapWords(const T vec)
1347{
1348 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1349}
1350
1351/// \brief Extract a dword from a vector
1352/// \tparam T vector type
1353/// \param val the vector
1354/// \returns vector created from low dword
1355/// \details VecGetLow() extracts the low dword from a vector. The low dword
1356/// is composed of the least significant bits and occupies bytes 8 through 15
1357/// when viewed as a big endian array. The return vector is the same type as
1358/// the original vector and padded with 0's in the most significant bit positions.
1359/// \par Wraps
1360/// vec_sld
1361/// \since Crypto++ 7.0
1362template <class T>
1363inline T VecGetLow(const T val)
1364{
1365#if (CRYPTOPP_BIG_ENDIAN) && (_ARCH_PWR8)
1366 const T zero = {0};
1367 return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1368#else
1369 return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1370#endif
1371}
1372
1373/// \brief Extract a dword from a vector
1374/// \tparam T vector type
1375/// \param val the vector
1376/// \returns vector created from high dword
1377/// \details VecGetHigh() extracts the high dword from a vector. The high dword
1378/// is composed of the most significant bits and occupies bytes 0 through 7
1379/// when viewed as a big endian array. The return vector is the same type as
1380/// the original vector and padded with 0's in the most significant bit positions.
1381/// \par Wraps
1382/// vec_sld
1383/// \since Crypto++ 7.0
1384template <class T>
1385inline T VecGetHigh(const T val)
1386{
1387#if (CRYPTOPP_BIG_ENDIAN) && (_ARCH_PWR8)
1388 const T zero = {0};
1389 return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1390#else
1391 return VecShiftRightOctet<8>(val);
1392#endif
1393}
1394
1395/// \brief Compare two vectors
1396/// \tparam T1 vector type
1397/// \tparam T2 vector type
1398/// \param vec1 the first vector
1399/// \param vec2 the second vector
1400/// \returns true if vec1 equals vec2, false otherwise
1401/// \details VecEqual() performs a bitwise compare. The vector element types do
1402/// not matter.
1403/// \par Wraps
1404/// vec_all_eq
1405/// \since Crypto++ 8.0
1406template <class T1, class T2>
1407inline bool VecEqual(const T1 vec1, const T2 vec2)
1408{
1409 return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1410}
1411
1412/// \brief Compare two vectors
1413/// \tparam T1 vector type
1414/// \tparam T2 vector type
1415/// \param vec1 the first vector
1416/// \param vec2 the second vector
1417/// \returns true if vec1 does not equal vec2, false otherwise
1418/// \details VecNotEqual() performs a bitwise compare. The vector element types do
1419/// not matter.
1420/// \par Wraps
1421/// vec_all_eq
1422/// \since Crypto++ 8.0
1423template <class T1, class T2>
1424inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1425{
1426 return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1427}
1428
1429//@}
1430
1431//////////////////////// Power8 Crypto ////////////////////////
1432
1433#if defined(__CRYPTO__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1434
1435/// \name POLYNOMIAL MULTIPLICATION
1436//@{
1437
1438/// \brief Polynomial multiplication
1439/// \param a the first term
1440/// \param b the second term
1441/// \returns vector product
1442/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
1443/// polynomial multiplication multiplies the high and low terms, and then
1444/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
1445/// al*bl</tt>. It is different behavior than Intel polynomial
1446/// multiplication. To obtain a single product without the XOR, then set
1447/// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
1448/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
1449/// \par Wraps
1450/// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
1451/// \since Crypto++ 8.1
1453{
1454#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1455 return __vpmsumw (a, b);
1456#elif defined(__clang__)
1457 return __builtin_altivec_crypto_vpmsumw (a, b);
1458#else
1459 return __builtin_crypto_vpmsumw (a, b);
1460#endif
1461}
1462
1463/// \brief Polynomial multiplication
1464/// \param a the first term
1465/// \param b the second term
1466/// \returns vector product
1467/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
1468/// polynomial multiplication multiplies the high and low terms, and then
1469/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
1470/// al*bl</tt>. It is different behavior than Intel polynomial
1471/// multiplication. To obtain a single product without the XOR, then set
1472/// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
1473/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
1474/// \par Wraps
1475/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1476/// \since Crypto++ 8.1
1478{
1479#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1480 return __vpmsumd (a, b);
1481#elif defined(__clang__)
1482 return __builtin_altivec_crypto_vpmsumd (a, b);
1483#else
1484 return __builtin_crypto_vpmsumd (a, b);
1485#endif
1486}
1487
1488/// \brief Polynomial multiplication
1489/// \param a the first term
1490/// \param b the second term
1491/// \returns vector product
1492/// \details VecPolyMultiply00LE() performs polynomial multiplication and presents
1493/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
1494/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
1495/// are multiplied.
1496/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1497/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
1498/// \par Wraps
1499/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1500/// \since Crypto++ 8.0
1502{
1503#if (CRYPTOPP_BIG_ENDIAN)
1505#else
1506 return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
1507#endif
1508}
1509
1510/// \brief Polynomial multiplication
1511/// \param a the first term
1512/// \param b the second term
1513/// \returns vector product
1514/// \details VecPolyMultiply01LE() performs polynomial multiplication and presents
1515/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
1516/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
1517/// 64-bits of <tt>b</tt> are multiplied.
1518/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1519/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
1520/// \par Wraps
1521/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1522/// \since Crypto++ 8.0
1524{
1525#if (CRYPTOPP_BIG_ENDIAN)
1527#else
1528 return VecPolyMultiply(a, VecGetHigh(b));
1529#endif
1530}
1531
1532/// \brief Polynomial multiplication
1533/// \param a the first term
1534/// \param b the second term
1535/// \returns vector product
1536/// \details VecPolyMultiply10LE() performs polynomial multiplication and presents
1537/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
1538/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
1539/// 64-bits of <tt>b</tt> are multiplied.
1540/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1541/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
1542/// \par Wraps
1543/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1544/// \since Crypto++ 8.0
1546{
1547#if (CRYPTOPP_BIG_ENDIAN)
1549#else
1550 return VecPolyMultiply(VecGetHigh(a), b);
1551#endif
1552}
1553
1554/// \brief Polynomial multiplication
1555/// \param a the first term
1556/// \param b the second term
1557/// \returns vector product
1558/// \details VecPolyMultiply11LE() performs polynomial multiplication and presents
1559/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
1560/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
1561/// are multiplied.
1562/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
1563/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
1564/// \par Wraps
1565/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
1566/// \since Crypto++ 8.0
1568{
1569#if (CRYPTOPP_BIG_ENDIAN)
1570 return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
1571#else
1572 return VecPolyMultiply(VecGetLow(a), b);
1573#endif
1574}
1575
1576//@}
1577
1578/// \name AES ENCRYPTION
1579//@{
1580
1581/// \brief One round of AES encryption
1582/// \tparam T1 vector type
1583/// \tparam T2 vector type
1584/// \param state the state vector
1585/// \param key the subkey vector
1586/// \details VecEncrypt() performs one round of AES encryption of state
1587/// using subkey key. The return vector is the same type as vec1.
1588/// \details VecEncrypt() is available on POWER8 and above.
1589/// \par Wraps
1590/// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
1591/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1592template <class T1, class T2>
1593inline T1 VecEncrypt(const T1 state, const T2 key)
1594{
1595#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1596 return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
1597#elif defined(__clang__)
1598 return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
1599#elif defined(__GNUC__)
1600 return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
1601#else
1602 CRYPTOPP_ASSERT(0);
1603#endif
1604}
1605
1606/// \brief Final round of AES encryption
1607/// \tparam T1 vector type
1608/// \tparam T2 vector type
1609/// \param state the state vector
1610/// \param key the subkey vector
1611/// \details VecEncryptLast() performs the final round of AES encryption
1612/// of state using subkey key. The return vector is the same type as vec1.
1613/// \details VecEncryptLast() is available on POWER8 and above.
1614/// \par Wraps
1615/// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
1616/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1617template <class T1, class T2>
1618inline T1 VecEncryptLast(const T1 state, const T2 key)
1619{
1620#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1621 return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
1622#elif defined(__clang__)
1623 return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
1624#elif defined(__GNUC__)
1625 return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
1626#else
1627 CRYPTOPP_ASSERT(0);
1628#endif
1629}
1630
1631/// \brief One round of AES decryption
1632/// \tparam T1 vector type
1633/// \tparam T2 vector type
1634/// \param state the state vector
1635/// \param key the subkey vector
1636/// \details VecDecrypt() performs one round of AES decryption of state
1637/// using subkey key. The return vector is the same type as vec1.
1638/// \details VecDecrypt() is available on POWER8 and above.
1639/// \par Wraps
1640/// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
1641/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1642template <class T1, class T2>
1643inline T1 VecDecrypt(const T1 state, const T2 key)
1644{
1645#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1646 return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
1647#elif defined(__clang__)
1648 return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
1649#elif defined(__GNUC__)
1650 return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
1651#else
1652 CRYPTOPP_ASSERT(0);
1653#endif
1654}
1655
1656/// \brief Final round of AES decryption
1657/// \tparam T1 vector type
1658/// \tparam T2 vector type
1659/// \param state the state vector
1660/// \param key the subkey vector
1661/// \details VecDecryptLast() performs the final round of AES decryption
1662/// of state using subkey key. The return vector is the same type as vec1.
1663/// \details VecDecryptLast() is available on POWER8 and above.
1664/// \par Wraps
1665/// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
1666/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1667template <class T1, class T2>
1668inline T1 VecDecryptLast(const T1 state, const T2 key)
1669{
1670#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1671 return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
1672#elif defined(__clang__)
1673 return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
1674#elif defined(__GNUC__)
1675 return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
1676#else
1677 CRYPTOPP_ASSERT(0);
1678#endif
1679}
1680
1681//@}
1682
1683/// \name SHA DIGESTS
1684//@{
1685
1686/// \brief SHA256 Sigma functions
1687/// \tparam func function
1688/// \tparam fmask function mask
1689/// \tparam T vector type
1690/// \param vec the block to transform
1691/// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
1692/// func and fmask. The return vector is the same type as vec.
1693/// \details VecSHA256() is available on POWER8 and above.
1694/// \par Wraps
1695/// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
1696/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1697template <int func, int fmask, class T>
1698inline T VecSHA256(const T vec)
1699{
1700#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1701 return (T)__vshasigmaw((uint32x4_p)vec, func, fmask);
1702#elif defined(__clang__)
1703 return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)vec, func, fmask);
1704#elif defined(__GNUC__)
1705 return (T)__builtin_crypto_vshasigmaw((uint32x4_p)vec, func, fmask);
1706#else
1707 CRYPTOPP_ASSERT(0);
1708#endif
1709}
1710
1711/// \brief SHA512 Sigma functions
1712/// \tparam func function
1713/// \tparam fmask function mask
1714/// \tparam T vector type
1715/// \param vec the block to transform
1716/// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
1717/// func and fmask. The return vector is the same type as vec.
1718/// \details VecSHA512() is available on POWER8 and above.
1719/// \par Wraps
1720/// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
1721/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1722template <int func, int fmask, class T>
1723inline T VecSHA512(const T vec)
1724{
1725#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1726 return (T)__vshasigmad((uint64x2_p)vec, func, fmask);
1727#elif defined(__clang__)
1728 return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)vec, func, fmask);
1729#elif defined(__GNUC__)
1730 return (T)__builtin_crypto_vshasigmad((uint64x2_p)vec, func, fmask);
1731#else
1732 CRYPTOPP_ASSERT(0);
1733#endif
1734}
1735
1736//@}
1737
1738#endif // __CRYPTO__
1739
1740#endif // _ALTIVEC_
1741
1742NAMESPACE_END
1743
1744#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
1745# pragma GCC diagnostic pop
1746#endif
1747
1748#endif // CRYPTOPP_PPC_CRYPTO_H
Library configuration file.
Utility functions for the Crypto++ library.
Crypto++ library namespace.
uint32x4_p VecZero()
The 0 vector.
Definition: ppc_simd.h:145
uint32x4_p VecRotateRight(const uint32x4_p vec)
Rotate a packed vector right.
Definition: ppc_simd.h:1281
T1 VecOr(const T1 vec1, const T2 vec2)
OR two vectors.
Definition: ppc_simd.h:899
uint64x2_p VecPolyMultiply11LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1567
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:440
void VecStore_ALTIVEC(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:528
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
Definition: ppc_simd.h:383
T VecRotateRightOctet(const T vec)
Rotate a vector right.
Definition: ppc_simd.h:1167
T VecShiftRightOctet(const T vec)
Shift a vector right.
Definition: ppc_simd.h:1103
uint64x2_p VecPolyMultiply00LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1501
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:129
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:751
T VecShiftLeftOctet(const T vec)
Shift a vector left.
Definition: ppc_simd.h:1056
uint32x4_p VecLoad_ALTIVEC(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:193
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1010
T VecMergeHigh(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1217
T VecSHA256(const T vec)
SHA256 Sigma functions.
Definition: ppc_simd.h:1698
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:119
bool VecNotEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1424
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:916
uint64x2_p VecPolyMultiply10LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1545
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:139
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:956
bool VecEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1407
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
Definition: ppc_simd.h:1618
T VecMergeLow(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1231
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
Definition: ppc_simd.h:1593
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
Definition: ppc_simd.h:1668
uint32x4_p VecPolyMultiply(const uint32x4_p &a, const uint32x4_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1452
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:939
uint32x4_p VecRotateLeft(const uint32x4_p vec)
Rotate a packed vector left.
Definition: ppc_simd.h:1187
T VecRotateLeftOctet(const T vec)
Rotate a vector left.
Definition: ppc_simd.h:1142
T VecSHA512(const T vec)
SHA512 Sigma functions.
Definition: ppc_simd.h:1723
T1 VecAnd(const T1 vec1, const T2 vec2)
AND two vectors.
Definition: ppc_simd.h:882
uint32x4_p VecShiftRight(const uint32x4_p vec)
Shift a packed vector right.
Definition: ppc_simd.h:1296
T VecGetHigh(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1385
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
Definition: ppc_simd.h:1643
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:605
T VecReverse(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:169
uint32x4_p VecShiftLeft(const uint32x4_p vec)
Shift a packed vector left.
Definition: ppc_simd.h:1202
uint32x4_p VecOne()
The 1 vector.
Definition: ppc_simd.h:154
T VecGetLow(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1363
uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Add two vectors.
Definition: ppc_simd.h:974
T VecSwapWords(const T vec)
Exchange high and low double words.
Definition: ppc_simd.h:1346
__vector unsigned short uint16x8_p
Vector of 16-bit elements.
Definition: ppc_simd.h:124
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:253
uint64x2_p VecPolyMultiply01LE(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:1523
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69