Crypto++ 8.2
Free C++ class library of cryptographic schemes
arm_simd.h
Go to the documentation of this file.
1// arm_simd.h - written and placed in public domain by Jeffrey Walton
2
3/// \file arm_simd.h
4/// \brief Support functions for ARM and vector operations
5
6#ifndef CRYPTOPP_ARM_SIMD_H
7#define CRYPTOPP_ARM_SIMD_H
8
9#include "config.h"
10
11// C1189: error: This header is specific to ARM targets
12#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
13# include <arm_neon.h>
14#endif
15
16#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
17# include <stdint.h>
18# include <arm_acle.h>
19#endif
20
21#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
22
23/// \brief Polynomial multiplication
24/// \param a the first term
25/// \param b the second term
26/// \returns vector product
27/// \details PMULL_00() performs polynomial multiplication and presents
28/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
29/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
30/// are multiplied.
31/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
32/// is MSB and numbered 127, while the the rightmost bit is LSB and
33/// numbered 0.
34/// \since Crypto++ 8.0
35inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
36{
37#if defined(_MSC_VER)
38 const __n64 x = { vgetq_lane_u64(a, 0) };
39 const __n64 y = { vgetq_lane_u64(b, 0) };
40 return vmull_p64(x, y);
41#elif defined(__GNUC__)
42 uint64x2_t r;
43 __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
44 :"=w" (r) : "w" (a), "w" (b) );
45 return r;
46#else
47 return (uint64x2_t)(vmull_p64(
48 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
49 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
50#endif
51}
52
53/// \brief Polynomial multiplication
54/// \param a the first term
55/// \param b the second term
56/// \returns vector product
57/// \details PMULL_01 performs() polynomial multiplication and presents
58/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
59/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
60/// 64-bits of <tt>b</tt> are multiplied.
61/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
62/// is MSB and numbered 127, while the the rightmost bit is LSB and
63/// numbered 0.
64/// \since Crypto++ 8.0
65inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
66{
67#if defined(_MSC_VER)
68 const __n64 x = { vgetq_lane_u64(a, 0) };
69 const __n64 y = { vgetq_lane_u64(b, 1) };
70 return vmull_p64(x, y);
71#elif defined(__GNUC__)
72 uint64x2_t r;
73 __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
74 :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
75 return r;
76#else
77 return (uint64x2_t)(vmull_p64(
78 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
79 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
80#endif
81}
82
83/// \brief Polynomial multiplication
84/// \param a the first term
85/// \param b the second term
86/// \returns vector product
87/// \details PMULL_10() performs polynomial multiplication and presents
88/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
89/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
90/// 64-bits of <tt>b</tt> are multiplied.
91/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
92/// is MSB and numbered 127, while the the rightmost bit is LSB and
93/// numbered 0.
94/// \since Crypto++ 8.0
95inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
96{
97#if defined(_MSC_VER)
98 const __n64 x = { vgetq_lane_u64(a, 1) };
99 const __n64 y = { vgetq_lane_u64(b, 0) };
100 return vmull_p64(x, y);
101#elif defined(__GNUC__)
102 uint64x2_t r;
103 __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
104 :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
105 return r;
106#else
107 return (uint64x2_t)(vmull_p64(
108 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
109 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
110#endif
111}
112
113/// \brief Polynomial multiplication
114/// \param a the first term
115/// \param b the second term
116/// \returns vector product
117/// \details PMULL_11() performs polynomial multiplication and presents
118/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
119/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
120/// are multiplied.
121/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
122/// is MSB and numbered 127, while the the rightmost bit is LSB and
123/// numbered 0.
124/// \since Crypto++ 8.0
125inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
126{
127#if defined(_MSC_VER)
128 const __n64 x = { vgetq_lane_u64(a, 1) };
129 const __n64 y = { vgetq_lane_u64(b, 1) };
130 return vmull_p64(x, y);
131#elif defined(__GNUC__)
132 uint64x2_t r;
133 __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
134 :"=w" (r) : "w" (a), "w" (b) );
135 return r;
136#else
137 return (uint64x2_t)(vmull_p64(
138 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
139 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
140#endif
141}
142
143/// \brief Vector extraction
144/// \param a the first term
145/// \param b the second term
146/// \param c the byte count
147/// \returns vector
148/// \details VEXT_U8() extracts the first <tt>c</tt> bytes of vector
149/// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
150/// \since Crypto++ 8.0
151inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
152{
153#if defined(_MSC_VER)
154 return (uint64x2_t)vextq_u8(
155 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
156#else
157 uint64x2_t r;
158 __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
159 :"=w" (r) : "w" (a), "w" (b), "I" (c) );
160 return r;
161#endif
162}
163
164/// \brief Vector extraction
165/// \tparam C the byte count
166/// \param a the first term
167/// \param b the second term
168/// \returns vector
169/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
170/// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
171/// \since Crypto++ 8.0
172template <unsigned int C>
173inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
174{
175 // https://github.com/weidai11/cryptopp/issues/366
176#if defined(_MSC_VER)
177 return (uint64x2_t)vextq_u8(
178 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
179#else
180 uint64x2_t r;
181 __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
182 :"=w" (r) : "w" (a), "w" (b), "I" (C) );
183 return r;
184#endif
185}
186
187#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
188
189#endif // CRYPTOPP_ARM_SIMD_H
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:35
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:125
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:65
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:95
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
Vector extraction.
Definition: arm_simd.h:151
Library configuration file.