Crypto++ 8.2
Free C++ class library of cryptographic schemes
gf2n_simd.cpp
// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton
// Also based on PCLMULQDQ code by Jankowski, Laurent and
// O'Mahony from Intel (see reference below).
//
// This source file uses intrinsics and built-ins to gain access to
// CLMUL, ARMv8a, and Power8 instructions. A separate source file is
// needed because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
//
// Several speedups were taken from Intel Polynomial Multiplication
// Instruction and its Usage for Elliptic Curve Cryptography, by
// Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony,
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf
// There may be more speedups available; see https://eprint.iacr.org/2011/589.pdf.
// The IACR paper performs some optimizations that the compiler is
// expected to perform, like Common Subexpression Elimination to save
// on variables (among others). Note that the compiler may miss the
// optimization, so the IACR paper is still useful. However, the code is
// GPL3 and toxic for some users of the library...

#include "pch.h"
#include "config.h"

#ifndef CRYPTOPP_IMPORTS

#include "gf2n.h"

#if (CRYPTOPP_CLMUL_AVAILABLE)
# include <emmintrin.h>
# include <wmmintrin.h>
#endif

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# include "arm_simd.h"
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN

// ************************** ARMv8 ************************** //

using CryptoPP::word;

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)

// c1c0 = a * b
inline void
F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b)
{
    uint64x2_t t1, t2, z0={0};

    c0 = PMULL_00(a, b);
    c1 = PMULL_11(a, b);
    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
    t1 = veorq_u64(a, t1);
    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
    t2 = veorq_u64(b, t2);
    t1 = PMULL_00(t1, t2);
    t1 = veorq_u64(c0, t1);
    t1 = veorq_u64(c1, t1);
    t2 = t1;
    t1 = vextq_u64(z0, t1, 1);
    t2 = vextq_u64(t2, z0, 1);
    c0 = veorq_u64(c0, t1);
    c1 = veorq_u64(c1, t2);
}
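
// The intrinsics above implement a 128x128 carry-less multiply with one
// Karatsuba step over GF(2)[x]. For reference, the sketch below is a portable
// (and much slower) equivalent using only standard C++. It is illustrative
// only; the helper names F2N_Multiply_64x64_Ref and F2N_Multiply_128x128_Ref
// are hypothetical and not part of the library.
inline void
F2N_Multiply_64x64_Ref(uint64_t& hi, uint64_t& lo, uint64_t a, uint64_t b)
{
    // Bit-at-a-time carry-less (polynomial) multiply: hi:lo = a * b
    hi = lo = 0;
    for (unsigned int i = 0; i < 64; ++i)
        if ((b >> i) & 1)
        {
            lo ^= a << i;
            if (i > 0)
                hi ^= a >> (64 - i);
        }
}

inline void
F2N_Multiply_128x128_Ref(uint64_t c[4], const uint64_t a[2], const uint64_t b[2])
{
    // c3c2c1c0 = a1a0 * b1b0 using three 64x64 multiplies (Karatsuba):
    //   a*b = a1*b1*x^128 + ((a0+a1)*(b0+b1) + a0*b0 + a1*b1)*x^64 + a0*b0
    uint64_t lo1, lo0, hi1, hi0, mid1, mid0;
    F2N_Multiply_64x64_Ref(lo1, lo0, a[0], b[0]);
    F2N_Multiply_64x64_Ref(hi1, hi0, a[1], b[1]);
    F2N_Multiply_64x64_Ref(mid1, mid0, a[0] ^ a[1], b[0] ^ b[1]);
    mid0 ^= lo0 ^ hi0;  // fold out the outer products (XOR is subtraction)
    mid1 ^= lo1 ^ hi1;
    c[0] = lo0;
    c[1] = lo1 ^ mid0;
    c[2] = hi0 ^ mid1;
    c[3] = hi1;
}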

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    uint64x2_t c4, c5;
    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);

    x0 = veorq_u64(x0, x1);
    y0 = veorq_u64(y0, y1);

    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);

    c4 = veorq_u64(c4, c0);
    c4 = veorq_u64(c4, c2);
    c5 = veorq_u64(c5, c1);
    c5 = veorq_u64(c5, c3);
    c1 = veorq_u64(c1, c4);
    c2 = veorq_u64(c2, c5);
}
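
// The Karatsuba identity used above, with + denoting XOR (addition in
// GF(2)[x]):
//   (a1*x^128 + a0) * (b1*x^128 + b0)
//       = a1*b1*x^256 + ((a0+a1)*(b0+b1) + a0*b0 + a1*b1)*x^128 + a0*b0
// c3c2 holds a1*b1, c1c0 holds a0*b0, and c5c4 holds (a0+a1)*(b0+b1); the
// final six XORs fold the middle term into c2c1.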

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    c0 = PMULL_00(a0, a0);
    c1 = PMULL_11(a0, a0);
    c2 = PMULL_00(a1, a1);
    c3 = PMULL_11(a1, a1);
}
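
// Squaring over GF(2)[x] has no cross terms, so the square of a 64-bit
// polynomial is just its bits spread to the even positions of a 128-bit
// result; the four carry-less multiplies above do exactly that for the four
// 64-bit halves. An illustrative scalar equivalent for one half follows. It
// is not part of the library; F2N_Square_64_Ref is a hypothetical name.
inline void
F2N_Square_64_Ref(uint64_t& hi, uint64_t& lo, uint64_t a)
{
    hi = lo = 0;
    for (unsigned int i = 0; i < 32; ++i)
    {
        lo |= ((a >> i) & 1) << (2*i);
        hi |= ((a >> (i+32)) & 1) << (2*i);
    }
}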

// x = (x << n), z = 0
template <unsigned int N>
inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
{
    uint64x2_t u=x, v, z={0};
    x = vshlq_n_u64(x, N);
    u = vshrq_n_u64(u, (64-N));
    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
    x = vorrq_u64(x, v);
    return x;
}
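
// Illustrative scalar equivalent of the lane-crossing shift above (not part
// of the library; ShiftLeft128_Ref is a hypothetical name): shift the 128-bit
// value hi:lo left by N bits, 0 < N < 64.
template <unsigned int N>
inline void ShiftLeft128_Ref(uint64_t& hi, uint64_t& lo)
{
    hi = (hi << N) | (lo >> (64-N));
    lo = lo << N;
}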

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. For background, see
// the Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
inline void
GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
{
    const unsigned int mask[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
    };

    uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0};
    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
    b1 = c1; a1 = c1;
    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
    a1 = vshlq_n_u64(a1, 23);
    a1 = vshrq_n_u64(a1, 23);
    c1 = vorrq_u64(a1, a0);
    b2 = vshrq_n_u64(c2, (64-23));
    c3 = ShiftLeft128_ARMv8<23>(c3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    c3 = vorrq_u64(c3, a0);
    b1 = vshrq_n_u64(b1, (64-23));
    c2 = ShiftLeft128_ARMv8<23>(c2);
    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
    c2 = vorrq_u64(c2, a0);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, a0);
    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
    b3 = veorq_u64(b3, a0);
    b1 = vshrq_n_u64(b3, (64-23));
    b3 = ShiftLeft128_ARMv8<23>(b3);
    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b1);
    c2 = veorq_u64(c2, b3);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_ARMv8<10>(b2);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
    c2 = veorq_u64(c2, a0);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    a0 = vorrq_u64(a0, a1);
    c3 = veorq_u64(c3, a0);
    c0 = veorq_u64(c0, c2);
    c1 = veorq_u64(c1, c3);
    c1 = vandq_u64(c1, m0);
}
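
// The SIMD routine above folds the high bits of the product (degree up to
// 464 for 233-bit inputs) back into the low 233 bits with a handful of
// shifts and XORs. The loop below is a straightforward (and slow)
// bit-at-a-time reduction modulo f(x) = x^233 + x^74 + 1 that it must agree
// with. Illustrative only; GF2NT_233_Reduce_Ref is a hypothetical name, and
// 'c' holds the product as eight little-endian 64-bit words.
inline void
GF2NT_233_Reduce_Ref(uint64_t c[8])
{
    for (int i = 464; i >= 233; --i)  // deg(a*b) <= 464 for 233-bit inputs
    {
        if ((c[i/64] >> (i%64)) & 1)
        {
            // XOR in x^(i-233) * f(x) = x^i + x^(i-159) + x^(i-233),
            // which clears bit i.
            c[i/64]       ^= (uint64_t)1 << (i%64);
            c[(i-159)/64] ^= (uint64_t)1 << ((i-159)%64);
            c[(i-233)/64] ^= (uint64_t)1 << ((i-233)%64);
        }
    }
}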

#endif

// ************************** SSE ************************** //

#if (CRYPTOPP_CLMUL_AVAILABLE)

using CryptoPP::word;

// c1c0 = a * b
inline void
F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b)
{
    __m128i t1, t2;

    c0 = _mm_clmulepi64_si128(a, b, 0x00);
    c1 = _mm_clmulepi64_si128(a, b, 0x11);
    t1 = _mm_shuffle_epi32(a, 0xEE);
    t1 = _mm_xor_si128(a, t1);
    t2 = _mm_shuffle_epi32(b, 0xEE);
    t2 = _mm_xor_si128(b, t2);
    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
    t1 = _mm_xor_si128(c0, t1);
    t1 = _mm_xor_si128(c1, t1);
    t2 = t1;
    t1 = _mm_slli_si128(t1, 8);
    t2 = _mm_srli_si128(t2, 8);
    c0 = _mm_xor_si128(c0, t1);
    c1 = _mm_xor_si128(c1, t2);
}
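
// Note: _mm_shuffle_epi32(x, 0xEE) copies the upper 64-bit half of x into
// both halves; XORing with x then leaves a0+a1 (resp. b0+b1) in the lower
// half for the third carry-less multiply, the same trick as the
// vgetq_lane/vmovq broadcast in the ARMv8 routine above.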

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
{
    __m128i c4, c5;
    __m128i x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);

    x0 = _mm_xor_si128(x0, x1);
    y0 = _mm_xor_si128(y0, y1);

    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);

    c4 = _mm_xor_si128(c4, c0);
    c4 = _mm_xor_si128(c4, c2);
    c5 = _mm_xor_si128(c5, c1);
    c5 = _mm_xor_si128(c5, c3);
    c1 = _mm_xor_si128(c1, c4);
    c2 = _mm_xor_si128(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
    __m128i& c0, const __m128i& a1, const __m128i& a0)
{
    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
}

// x = (x << n), z = 0
template <unsigned int N>
inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
{
    __m128i u=x, v;
    x = _mm_slli_epi64(x, N);
    u = _mm_srli_epi64(u, (64-N));
    v = _mm_unpacklo_epi64(z, u);
    x = _mm_or_si128(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. For background, see
// the Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
inline void
GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
{
    const unsigned int m[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
    };

    __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0;
    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
    z0 = _mm_setzero_si128();
    b1 = c1; a1 = c1;
    a0 = _mm_move_epi64(c1);
    a1 = _mm_slli_epi64(a1, 23);
    a1 = _mm_srli_epi64(a1, 23);
    c1 = _mm_or_si128(a1, a0);
    b2 = _mm_srli_epi64(c2, (64-23));
    c3 = ShiftLeft128_SSE<23>(c3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    c3 = _mm_or_si128(c3, a0);
    b1 = _mm_srli_epi64(b1, (64-23));
    c2 = ShiftLeft128_SSE<23>(c2, z0);
    a0 = _mm_unpackhi_epi64(b1, z0);
    c2 = _mm_or_si128(c2, a0);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, a0);
    a0 = _mm_unpackhi_epi64(c3, z0);
    b3 = _mm_xor_si128(b3, a0);
    b1 = _mm_srli_epi64(b3, (64-23));
    b3 = ShiftLeft128_SSE<23>(b3, z0);
    b3 = _mm_unpackhi_epi64(b3, z0);
    b3 = _mm_or_si128(b3, b1);
    c2 = _mm_xor_si128(c2, b3);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    b2 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_SSE<10>(b2, z0);
    a0 = _mm_unpacklo_epi64(z0, b2);
    c2 = _mm_xor_si128(c2, a0);
    a0 = _mm_unpacklo_epi64(z0, b3);
    a1 = _mm_unpackhi_epi64(b2, z0);
    a0 = _mm_or_si128(a0, a1);
    c3 = _mm_xor_si128(c3, a0);
    c0 = _mm_xor_si128(c0, c2);
    c1 = _mm_xor_si128(c1, c3);
    c1 = _mm_and_si128(c1, m0);
}

#endif

// ************************* Power8 ************************* //

#if (CRYPTOPP_POWER8_VMULL_AVAILABLE)

using CryptoPP::byte;
using CryptoPP::word;
using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecLoad;
using CryptoPP::VecStore;

using CryptoPP::VecOr;
using CryptoPP::VecXor;
using CryptoPP::VecAnd;

using CryptoPP::VecPermute;
using CryptoPP::VecMergeLow;
using CryptoPP::VecMergeHigh;
using CryptoPP::VecShiftLeft;
using CryptoPP::VecShiftRight;

using CryptoPP::VecPolyMultiply00LE;
using CryptoPP::VecPolyMultiply11LE;

// c1c0 = a * b
inline void
F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b)
{
    uint64x2_p t1, t2;
    const uint64x2_p z0={0};

    c0 = VecPolyMultiply00LE(a, b);
    c1 = VecPolyMultiply11LE(a, b);
    t1 = VecMergeLow(a, a);
    t1 = VecXor(a, t1);
    t2 = VecMergeLow(b, b);
    t2 = VecXor(b, t2);
    t1 = VecPolyMultiply00LE(t1, t2);
    t1 = VecXor(c0, t1);
    t1 = VecXor(c1, t1);
    t2 = t1;
    t1 = VecMergeHigh(z0, t1);
    t2 = VecMergeLow(t2, z0);
    c0 = VecXor(c0, t1);
    c1 = VecXor(c1, t2);
}
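
// VecMergeLow(x, x) duplicates one 64-bit doubleword of x into both lanes;
// XORing with x then leaves a0+a1 (resp. b0+b1) in the lane consumed by
// VecPolyMultiply00LE, the same trick as the 0xEE shuffle in the SSE routine
// and the vgetq_lane/vmovq broadcast in the ARMv8 routine.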

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    uint64x2_p c4, c5;
    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);

    x0 = VecXor(x0, x1);
    y0 = VecXor(y0, y1);

    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);

    c4 = VecXor(c4, c0);
    c4 = VecXor(c4, c2);
    c5 = VecXor(c5, c1);
    c5 = VecXor(c5, c3);
    c1 = VecXor(c1, c4);
    c2 = VecXor(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    c0 = VecPolyMultiply00LE(a0, a0);
    c1 = VecPolyMultiply11LE(a0, a0);
    c2 = VecPolyMultiply00LE(a1, a1);
    c3 = VecPolyMultiply11LE(a1, a1);
}

// x = (x << n), z = 0
template <unsigned int N>
inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
{
    uint64x2_p u=x, v;
    const uint64x2_p z={0};

    x = VecShiftLeft<N>(x);
    u = VecShiftRight<64-N>(u);
    v = VecMergeHigh(z, u);
    x = VecOr(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. For background, see
// the Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
inline void
GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
{
    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);

    uint64x2_p b3, b2, b1, /*b0,*/ a1, a0;
    const uint64x2_p z0={0};

    b1 = c1; a1 = c1;
    a0 = VecMergeHigh(c1, z0);
    a1 = VecShiftLeft<23>(a1);
    a1 = VecShiftRight<23>(a1);
    c1 = VecOr(a1, a0);
    b2 = VecShiftRight<64-23>(c2);
    c3 = ShiftLeft128_POWER8<23>(c3);
    a0 = VecMergeLow(b2, z0);
    c3 = VecOr(c3, a0);
    b1 = VecShiftRight<64-23>(b1);
    c2 = ShiftLeft128_POWER8<23>(c2);
    a0 = VecMergeLow(b1, z0);
    c2 = VecOr(c2, a0);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    a0 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, a0);
    a0 = VecMergeLow(c3, z0);
    b3 = VecXor(b3, a0);
    b1 = VecShiftRight<64-23>(b3);
    b3 = ShiftLeft128_POWER8<23>(b3);
    b3 = VecMergeLow(b3, z0);
    b3 = VecOr(b3, b1);
    c2 = VecXor(c2, b3);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    b2 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_POWER8<10>(b2);
    a0 = VecMergeHigh(z0, b2);
    c2 = VecXor(c2, a0);
    a0 = VecMergeHigh(z0, b3);
    a1 = VecMergeLow(b2, z0);
    a0 = VecOr(a0, a1);
    c3 = VecXor(c3, a0);
    c0 = VecXor(c0, c2);
    c1 = VecXor(c1, c3);
    c1 = VecAnd(c1, m0);
}

#endif

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_CLMUL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    const __m128i* pBB = reinterpret_cast<const __m128i*>(pB);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);
    __m128i b0 = _mm_loadu_si128(pBB+0);
    __m128i b1 = _mm_loadu_si128(pBB+1);

    __m128i c0, c1, c2, c3;
    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}

void
GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);

    __m128i c0, c1, c2, c3;
    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}
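
// Example use of the exported routines, illustrative only and assuming a
// 64-bit word (each operand is 256 bits, i.e., four words in little-endian
// word order). The rest of the library dispatches to these routines at
// runtime when CLMUL is available.
//
//   word a[4] = {2, 0, 0, 0};      // the polynomial x
//   word b[4] = {3, 0, 0, 0};      // the polynomial x + 1
//   word c[4];
//   GF2NT_233_Multiply_Reduce_CLMUL(a, b, c);   // c = a*b mod x^233+x^74+1
//   GF2NT_233_Square_Reduce_CLMUL(a, c);        // c = a*a mod x^233+x^74+1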

#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);

    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

void
GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    const byte* pBB = reinterpret_cast<const byte*>(pB);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
    b0 = VecPermute(b0, m);
    b1 = VecPermute(b1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}
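
// Note: the permute mask above swaps the two 32-bit halves of each 64-bit
// lane. On big-endian targets this presumably places the loaded 32-bit words
// into the lane order the 64-bit carry-less multiplies expect; the inverse
// permute is applied before the stores.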

void
GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

#endif

NAMESPACE_END

#endif // CRYPTOPP_IMPORTS