Crypto++ 8.2
Free C++ class library of cryptographic schemes
adv_simd.h
// adv_simd.h - written and placed in the public domain by Jeffrey Walton

/// \file adv_simd.h
/// \brief Template for AdvancedProcessBlocks and SIMD processing

// The SIMD-based implementations for ciphers that use SSE, NEON and Power7
// have a common pattern. Namely, they have a specialized implementation of
// AdvancedProcessBlocks which processes multiple blocks using hardware
// acceleration. After several implementations we noticed a lot of copy and
// paste occurring. adv_simd.h provides a template to avoid the copy and paste.
//
// There are 11 templates provided in this file. The number following the
// function name, 64 or 128, is the block size. The name following the block
// size is the arrangement and acceleration. For example 4x1_SSE means Intel
// SSE using two encrypt (or decrypt) functions: one that operates on 4 SIMD
// words, and one that operates on 1 SIMD word.
//
// The distinction between SIMD words and cipher blocks is important
// because 64-bit ciphers use one SIMD word for two cipher blocks. For
// example, AdvancedProcessBlocks64_6x2_ALTIVEC operates on 6 and 2 SIMD
// words, which is 12 and 4 cipher blocks. The function will do the right
// thing even if there is only one 64-bit block to encrypt.
//
// * AdvancedProcessBlocks64_2x1_SSE
// * AdvancedProcessBlocks64_4x1_SSE
// * AdvancedProcessBlocks128_4x1_SSE
// * AdvancedProcessBlocks64_6x2_SSE
// * AdvancedProcessBlocks128_6x2_SSE
// * AdvancedProcessBlocks64_6x2_NEON
// * AdvancedProcessBlocks128_4x1_NEON
// * AdvancedProcessBlocks128_6x2_NEON
// * AdvancedProcessBlocks64_6x2_ALTIVEC
// * AdvancedProcessBlocks128_4x1_ALTIVEC
// * AdvancedProcessBlocks128_6x1_ALTIVEC
//
// If an arrangement ends in 2, like 6x2, then the template will handle the
// single block case by padding with 0's and using the two SIMD word
// function. This happens at most once when processing multiple blocks.
// The extra processing of a zero block is trivial and worth the tradeoff.
//
// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
// results in a failed link due to the const/non-const mismatch.
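//
// A minimal sketch of the calling convention (hypothetical cipher names, for
// illustration only). A 6x2 NEON implementation supplies two functions shaped
// like the following and forwards its AdvancedProcessBlocks call to the
// matching template:
//
//   inline void HC_Enc2(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subkeys, unsigned int rounds);
//   inline void HC_Enc6(uint32x4_t &block0, uint32x4_t &block1,
//       uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
//       uint32x4_t &block5, const word32 *subkeys, unsigned int rounds);
//
//   size_t HC_AdvancedProcessBlocks_NEON(const word32 *subKeys, size_t rounds,
//       const byte *inBlocks, const byte *xorBlocks, byte *outBlocks,
//       size_t length, word32 flags)
//   {
//       return AdvancedProcessBlocks64_6x2_NEON(HC_Enc2, HC_Enc6,
//           subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
//   }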

#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //
ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)

ANONYMOUS_NAMESPACE_END
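
// Note on the flags (mirroring the BlockTransformation contract): when
// BT_InBlockIsCounter is set, inBlocks is a mutable counter array that the
// templates increment in place; when xorBlocks is non-NULL it is XOR'd into
// the input (BT_XorInput) or into the output (no BT_XorInput), as seen below.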

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);
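    // s_one adds {0,1} to a pair of 64-bit counters and s_two adds {2,2}.
    // The 1<<24 and 2<<24 constants land on the least significant byte of
    // each big-endian counter once the 8-byte CTR block is dup-loaded below.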

    const size_t blockSize = 8;
    const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
        outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the NEON word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);
                block2 = vaddq_u32(s_two, block1);
                block3 = vaddq_u32(s_two, block2);
                block4 = vaddq_u32(s_two, block3);
                block5 = vaddq_u32(s_two, block4);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*neonBlockSize;
        }

        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the NEON word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = vaddq_u32(s_two, block0);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*neonBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_t block, zero = {0};

            const uint8x8_t v = vld1_u8(inBlocks);
            block = vreinterpretq_u32_u8(vcombine_u8(v,v));

            if (xorInput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            vst1_u8(const_cast<byte*>(outBlocks),
                vget_low_u8(vreinterpretq_u8_u32(block)));

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
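
// Like every AdvancedProcessBlocks worker, the templates return the number of
// bytes left unprocessed (less than one full block), which the caller handles
// with its scalar fallback.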

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
/// same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. The vector type is
/// usually uint32x4_t or uint64x2_t. F1, F4, and W must use the same word and
/// vector type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const unsigned int w_two[] = {0, 2<<24, 0, 2<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);
    const uint32x4_t s_two = vld1q_u32(w_two);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END // CryptoPP

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif
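
// Illustrative only: with the SunCC workaround in place, the SSE templates
// declare their subkey parameter as MAYBE_CONST W *subKeys, and call sites
// that hold a const table can pass it through MAYBE_UNCONST_CAST, e.g.
//
//   AdvancedProcessBlocks64_6x2_SSE(HC_Enc2, HC_Enc6,
//       MAYBE_UNCONST_CAST(word32*, subKeys), rounds,
//       inBlocks, xorBlocks, outBlocks, length, flags);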

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif
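
// The casts are used at every load and store below, for example
// _mm_loadu_si128(CONST_M128_CAST(inBlocks)) and
// _mm_storeu_si128(M128_CAST(outBlocks), block).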

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 2 blocks
/// \tparam F1 function to process 1 64-bit block
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_2x1_SSE processes 2 and 1 SSE SIMD words
/// at a time.
/// \details The subkey type is usually word32 or word64. F1 and F2 must use the
/// same word type.
template <typename F1, typename F2, typename W>
inline size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        double temp[2];
        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the XMM word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Store the next counter. When BT_InBlockIsCounter is set then
                // inBlocks is backed by m_counterArray which is non-const.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            std::memcpy(temp, inBlocks, blockSize);
            __m128i block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func1(block, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
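
// Note: the double temp[2] buffer above stages 8-byte half-blocks through the
// low 64 bits of an XMM register via _mm_load_sd/_mm_store_sd; the memcpy in
// and out keeps the byte accesses well-defined regardless of alignment.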

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 64-bit blocks
/// \tparam F6 function to process 6 64-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 SSE SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 8);

    const size_t blockSize = 8;
    const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
        outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        double temp[2];
        while (length >= 6*xmmBlockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the XMM word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);
                block2 = _mm_add_epi32(s_two, block1);
                block3 = _mm_add_epi32(s_two, block2);
                block4 = _mm_add_epi32(s_two, block3);
                block5 = _mm_add_epi32(s_two, block4);

                // Store the next counter. When BT_InBlockIsCounter is set then
                // inBlocks is backed by m_counterArray which is non-const.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*xmmBlockSize;
        }

        while (length >= 2*xmmBlockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);

                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
                // After the dup load we have two counters in the XMM word. Then we need
                // to increment the low ctr by 0 and the high ctr by 1.
                std::memcpy(temp, inBlocks, blockSize);
                block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));

                // After initial increment of {0,1} remaining counters increment by {2,2}.
                block1 = _mm_add_epi32(s_two, block0);

                // Store the next counter. When BT_InBlockIsCounter is set then
                // inBlocks is backed by m_counterArray which is non-const.
                _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
                std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*xmmBlockSize;
        }
    }

    if (length)
    {
        // Adjust to real block size
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            double temp[2];
            __m128i block, zero = _mm_setzero_si128();
            std::memcpy(temp, inBlocks, blockSize);
            block = _mm_castpd_si128(_mm_load_sd(temp));

            if (xorInput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                std::memcpy(temp, xorBlocks, blockSize);
                block = _mm_xor_si128(block,
                    _mm_castpd_si128(_mm_load_sd(temp)));
            }

            _mm_store_sd(temp, _mm_castsi128_pd(block));
            std::memcpy(outBlocks, temp, blockSize);

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
/// at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
1458
1459/// \brief AdvancedProcessBlocks for 1 and 4 blocks
1460/// \tparam F1 function to process 1 128-bit block
1461/// \tparam F4 function to process 4 128-bit blocks
1462/// \tparam W word type of the subkey table
1463/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
1464/// at a time.
1465/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1466/// same word type.
1467template <typename F1, typename F4, typename W>
1468inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1469 MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1470 const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1471{
1472 CRYPTOPP_ASSERT(subKeys);
1473 CRYPTOPP_ASSERT(inBlocks);
1474 CRYPTOPP_ASSERT(outBlocks);
1475 CRYPTOPP_ASSERT(length >= 16);
1476
1477 const size_t blockSize = 16;
1478 // const size_t xmmBlockSize = 16;
1479
1480 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1481 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1482 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1483
1484 // Clang and Coverity generate findings when xorBlocks is used as a flag.
1485 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1486 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1487
1488 if (flags & BT_ReverseDirection)
1489 {
1490 inBlocks = PtrAdd(inBlocks, length - blockSize);
1491 xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1492 outBlocks = PtrAdd(outBlocks, length - blockSize);
1493 inIncrement = 0-inIncrement;
1494 xorIncrement = 0-xorIncrement;
1495 outIncrement = 0-outIncrement;
1496 }
1497
1498 if (flags & BT_AllowParallel)
1499 {
1500 while (length >= 4*blockSize)
1501 {
1502 __m128i block0, block1, block2, block3;
1503 if (flags & BT_InBlockIsCounter)
1504 {
1505 // Increment of 1 in big-endian format, compatible with the ctr byte array.
1506 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1507 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1508 block1 = _mm_add_epi32(block0, s_one);
1509 block2 = _mm_add_epi32(block1, s_one);
1510 block3 = _mm_add_epi32(block2, s_one);
1511 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
1512 }
1513 else
1514 {
1515 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1516 inBlocks = PtrAdd(inBlocks, inIncrement);
1517 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1518 inBlocks = PtrAdd(inBlocks, inIncrement);
1519 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1520 inBlocks = PtrAdd(inBlocks, inIncrement);
1521 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1522 inBlocks = PtrAdd(inBlocks, inIncrement);
1523 }
1524
1525 if (xorInput)
1526 {
1527 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1528 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1529 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1530 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1531 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1532 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1533 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1534 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1535 }
1536
1537 func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1538
1539 if (xorOutput)
1540 {
1541 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1542 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1543 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1544 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1545 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1546 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1547 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1548 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1549 }
1550
1551 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1552 outBlocks = PtrAdd(outBlocks, outIncrement);
1553 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1554 outBlocks = PtrAdd(outBlocks, outIncrement);
1555 _mm_storeu_si128(M128_CAST(outBlocks), block2);
1556 outBlocks = PtrAdd(outBlocks, outIncrement);
1557 _mm_storeu_si128(M128_CAST(outBlocks), block3);
1558 outBlocks = PtrAdd(outBlocks, outIncrement);
1559
1560 length -= 4*blockSize;
1561 }
1562 }
1563
1564 while (length >= blockSize)
1565 {
1566 __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1567
1568 if (xorInput)
1569 block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1570
1571 if (flags & BT_InBlockIsCounter)
1572 const_cast<byte *>(inBlocks)[15]++;
1573
1574 func1(block, subKeys, static_cast<unsigned int>(rounds));
1575
1576 if (xorOutput)
1577 block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1578
1579 _mm_storeu_si128(M128_CAST(outBlocks), block);
1580
1581 inBlocks = PtrAdd(inBlocks, inIncrement);
1582 outBlocks = PtrAdd(outBlocks, outIncrement);
1583 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1584 length -= blockSize;
1585 }
1586
1587 return length;
1588}
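// A hedged usage sketch (hypothetical names throughout): a cipher's
// AdvancedProcessBlocks override typically forwards to the template above,
// passing its one-block and four-block SSE functions. The toy rounds below
// only xor a round key; they exist to make the sketch self-contained.
inline void MyCipher_Enc1(__m128i &block, const word32 *subKeys, unsigned int rounds)
{
    for (unsigned int i = 0; i != rounds; ++i)
        block = _mm_xor_si128(block, _mm_set1_epi32(static_cast<int>(subKeys[i])));
}

inline void MyCipher_Enc4(__m128i &block0, __m128i &block1, __m128i &block2,
    __m128i &block3, const word32 *subKeys, unsigned int rounds)
{
    MyCipher_Enc1(block0, subKeys, rounds); MyCipher_Enc1(block1, subKeys, rounds);
    MyCipher_Enc1(block2, subKeys, rounds); MyCipher_Enc1(block3, subKeys, rounds);
}

inline size_t MyCipher_AdvancedProcessBlocks_SSE(const word32 *subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(MyCipher_Enc1, MyCipher_Enc4,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}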
1589
1590/// \brief AdvancedProcessBlocks for 1 and 4 blocks
1591/// \tparam F1 function to process 1 64-bit block
1592 /// \tparam F4 function to process 4 64-bit blocks
1593/// \tparam W word type of the subkey table
1594/// \details AdvancedProcessBlocks64_4x1_SSE processes 4 and 1 SSE SIMD words
1595/// at a time.
1596/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1597/// same word type.
1598template <typename F1, typename F4, typename W>
1599inline size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
1600 MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1601 const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1602{
1603 CRYPTOPP_ASSERT(subKeys);
1604 CRYPTOPP_ASSERT(inBlocks);
1605 CRYPTOPP_ASSERT(outBlocks);
1606 CRYPTOPP_ASSERT(length >= 8);
1607
1608 const size_t blockSize = 8;
1609 const size_t xmmBlockSize = 16;
1610
1611 size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1612 size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1613 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1614
1615 // Clang and Coverity generate findings when xorBlocks is used as a flag.
1616 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1617 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1618
1619 if (flags & BT_ReverseDirection)
1620 {
1621 inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
1622 xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
1623 outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
1624 inIncrement = 0 - inIncrement;
1625 xorIncrement = 0 - xorIncrement;
1626 outIncrement = 0 - outIncrement;
1627 }
1628
1629 if (flags & BT_AllowParallel)
1630 {
1631 while (length >= 4*xmmBlockSize)
1632 {
1633 __m128i block0, block1, block2, block3;
1634 if (flags & BT_InBlockIsCounter)
1635 {
1636 // Increment of 1 and 2 in big-endian format, compatible with the ctr byte array.
1637 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1638 const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1639 double temp[2];
1640
1641 // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1642 // After the dup load we have two counters in the XMM word. Then we need
1643 // to increment the low ctr by 0 and the high ctr by 1.
1644 std::memcpy(temp, inBlocks, blockSize);
1645 block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1646
1647 // After initial increment of {0,1} remaining counters increment by {2,2}.
1648 block1 = _mm_add_epi32(s_two, block0);
1649 block2 = _mm_add_epi32(s_two, block1);
1650 block3 = _mm_add_epi32(s_two, block2);
1651
1652 // Store the next counter. When BT_InBlockIsCounter is set then
1653 // inBlocks is backed by m_counterArray which is non-const.
1654 _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
1655 std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1656 }
1657 else
1658 {
1659 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1660 inBlocks = PtrAdd(inBlocks, inIncrement);
1661 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1662 inBlocks = PtrAdd(inBlocks, inIncrement);
1663 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1664 inBlocks = PtrAdd(inBlocks, inIncrement);
1665 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1666 inBlocks = PtrAdd(inBlocks, inIncrement);
1667 }
1668
1669 if (xorInput)
1670 {
1671 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1672 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1673 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1674 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1675 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1676 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1677 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1678 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1679 }
1680
1681 func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1682
1683 if (xorOutput)
1684 {
1685 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1686 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1687 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1688 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1689 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1690 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1691 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1692 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1693 }
1694
1695 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1696 outBlocks = PtrAdd(outBlocks, outIncrement);
1697 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1698 outBlocks = PtrAdd(outBlocks, outIncrement);
1699 _mm_storeu_si128(M128_CAST(outBlocks), block2);
1700 outBlocks = PtrAdd(outBlocks, outIncrement);
1701 _mm_storeu_si128(M128_CAST(outBlocks), block3);
1702 outBlocks = PtrAdd(outBlocks, outIncrement);
1703
1704 length -= 4*xmmBlockSize;
1705 }
1706 }
1707
1708 if (length)
1709 {
1710 // Adjust to real block size
1711 if (flags & BT_ReverseDirection)
1712 {
1713 inIncrement += inIncrement ? blockSize : 0;
1714 xorIncrement += xorIncrement ? blockSize : 0;
1715 outIncrement += outIncrement ? blockSize : 0;
1716 inBlocks = PtrSub(inBlocks, inIncrement);
1717 xorBlocks = PtrSub(xorBlocks, xorIncrement);
1718 outBlocks = PtrSub(outBlocks, outIncrement);
1719 }
1720 else
1721 {
1722 inIncrement -= inIncrement ? blockSize : 0;
1723 xorIncrement -= xorIncrement ? blockSize : 0;
1724 outIncrement -= outIncrement ? blockSize : 0;
1725 }
1726
1727 while (length >= blockSize)
1728 {
1729 double temp[2];
1730 std::memcpy(temp, inBlocks, blockSize);
1731 __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1732
1733 if (xorInput)
1734 {
1735 std::memcpy(temp, xorBlocks, blockSize);
1736 block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1737 }
1738
1739 if (flags & BT_InBlockIsCounter)
1740 const_cast<byte *>(inBlocks)[7]++;
1741
1742 func1(block, subKeys, static_cast<unsigned int>(rounds));
1743
1744 if (xorOutput)
1745 {
1746 std::memcpy(temp, xorBlocks, blockSize);
1747 block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1748 }
1749
1750 _mm_store_sd(temp, _mm_castsi128_pd(block));
1751 std::memcpy(outBlocks, temp, blockSize);
1752
1753 inBlocks = PtrAdd(inBlocks, inIncrement);
1754 outBlocks = PtrAdd(outBlocks, outIncrement);
1755 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1756 length -= blockSize;
1757 }
1758 }
1759
1760 return length;
1761}
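// A standalone sketch of the dup-load counter packing used above for
// 64-bit ciphers (the helper name is hypothetical). The 8-byte big-endian
// counter is duplicated into both halves of an XMM word and the high copy
// is bumped by one, so the word holds the pair {ctr, ctr+1}. As in the
// template above, a carry out of the low counter byte is left to the caller.
inline __m128i MyLoadCounterPair64(const byte *ctr)
{
    double temp[2];
    std::memcpy(temp, ctr, 8);
    // 1<<24 in the highest 32-bit lane adds 1 to byte 15, the last byte
    // of the high 8-byte counter.
    const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
    return _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
}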
1762
1763NAMESPACE_END // CryptoPP
1764
1765#endif // CRYPTOPP_SSSE3_AVAILABLE
1766
1767// *********************** Altivec/Power 4 ********************** //
1768
1769#if defined(__ALTIVEC__)
1770
1771NAMESPACE_BEGIN(CryptoPP)
1772
1773/// \brief AdvancedProcessBlocks for 2 and 6 blocks
1774/// \tparam F2 function to process 2 128-bit blocks
1775/// \tparam F6 function to process 6 128-bit blocks
1776/// \tparam W word type of the subkey table
1777 /// \details AdvancedProcessBlocks64_6x2_ALTIVEC processes 6 and 2 Altivec SIMD words
1778/// at a time. For a single block the template uses F2 with a zero block.
1779/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1780/// same word type.
1781template <typename F2, typename F6, typename W>
1782inline size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
1783 const W *subKeys, size_t rounds, const byte *inBlocks,
1784 const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1785{
1786 CRYPTOPP_ASSERT(subKeys);
1787 CRYPTOPP_ASSERT(inBlocks);
1788 CRYPTOPP_ASSERT(outBlocks);
1789 CRYPTOPP_ASSERT(length >= 8);
1790
1791#if (CRYPTOPP_LITTLE_ENDIAN)
1792 enum {LowOffset=8, HighOffset=0};
1793 const uint32x4_p s_one = {1,0,0,0};
1794 const uint32x4_p s_two = {2,0,2,0};
1795#else
1796 enum {LowOffset=8, HighOffset=0};
1797 const uint32x4_p s_one = {0,0,0,1};
1798 const uint32x4_p s_two = {0,2,0,2};
1799#endif
1800
1801 const size_t blockSize = 8;
1802 const size_t vsxBlockSize = 16;
1803 CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];
1804
1805 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
1806 size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
1807 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;
1808
1809 // Clang and Coverity generate findings when xorBlocks is used as a flag.
1810 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1811 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1812
1813 if (flags & BT_ReverseDirection)
1814 {
1815 inBlocks = PtrAdd(inBlocks, length - vsxBlockSize);
1816 xorBlocks = PtrAdd(xorBlocks, length - vsxBlockSize);
1817 outBlocks = PtrAdd(outBlocks, length - vsxBlockSize);
1818 inIncrement = 0-inIncrement;
1819 xorIncrement = 0-xorIncrement;
1820 outIncrement = 0-outIncrement;
1821 }
1822
1823 if (flags & BT_AllowParallel)
1824 {
1825 while (length >= 6*vsxBlockSize)
1826 {
1827 uint32x4_p block0, block1, block2, block3, block4, block5;
1828 if (flags & BT_InBlockIsCounter)
1829 {
1830 // There is no easy way to load 8 bytes into a vector. It is
1831 // even harder without POWER8 due to the lack of 64-bit elements.
1832 std::memcpy(temp+LowOffset, inBlocks, 8);
1833 std::memcpy(temp+HighOffset, inBlocks, 8);
1834 uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1835
1836 // For 64-bit block ciphers we need to load the CTR block,
1837 // which is 8 bytes. After the dup load we have two counters
1838 // in the Altivec word. Then we need to increment the low ctr
1839 // by 0 and the high ctr by 1.
1840 block0 = VecAdd(s_one, ctr);
1841
1842 // After initial increment of {0,1} remaining counters
1843 // increment by {2,2}.
1844 block1 = VecAdd(s_two, block0);
1845 block2 = VecAdd(s_two, block1);
1846 block3 = VecAdd(s_two, block2);
1847 block4 = VecAdd(s_two, block3);
1848 block5 = VecAdd(s_two, block4);
1849
1850 // Update the counter in the caller.
1851 const_cast<byte*>(inBlocks)[7] += 12;
1852 }
1853 else
1854 {
1855 block0 = VecLoadBE(inBlocks);
1856 inBlocks = PtrAdd(inBlocks, inIncrement);
1857 block1 = VecLoadBE(inBlocks);
1858 inBlocks = PtrAdd(inBlocks, inIncrement);
1859 block2 = VecLoadBE(inBlocks);
1860 inBlocks = PtrAdd(inBlocks, inIncrement);
1861 block3 = VecLoadBE(inBlocks);
1862 inBlocks = PtrAdd(inBlocks, inIncrement);
1863 block4 = VecLoadBE(inBlocks);
1864 inBlocks = PtrAdd(inBlocks, inIncrement);
1865 block5 = VecLoadBE(inBlocks);
1866 inBlocks = PtrAdd(inBlocks, inIncrement);
1867 }
1868
1869 if (xorInput)
1870 {
1871 block0 = VecXor(block0, VecLoadBE(xorBlocks));
1872 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1873 block1 = VecXor(block1, VecLoadBE(xorBlocks));
1874 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1875 block2 = VecXor(block2, VecLoadBE(xorBlocks));
1876 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1877 block3 = VecXor(block3, VecLoadBE(xorBlocks));
1878 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1879 block4 = VecXor(block4, VecLoadBE(xorBlocks));
1880 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1881 block5 = VecXor(block5, VecLoadBE(xorBlocks));
1882 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1883 }
1884
1885 func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1886
1887 if (xorOutput)
1888 {
1889 block0 = VecXor(block0, VecLoadBE(xorBlocks));
1890 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1891 block1 = VecXor(block1, VecLoadBE(xorBlocks));
1892 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1893 block2 = VecXor(block2, VecLoadBE(xorBlocks));
1894 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1895 block3 = VecXor(block3, VecLoadBE(xorBlocks));
1896 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1897 block4 = VecXor(block4, VecLoadBE(xorBlocks));
1898 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1899 block5 = VecXor(block5, VecLoadBE(xorBlocks));
1900 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1901 }
1902
1903 VecStoreBE(block0, outBlocks);
1904 outBlocks = PtrAdd(outBlocks, outIncrement);
1905 VecStoreBE(block1, outBlocks);
1906 outBlocks = PtrAdd(outBlocks, outIncrement);
1907 VecStoreBE(block2, outBlocks);
1908 outBlocks = PtrAdd(outBlocks, outIncrement);
1909 VecStoreBE(block3, outBlocks);
1910 outBlocks = PtrAdd(outBlocks, outIncrement);
1911 VecStoreBE(block4, outBlocks);
1912 outBlocks = PtrAdd(outBlocks, outIncrement);
1913 VecStoreBE(block5, outBlocks);
1914 outBlocks = PtrAdd(outBlocks, outIncrement);
1915
1916 length -= 6*vsxBlockSize;
1917 }
1918
1919 while (length >= 2*vsxBlockSize)
1920 {
1921 uint32x4_p block0, block1;
1922 if (flags & BT_InBlockIsCounter)
1923 {
1924 // There is no easy way to load 8 bytes into a vector. It is
1925 // even harder without POWER8 due to the lack of 64-bit elements.
1926 std::memcpy(temp+LowOffset, inBlocks, 8);
1927 std::memcpy(temp+HighOffset, inBlocks, 8);
1928 uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1929
1930 // For 64-bit block ciphers we need to load the CTR block,
1931 // which is 8 bytes. After the dup load we have two counters
1932 // in the Altivec word. Then we need to increment the low ctr
1933 // by 0 and the high ctr by 1.
1934 block0 = VecAdd(s_one, ctr);
1935
1936 // After initial increment of {0,1} remaining counters
1937 // increment by {2,2}.
1938 block1 = VecAdd(s_two, block0);
1939
1940 // Update the counter in the caller.
1941 const_cast<byte*>(inBlocks)[7] += 4;
1942 }
1943 else
1944 {
1945 block0 = VecLoadBE(inBlocks);
1946 inBlocks = PtrAdd(inBlocks, inIncrement);
1947 block1 = VecLoadBE(inBlocks);
1948 inBlocks = PtrAdd(inBlocks, inIncrement);
1949 }
1950
1951 if (xorInput)
1952 {
1953 block0 = VecXor(block0, VecLoadBE(xorBlocks));
1954 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1955 block1 = VecXor(block1, VecLoadBE(xorBlocks));
1956 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1957 }
1958
1959 func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1960
1961 if (xorOutput)
1962 {
1963 block0 = VecXor(block0, VecLoadBE(xorBlocks));
1964 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1965 block1 = VecXor(block1, VecLoadBE(xorBlocks));
1966 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1967 }
1968
1969 VecStoreBE(block0, outBlocks);
1970 outBlocks = PtrAdd(outBlocks, outIncrement);
1971 VecStoreBE(block1, outBlocks);
1972 outBlocks = PtrAdd(outBlocks, outIncrement);
1973
1974 length -= 2*vsxBlockSize;
1975 }
1976 }
1977
1978 if (length)
1979 {
1980 // Adjust to real block size
1981 if (flags & BT_ReverseDirection)
1982 {
1983 inIncrement += inIncrement ? blockSize : 0;
1984 xorIncrement += xorIncrement ? blockSize : 0;
1985 outIncrement += outIncrement ? blockSize : 0;
1986 inBlocks = PtrSub(inBlocks, inIncrement);
1987 xorBlocks = PtrSub(xorBlocks, xorIncrement);
1988 outBlocks = PtrSub(outBlocks, outIncrement);
1989 }
1990 else
1991 {
1992 inIncrement -= inIncrement ? blockSize : 0;
1993 xorIncrement -= xorIncrement ? blockSize : 0;
1994 outIncrement -= outIncrement ? blockSize : 0;
1995 }
1996
1997 while (length >= blockSize)
1998 {
1999 uint32x4_p block, zero = {0};
2000
2001 // There is no easy way to load 8 bytes into a vector. It is
2002 // even harder without POWER8 due to the lack of 64-bit elements.
2003 // The high 8 bytes are "don't care", but if we don't
2004 // initialize the block then the compiler generates warnings.
2005 std::memcpy(temp+LowOffset, inBlocks, 8);
2006 std::memcpy(temp+HighOffset, inBlocks, 8); // don't care
2007 block = (uint32x4_p)VecLoadBE(temp);
2008
2009 if (xorInput)
2010 {
2011 std::memcpy(temp+LowOffset, xorBlocks, 8);
2012 std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2013 uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2014 block = VecXor(block, x);
2015 }
2016
2017 // Update the counter in the caller.
2018 if (flags & BT_InBlockIsCounter)
2019 const_cast<byte *>(inBlocks)[7]++;
2020
2021 func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
2022
2023 if (xorOutput)
2024 {
2025 std::memcpy(temp+LowOffset, xorBlocks, 8);
2026 std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2027 uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2028 block = VecXor(block, x);
2029 }
2030
2031 VecStoreBE(block, temp);
2032 std::memcpy(outBlocks, temp+LowOffset, 8);
2033
2034 inBlocks = PtrAdd(inBlocks, inIncrement);
2035 outBlocks = PtrAdd(outBlocks, outIncrement);
2036 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2037 length -= blockSize;
2038 }
2039 }
2040
2041 return length;
2042}
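// A standalone sketch of the memcpy-based dup load used above (the helper
// name is hypothetical). Lacking an 8-byte vector load, the counter is
// copied into both halves of an aligned 16-byte buffer and loaded as one
// big-endian vector; adding s_one then bumps the high counter so the word
// holds the pair {ctr, ctr+1}.
inline uint32x4_p MyLoadCounterPair64BE(const byte *ctr, const uint32x4_p s_one)
{
    CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];
    std::memcpy(temp+8, ctr, 8);  // LowOffset in the templates above
    std::memcpy(temp+0, ctr, 8);  // HighOffset in the templates above
    return VecAdd(s_one, (uint32x4_p)VecLoadBE(temp));
}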
2043
2044/// \brief AdvancedProcessBlocks for 1 and 4 blocks
2045/// \tparam F1 function to process 1 128-bit block
2046/// \tparam F4 function to process 4 128-bit blocks
2047/// \tparam W word type of the subkey table
2048/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
2049/// at a time.
2050/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
2051/// same word type.
2052template <typename F1, typename F4, typename W>
2053inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
2054 const W *subKeys, size_t rounds, const byte *inBlocks,
2055 const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2056{
2057 CRYPTOPP_ASSERT(subKeys);
2058 CRYPTOPP_ASSERT(inBlocks);
2059 CRYPTOPP_ASSERT(outBlocks);
2060 CRYPTOPP_ASSERT(length >= 16);
2061
2062#if (CRYPTOPP_LITTLE_ENDIAN)
2063 const uint32x4_p s_one = {1,0,0,0};
2064#else
2065 const uint32x4_p s_one = {0,0,0,1};
2066#endif
2067
2068 const size_t blockSize = 16;
2069 // const size_t vsxBlockSize = 16;
2070
2071 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2072 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2073 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2074
2075 // Clang and Coverity generate findings when xorBlocks is used as a flag.
2076 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2077 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2078
2079 if (flags & BT_ReverseDirection)
2080 {
2081 inBlocks = PtrAdd(inBlocks, length - blockSize);
2082 xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2083 outBlocks = PtrAdd(outBlocks, length - blockSize);
2084 inIncrement = 0-inIncrement;
2085 xorIncrement = 0-xorIncrement;
2086 outIncrement = 0-outIncrement;
2087 }
2088
2089 if (flags & BT_AllowParallel)
2090 {
2091 while (length >= 4*blockSize)
2092 {
2093 uint32x4_p block0, block1, block2, block3;
2094
2095 if (flags & BT_InBlockIsCounter)
2096 {
2097 block0 = VecLoadBE(inBlocks);
2098 block1 = VecAdd(block0, s_one);
2099 block2 = VecAdd(block1, s_one);
2100 block3 = VecAdd(block2, s_one);
2101
2102 // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2103 // CTR_ModePolicy::OperateKeystream is wired such that after
2104 // returning from this function CTR_ModePolicy will detect wrap
2105 // on the last counter byte and increment the next to last byte.
2106 // The problem is, with a big-endian load, inBlocks[15] is really
2107 // located at index 15. The vector addition using a 32-bit element
2108 // generates a carry into inBlocks[14] and then CTR_ModePolicy
2109 // increments inBlocks[14] too.
2110 const_cast<byte*>(inBlocks)[15] += 4; // four counters were consumed
2111 }
2112 else
2113 {
2114 block0 = VecLoadBE(inBlocks);
2115 inBlocks = PtrAdd(inBlocks, inIncrement);
2116 block1 = VecLoadBE(inBlocks);
2117 inBlocks = PtrAdd(inBlocks, inIncrement);
2118 block2 = VecLoadBE(inBlocks);
2119 inBlocks = PtrAdd(inBlocks, inIncrement);
2120 block3 = VecLoadBE(inBlocks);
2121 inBlocks = PtrAdd(inBlocks, inIncrement);
2122 }
2123
2124 if (xorInput)
2125 {
2126 block0 = VecXor(block0, VecLoadBE(xorBlocks));
2127 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2128 block1 = VecXor(block1, VecLoadBE(xorBlocks));
2129 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2130 block2 = VecXor(block2, VecLoadBE(xorBlocks));
2131 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2132 block3 = VecXor(block3, VecLoadBE(xorBlocks));
2133 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2134 }
2135
2136 func4(block0, block1, block2, block3, subKeys, rounds);
2137
2138 if (xorOutput)
2139 {
2140 block0 = VecXor(block0, VecLoadBE(xorBlocks));
2141 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2142 block1 = VecXor(block1, VecLoadBE(xorBlocks));
2143 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2144 block2 = VecXor(block2, VecLoadBE(xorBlocks));
2145 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2146 block3 = VecXor(block3, VecLoadBE(xorBlocks));
2147 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2148 }
2149
2150 VecStoreBE(block0, outBlocks);
2151 outBlocks = PtrAdd(outBlocks, outIncrement);
2152 VecStoreBE(block1, outBlocks);
2153 outBlocks = PtrAdd(outBlocks, outIncrement);
2154 VecStoreBE(block2, outBlocks);
2155 outBlocks = PtrAdd(outBlocks, outIncrement);
2156 VecStoreBE(block3, outBlocks);
2157 outBlocks = PtrAdd(outBlocks, outIncrement);
2158
2159 length -= 4*blockSize;
2160 }
2161 }
2162
2163 while (length >= blockSize)
2164 {
2165 uint32x4_p block = VecLoadBE(inBlocks);
2166
2167 if (xorInput)
2168 block = VecXor(block, VecLoadBE(xorBlocks));
2169
2170 if (flags & BT_InBlockIsCounter)
2171 const_cast<byte *>(inBlocks)[15]++;
2172
2173 func1(block, subKeys, rounds);
2174
2175 if (xorOutput)
2176 block = VecXor(block, VecLoadBE(xorBlocks));
2177
2178 VecStoreBE(block, outBlocks);
2179
2180 inBlocks = PtrAdd(inBlocks, inIncrement);
2181 outBlocks = PtrAdd(outBlocks, outIncrement);
2182 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2183 length -= blockSize;
2184 }
2185
2186 return length;
2187}
2188
2189/// \brief AdvancedProcessBlocks for 1 and 6 blocks
2190/// \tparam F1 function to process 1 128-bit block
2191/// \tparam F6 function to process 6 128-bit blocks
2192/// \tparam W word type of the subkey table
2193/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
2194/// at a time.
2195/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
2196/// same word type.
2197template <typename F1, typename F6, typename W>
2198inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
2199 const W *subKeys, size_t rounds, const byte *inBlocks,
2200 const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2201{
2202 CRYPTOPP_ASSERT(subKeys);
2203 CRYPTOPP_ASSERT(inBlocks);
2204 CRYPTOPP_ASSERT(outBlocks);
2205 CRYPTOPP_ASSERT(length >= 16);
2206
2207#if (CRYPTOPP_LITTLE_ENDIAN)
2208 const uint32x4_p s_one = {1,0,0,0};
2209#else
2210 const uint32x4_p s_one = {0,0,0,1};
2211#endif
2212
2213 const size_t blockSize = 16;
2214 // const size_t vsxBlockSize = 16;
2215
2216 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2217 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2218 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2219
2220 // Clang and Coverity generate findings when xorBlocks is used as a flag.
2221 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2222 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2223
2224 if (flags & BT_ReverseDirection)
2225 {
2226 inBlocks = PtrAdd(inBlocks, length - blockSize);
2227 xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2228 outBlocks = PtrAdd(outBlocks, length - blockSize);
2229 inIncrement = 0-inIncrement;
2230 xorIncrement = 0-xorIncrement;
2231 outIncrement = 0-outIncrement;
2232 }
2233
2234 if (flags & BT_AllowParallel)
2235 {
2236 while (length >= 6*blockSize)
2237 {
2238 uint32x4_p block0, block1, block2, block3, block4, block5;
2239
2240 if (flags & BT_InBlockIsCounter)
2241 {
2242 block0 = VecLoadBE(inBlocks);
2243 block1 = VecAdd(block0, s_one);
2244 block2 = VecAdd(block1, s_one);
2245 block3 = VecAdd(block2, s_one);
2246 block4 = VecAdd(block3, s_one);
2247 block5 = VecAdd(block4, s_one);
2248
2249 // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2250 // CTR_ModePolicy::OperateKeystream is wired such that after
2251 // returning from this function CTR_ModePolicy will detect wrap
2252 // on the last counter byte and increment the next to last byte.
2253 // The problem is, with a big-endian load, inBlocks[15] is really
2254 // located at index 15. The vector addition using a 32-bit element
2255 // generates a carry into inBlocks[14] and then CTR_ModePolicy
2256 // increments inBlocks[14] too.
2257 //
2258 // To find this bug we needed a test case with a ctr of 0xNN...FA.
2259 // The last octet is 0xFA and adding 6 creates the wrap to trigger
2260 // the issue. If the last octet was 0xFC then 4 would trigger it.
2261 // We dumb-lucked into the test with SPECK-128. The test case of
2262 // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
2263 uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
2264 VecStoreBE(temp, const_cast<byte*>(inBlocks));
2265 }
2266 else
2267 {
2268 block0 = VecLoadBE(inBlocks);
2269 inBlocks = PtrAdd(inBlocks, inIncrement);
2270 block1 = VecLoadBE(inBlocks);
2271 inBlocks = PtrAdd(inBlocks, inIncrement);
2272 block2 = VecLoadBE(inBlocks);
2273 inBlocks = PtrAdd(inBlocks, inIncrement);
2274 block3 = VecLoadBE(inBlocks);
2275 inBlocks = PtrAdd(inBlocks, inIncrement);
2276 block4 = VecLoadBE(inBlocks);
2277 inBlocks = PtrAdd(inBlocks, inIncrement);
2278 block5 = VecLoadBE(inBlocks);
2279 inBlocks = PtrAdd(inBlocks, inIncrement);
2280 }
2281
2282 if (xorInput)
2283 {
2284 block0 = VecXor(block0, VecLoadBE(xorBlocks));
2285 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2286 block1 = VecXor(block1, VecLoadBE(xorBlocks));
2287 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2288 block2 = VecXor(block2, VecLoadBE(xorBlocks));
2289 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2290 block3 = VecXor(block3, VecLoadBE(xorBlocks));
2291 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2292 block4 = VecXor(block4, VecLoadBE(xorBlocks));
2293 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2294 block5 = VecXor(block5, VecLoadBE(xorBlocks));
2295 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2296 }
2297
2298 func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
2299
2300 if (xorOutput)
2301 {
2302 block0 = VecXor(block0, VecLoadBE(xorBlocks));
2303 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2304 block1 = VecXor(block1, VecLoadBE(xorBlocks));
2305 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2306 block2 = VecXor(block2, VecLoadBE(xorBlocks));
2307 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2308 block3 = VecXor(block3, VecLoadBE(xorBlocks));
2309 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2310 block4 = VecXor(block4, VecLoadBE(xorBlocks));
2311 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2312 block5 = VecXor(block5, VecLoadBE(xorBlocks));
2313 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2314 }
2315
2316 VecStoreBE(block0, outBlocks);
2317 outBlocks = PtrAdd(outBlocks, outIncrement);
2318 VecStoreBE(block1, outBlocks);
2319 outBlocks = PtrAdd(outBlocks, outIncrement);
2320 VecStoreBE(block2, outBlocks);
2321 outBlocks = PtrAdd(outBlocks, outIncrement);
2322 VecStoreBE(block3, outBlocks);
2323 outBlocks = PtrAdd(outBlocks, outIncrement);
2324 VecStoreBE(block4, outBlocks);
2325 outBlocks = PtrAdd(outBlocks, outIncrement);
2326 VecStoreBE(block5, outBlocks);
2327 outBlocks = PtrAdd(outBlocks, outIncrement);
2328
2329 length -= 6*blockSize;
2330 }
2331 }
2332
2333 while (length >= blockSize)
2334 {
2335 uint32x4_p block = VecLoadBE(inBlocks);
2336
2337 if (xorInput)
2338 block = VecXor(block, VecLoadBE(xorBlocks));
2339
2340 if (flags & BT_InBlockIsCounter)
2341 const_cast<byte *>(inBlocks)[15]++;
2342
2343 func1(block, subKeys, rounds);
2344
2345 if (xorOutput)
2346 block = VecXor(block, VecLoadBE(xorBlocks));
2347
2348 VecStoreBE(block, outBlocks);
2349
2350 inBlocks = PtrAdd(inBlocks, inIncrement);
2351 outBlocks = PtrAdd(outBlocks, outIncrement);
2352 xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2353 length -= blockSize;
2354 }
2355
2356 return length;
2357}
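// A sketch isolating the counter write-back trick from the CTR path above
// (the helper name is hypothetical). The working blocks are produced with
// 32-bit element adds, but the value stored back for the caller is computed
// with 8-bit elements: each byte wraps independently, no carry leaks into
// inBlocks[14], and CTR_ModePolicy performs the inter-byte carry itself.
inline void MyStoreNextCounter(const uint32x4_p last, const uint32x4_p s_one, byte *counter)
{
    const uint8x16_p next = VecAdd((uint8x16_p)last, (uint8x16_p)s_one);
    VecStoreBE(next, counter);
}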
2358
2359NAMESPACE_END // CryptoPP
2360
2361#endif // __ALTIVEC__
2362
2363#endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES