Crypto++ 8.2
sha_simd.cpp
1// sha_simd.cpp - written and placed in the public domain by
2// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3//
4// This source file uses intrinsics to gain access to SHA-NI and
5// ARMv8a SHA instructions. A separate source file is needed
6// because additional CXXFLAGS are required to enable the
7// appropriate instruction sets in some build configurations.
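//
// A build-configuration sketch (assumed flags, not taken from this file; the
// library's GNUmakefile and project files are authoritative): on x86 builds this
// translation unit is typically compiled with extra flags along the lines of
//     CXXFLAGS += -msse4.2 -msha
// and on ARMv8 builds with something like
//     CXXFLAGS += -march=armv8-a+crypto
// so the compiler accepts the SHA-NI and ARM SHA intrinsics used below.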
8
9#include "pch.h"
10#include "config.h"
11#include "sha.h"
12#include "misc.h"
13
14#if defined(CRYPTOPP_DISABLE_SHA_ASM)
15# undef CRYPTOPP_X86_ASM_AVAILABLE
16# undef CRYPTOPP_X32_ASM_AVAILABLE
17# undef CRYPTOPP_X64_ASM_AVAILABLE
18# undef CRYPTOPP_SSE2_ASM_AVAILABLE
19#endif
20
21#if (CRYPTOPP_SHANI_AVAILABLE)
22# include <nmmintrin.h>
23# include <immintrin.h>
24#endif
25
26// C1189: error: This header is specific to ARM targets
27#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
28# include <arm_neon.h>
29#endif
30
31#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
32# include <stdint.h>
33# include <arm_acle.h>
34#endif
35
36#if CRYPTOPP_POWER8_SHA_AVAILABLE
37# include "ppc_simd.h"
38#endif
39
40#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
41# include <signal.h>
42# include <setjmp.h>
43#endif
44
45#ifndef EXCEPTION_EXECUTE_HANDLER
46# define EXCEPTION_EXECUTE_HANDLER 1
47#endif
48
49// Clang __m128i casts
50#define M128_CAST(x) ((__m128i *)(void *)(x))
51#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
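// Explanatory note: these helpers cast through void* so that word32 state and
// message pointers can be handed to the unaligned _mm_loadu_si128 and
// _mm_storeu_si128 intrinsics without pointer-cast warnings from Clang.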
52
53// Squash MS LNK4221 and libtool warnings
54extern const char SHA_SIMD_FNAME[] = __FILE__;
55
56NAMESPACE_BEGIN(CryptoPP)
57
58// ***************** SHA key tables ********************
59
60extern const word32 SHA256_K[64];
61extern const word64 SHA512_K[80];
62
63// ***************** SIGILL probes ********************
64
65#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
66extern "C" {
67 typedef void (*SigHandler)(int);
68
69 static jmp_buf s_jmpSIGILL;
70 static void SigIllHandler(int)
71 {
72 longjmp(s_jmpSIGILL, 1);
73 }
74}
75#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
76
77#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
78bool CPU_ProbeSHA1()
79{
80#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
81 return false;
82#elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
83# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
84 volatile bool result = true;
85 __try
86 {
87 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
88 uint32x4_t data1 = vld1q_u32(w+0);
89 uint32x4_t data2 = vld1q_u32(w+4);
90 uint32x4_t data3 = vld1q_u32(w+8);
91
92 uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
93 uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
94 uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
95 uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
96 uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
97
98 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
99 }
100 __except (EXCEPTION_EXECUTE_HANDLER)
101 {
102 return false;
103 }
104 return result;
105# else
106
107 // longjmp and clobber warnings. Volatile is required.
108 // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
109 volatile bool result = true;
110
111 volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
112 if (oldHandler == SIG_ERR)
113 return false;
114
115 volatile sigset_t oldMask;
116 if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
117 return false;
118
119 if (setjmp(s_jmpSIGILL))
120 result = false;
121 else
122 {
123 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
124 uint32x4_t data1 = vld1q_u32(w+0);
125 uint32x4_t data2 = vld1q_u32(w+4);
126 uint32x4_t data3 = vld1q_u32(w+8);
127
128 uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
129 uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
130 uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
131 uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
132 uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
133
134 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
135 }
136
137 sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
138 signal(SIGILL, oldHandler);
139 return result;
140# endif
141#else
142 return false;
143#endif // CRYPTOPP_ARM_SHA1_AVAILABLE
144}
145
146bool CPU_ProbeSHA2()
147{
148#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
149 return false;
150#elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
151# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
152 volatile bool result = true;
153 __try
154 {
155 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
156 uint32x4_t data1 = vld1q_u32(w+0);
157 uint32x4_t data2 = vld1q_u32(w+4);
158 uint32x4_t data3 = vld1q_u32(w+8);
159
160 uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
161 uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
162 uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
163 uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
164
165 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
166 }
167 __except (EXCEPTION_EXECUTE_HANDLER)
168 {
169 return false;
170 }
171 return result;
172# else
173
174 // longjmp and clobber warnings. Volatile is required.
175 // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
176 volatile bool result = true;
177
178 volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
179 if (oldHandler == SIG_ERR)
180 return false;
181
182 volatile sigset_t oldMask;
183 if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
184 return false;
185
186 if (setjmp(s_jmpSIGILL))
187 result = false;
188 else
189 {
190 unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
191 uint32x4_t data1 = vld1q_u32(w+0);
192 uint32x4_t data2 = vld1q_u32(w+4);
193 uint32x4_t data3 = vld1q_u32(w+8);
194
195 uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
196 uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
197 uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
198 uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
199
200 result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
201 }
202
203 sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
204 signal(SIGILL, oldHandler);
205 return result;
206# endif
207#else
208 return false;
209#endif // CRYPTOPP_ARM_SHA2_AVAILABLE
210}
211#endif // ARM32 or ARM64
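// A usage sketch (assumed caller names, not part of this file): the library's CPU
// feature detection can OR an OS/hwcaps query with the SIGILL probes above, e.g.
//     g_hasSHA1 = CPU_QuerySHA1() || CPU_ProbeSHA1();
//     g_hasSHA2 = CPU_QuerySHA2() || CPU_ProbeSHA2();
// so the intrinsics are only exercised when the query is unavailable or negative.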
212
213// ***************** Intel x86 SHA ********************
214
215/////////////////////////////////////
216// start of Walton and Gulley code //
217/////////////////////////////////////
218
219#if CRYPTOPP_SHANI_AVAILABLE
220// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
221void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
222{
223 CRYPTOPP_ASSERT(state);
224 CRYPTOPP_ASSERT(data);
225 CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
226
227 __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
228 __m128i MASK, MSG0, MSG1, MSG2, MSG3;
229
230 // Load initial values
231 ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
232 E0 = _mm_set_epi32(state[4], 0, 0, 0);
233 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
234
235 // IA-32 SHA is little endian, SHA::Transform is big endian,
236 // and SHA::HashMultipleBlocks can be either. ByteOrder
237 // allows us to avoid extra endian reversals. It saves 1.0 cpb.
238 MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
239 _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
240 _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
241
242 while (length >= SHA1::BLOCKSIZE)
243 {
244 // Save current hash
245 ABCD_SAVE = ABCD;
246 E0_SAVE = E0;
247
248 // Rounds 0-3
249 MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
250 MSG0 = _mm_shuffle_epi8(MSG0, MASK);
251 E0 = _mm_add_epi32(E0, MSG0);
252 E1 = ABCD;
253 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
254
255 // Rounds 4-7
256 MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
257 MSG1 = _mm_shuffle_epi8(MSG1, MASK);
258 E1 = _mm_sha1nexte_epu32(E1, MSG1);
259 E0 = ABCD;
260 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
261 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
262
263 // Rounds 8-11
264 MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
265 MSG2 = _mm_shuffle_epi8(MSG2, MASK);
266 E0 = _mm_sha1nexte_epu32(E0, MSG2);
267 E1 = ABCD;
268 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
269 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
270 MSG0 = _mm_xor_si128(MSG0, MSG2);
271
272 // Rounds 12-15
273 MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
274 MSG3 = _mm_shuffle_epi8(MSG3, MASK);
275 E1 = _mm_sha1nexte_epu32(E1, MSG3);
276 E0 = ABCD;
277 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
278 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
279 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
280 MSG1 = _mm_xor_si128(MSG1, MSG3);
281
282 // Rounds 16-19
283 E0 = _mm_sha1nexte_epu32(E0, MSG0);
284 E1 = ABCD;
285 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
286 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
287 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
288 MSG2 = _mm_xor_si128(MSG2, MSG0);
289
290 // Rounds 20-23
291 E1 = _mm_sha1nexte_epu32(E1, MSG1);
292 E0 = ABCD;
293 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
294 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
295 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
296 MSG3 = _mm_xor_si128(MSG3, MSG1);
297
298 // Rounds 24-27
299 E0 = _mm_sha1nexte_epu32(E0, MSG2);
300 E1 = ABCD;
301 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
302 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
303 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
304 MSG0 = _mm_xor_si128(MSG0, MSG2);
305
306 // Rounds 28-31
307 E1 = _mm_sha1nexte_epu32(E1, MSG3);
308 E0 = ABCD;
309 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
310 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
311 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
312 MSG1 = _mm_xor_si128(MSG1, MSG3);
313
314 // Rounds 32-35
315 E0 = _mm_sha1nexte_epu32(E0, MSG0);
316 E1 = ABCD;
317 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
318 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
319 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
320 MSG2 = _mm_xor_si128(MSG2, MSG0);
321
322 // Rounds 36-39
323 E1 = _mm_sha1nexte_epu32(E1, MSG1);
324 E0 = ABCD;
325 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
326 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
327 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
328 MSG3 = _mm_xor_si128(MSG3, MSG1);
329
330 // Rounds 40-43
331 E0 = _mm_sha1nexte_epu32(E0, MSG2);
332 E1 = ABCD;
333 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
334 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
335 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
336 MSG0 = _mm_xor_si128(MSG0, MSG2);
337
338 // Rounds 44-47
339 E1 = _mm_sha1nexte_epu32(E1, MSG3);
340 E0 = ABCD;
341 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
342 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
343 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
344 MSG1 = _mm_xor_si128(MSG1, MSG3);
345
346 // Rounds 48-51
347 E0 = _mm_sha1nexte_epu32(E0, MSG0);
348 E1 = ABCD;
349 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
350 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
351 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
352 MSG2 = _mm_xor_si128(MSG2, MSG0);
353
354 // Rounds 52-55
355 E1 = _mm_sha1nexte_epu32(E1, MSG1);
356 E0 = ABCD;
357 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
358 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
359 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
360 MSG3 = _mm_xor_si128(MSG3, MSG1);
361
362 // Rounds 56-59
363 E0 = _mm_sha1nexte_epu32(E0, MSG2);
364 E1 = ABCD;
365 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
366 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
367 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
368 MSG0 = _mm_xor_si128(MSG0, MSG2);
369
370 // Rounds 60-63
371 E1 = _mm_sha1nexte_epu32(E1, MSG3);
372 E0 = ABCD;
373 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
374 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
375 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
376 MSG1 = _mm_xor_si128(MSG1, MSG3);
377
378 // Rounds 64-67
379 E0 = _mm_sha1nexte_epu32(E0, MSG0);
380 E1 = ABCD;
381 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
382 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
383 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
384 MSG2 = _mm_xor_si128(MSG2, MSG0);
385
386 // Rounds 68-71
387 E1 = _mm_sha1nexte_epu32(E1, MSG1);
388 E0 = ABCD;
389 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
390 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
391 MSG3 = _mm_xor_si128(MSG3, MSG1);
392
393 // Rounds 72-75
394 E0 = _mm_sha1nexte_epu32(E0, MSG2);
395 E1 = ABCD;
396 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
397 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
398
399 // Rounds 76-79
400 E1 = _mm_sha1nexte_epu32(E1, MSG3);
401 E0 = ABCD;
402 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
403
404 // Add values back to state
405 E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
406 ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
407
408 data += SHA1::BLOCKSIZE/sizeof(word32);
409 length -= SHA1::BLOCKSIZE;
410 }
411
412 // Save state
413 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
414 _mm_storeu_si128(M128_CAST(state), ABCD);
415 state[4] = _mm_extract_epi32(E0, 3);
416}
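// A calling sketch (hypothetical driver, not part of Crypto++): hash whole 64-byte
// blocks of big-endian message data with the SHA-NI kernel above.
//     word32 state[5] = {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0};
//     SHA1_HashMultipleBlocks_SHANI(state, reinterpret_cast<const word32*>(input),
//                                   blocks*64, BIG_ENDIAN_ORDER);
// Here input and blocks are placeholder names; the caller is responsible for message
// padding and for checking CPU support first (for example HasSHA()).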
417
418// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
419void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
420{
421 CRYPTOPP_ASSERT(state);
422 CRYPTOPP_ASSERT(data);
423 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
424
425 __m128i STATE0, STATE1;
426 __m128i MSG, TMP, MASK;
427 __m128i TMSG0, TMSG1, TMSG2, TMSG3;
428 __m128i ABEF_SAVE, CDGH_SAVE;
429
430 // Load initial values
431 TMP = _mm_loadu_si128(M128_CAST(&state[0]));
432 STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
433
434 // IA-32 SHA is little endian, SHA::Transform is big endian,
435 // and SHA::HashMultipleBlocks can be either. ByteOrder
436 // allows us to avoid extra endian reversals. It saves 1.0 cpb.
437 MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
438 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
439 _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
440
441 TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
442 STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
443 STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
444 STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
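    // Explanatory note: _mm_sha256rnds2_epu32 expects the working variables packed
    // as {A,B,E,F} in one register and {C,D,G,H} in the other, so the ABCD/EFGH
    // words loaded above are rearranged into that layout here and unpacked again
    // after the block loop.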
445
446 while (length >= SHA256::BLOCKSIZE)
447 {
448 // Save current hash
449 ABEF_SAVE = STATE0;
450 CDGH_SAVE = STATE1;
451
452 // Rounds 0-3
453 MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
454 TMSG0 = _mm_shuffle_epi8(MSG, MASK);
455 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
456 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
457 MSG = _mm_shuffle_epi32(MSG, 0x0E);
458 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
459
460 // Rounds 4-7
461 TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
462 TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
463 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
464 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
465 MSG = _mm_shuffle_epi32(MSG, 0x0E);
466 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
467 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
468
469 // Rounds 8-11
470 TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
471 TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
472 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
473 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
474 MSG = _mm_shuffle_epi32(MSG, 0x0E);
475 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
476 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
477
478 // Rounds 12-15
479 TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
480 TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
481 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
482 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
483 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
484 TMSG0 = _mm_add_epi32(TMSG0, TMP);
485 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
486 MSG = _mm_shuffle_epi32(MSG, 0x0E);
487 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
488 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
489
490 // Rounds 16-19
491 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
492 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
493 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
494 TMSG1 = _mm_add_epi32(TMSG1, TMP);
495 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
496 MSG = _mm_shuffle_epi32(MSG, 0x0E);
497 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
498 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
499
500 // Rounds 20-23
501 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
502 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
503 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
504 TMSG2 = _mm_add_epi32(TMSG2, TMP);
505 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
506 MSG = _mm_shuffle_epi32(MSG, 0x0E);
507 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
508 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
509
510 // Rounds 24-27
511 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
512 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
513 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
514 TMSG3 = _mm_add_epi32(TMSG3, TMP);
515 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
516 MSG = _mm_shuffle_epi32(MSG, 0x0E);
517 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
518 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
519
520 // Rounds 28-31
521 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
522 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
523 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
524 TMSG0 = _mm_add_epi32(TMSG0, TMP);
525 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
526 MSG = _mm_shuffle_epi32(MSG, 0x0E);
527 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
528 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
529
530 // Rounds 32-35
531 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
532 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
533 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
534 TMSG1 = _mm_add_epi32(TMSG1, TMP);
535 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
536 MSG = _mm_shuffle_epi32(MSG, 0x0E);
537 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
538 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
539
540 // Rounds 36-39
541 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
542 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
543 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
544 TMSG2 = _mm_add_epi32(TMSG2, TMP);
545 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
546 MSG = _mm_shuffle_epi32(MSG, 0x0E);
547 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
548 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
549
550 // Rounds 40-43
551 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
552 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
553 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
554 TMSG3 = _mm_add_epi32(TMSG3, TMP);
555 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
556 MSG = _mm_shuffle_epi32(MSG, 0x0E);
557 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
558 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
559
560 // Rounds 44-47
561 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
562 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
563 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
564 TMSG0 = _mm_add_epi32(TMSG0, TMP);
565 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
566 MSG = _mm_shuffle_epi32(MSG, 0x0E);
567 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
568 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
569
570 // Rounds 48-51
571 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
572 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
573 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
574 TMSG1 = _mm_add_epi32(TMSG1, TMP);
575 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
576 MSG = _mm_shuffle_epi32(MSG, 0x0E);
577 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
578 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
579
580 // Rounds 52-55
581 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
582 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
583 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
584 TMSG2 = _mm_add_epi32(TMSG2, TMP);
585 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
586 MSG = _mm_shuffle_epi32(MSG, 0x0E);
587 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
588
589 // Rounds 56-59
590 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
591 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
592 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
593 TMSG3 = _mm_add_epi32(TMSG3, TMP);
594 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
595 MSG = _mm_shuffle_epi32(MSG, 0x0E);
596 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
597
598 // Rounds 60-63
599 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
600 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
601 MSG = _mm_shuffle_epi32(MSG, 0x0E);
602 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
603
604 // Add values back to state
605 STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
606 STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
607
608 data += SHA256::BLOCKSIZE/sizeof(word32);
609 length -= SHA256::BLOCKSIZE;
610 }
611
612 TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
613 STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
614 STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
615    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE
616
617 // Save state
618 _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
619 _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
620}
621#endif // CRYPTOPP_SHANI_AVAILABLE
622
623///////////////////////////////////
624// end of Walton and Gulley code //
625///////////////////////////////////
626
627// ***************** ARMV8 SHA ********************
628
629/////////////////////////////////////////////////////////////
630// start of Walton, Schneiders, O'Rourke and Hovsmith code //
631/////////////////////////////////////////////////////////////
632
633#if CRYPTOPP_ARM_SHA1_AVAILABLE
634void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
635{
636 CRYPTOPP_ASSERT(state);
637 CRYPTOPP_ASSERT(data);
638 CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
639
640 uint32x4_t C0, C1, C2, C3;
641 uint32x4_t ABCD, ABCD_SAVED;
642 uint32x4_t MSG0, MSG1, MSG2, MSG3;
643 uint32x4_t TMP0, TMP1;
644 uint32_t E0, E0_SAVED, E1;
645
646 // Load initial values
647 C0 = vdupq_n_u32(0x5A827999);
648 C1 = vdupq_n_u32(0x6ED9EBA1);
649 C2 = vdupq_n_u32(0x8F1BBCDC);
650 C3 = vdupq_n_u32(0xCA62C1D6);
651
652 ABCD = vld1q_u32(&state[0]);
653 E0 = state[4];
654
655 while (length >= SHA1::BLOCKSIZE)
656 {
657 // Save current hash
658 ABCD_SAVED = ABCD;
659 E0_SAVED = E0;
660
661 MSG0 = vld1q_u32(data + 0);
662 MSG1 = vld1q_u32(data + 4);
663 MSG2 = vld1q_u32(data + 8);
664 MSG3 = vld1q_u32(data + 12);
665
666 if (order == BIG_ENDIAN_ORDER) // Data arrangement
667 {
668 MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
669 MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
670 MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
671 MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
672 }
673
674 TMP0 = vaddq_u32(MSG0, C0);
675 TMP1 = vaddq_u32(MSG1, C0);
676
677 // Rounds 0-3
678 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
679 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
680 TMP0 = vaddq_u32(MSG2, C0);
681 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
682
683 // Rounds 4-7
684 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
685 ABCD = vsha1cq_u32(ABCD, E1, TMP1);
686 TMP1 = vaddq_u32(MSG3, C0);
687 MSG0 = vsha1su1q_u32(MSG0, MSG3);
688 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
689
690 // Rounds 8-11
691 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
692 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
693 TMP0 = vaddq_u32(MSG0, C0);
694 MSG1 = vsha1su1q_u32(MSG1, MSG0);
695 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
696
697 // Rounds 12-15
698 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
699 ABCD = vsha1cq_u32(ABCD, E1, TMP1);
700 TMP1 = vaddq_u32(MSG1, C1);
701 MSG2 = vsha1su1q_u32(MSG2, MSG1);
702 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
703
704 // Rounds 16-19
705 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
706 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
707 TMP0 = vaddq_u32(MSG2, C1);
708 MSG3 = vsha1su1q_u32(MSG3, MSG2);
709 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
710
711 // Rounds 20-23
712 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
713 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
714 TMP1 = vaddq_u32(MSG3, C1);
715 MSG0 = vsha1su1q_u32(MSG0, MSG3);
716 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
717
718 // Rounds 24-27
719 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
720 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
721 TMP0 = vaddq_u32(MSG0, C1);
722 MSG1 = vsha1su1q_u32(MSG1, MSG0);
723 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
724
725 // Rounds 28-31
726 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
727 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
728 TMP1 = vaddq_u32(MSG1, C1);
729 MSG2 = vsha1su1q_u32(MSG2, MSG1);
730 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
731
732 // Rounds 32-35
733 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
734 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
735 TMP0 = vaddq_u32(MSG2, C2);
736 MSG3 = vsha1su1q_u32(MSG3, MSG2);
737 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
738
739 // Rounds 36-39
740 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
741 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
742 TMP1 = vaddq_u32(MSG3, C2);
743 MSG0 = vsha1su1q_u32(MSG0, MSG3);
744 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
745
746 // Rounds 40-43
747 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
748 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
749 TMP0 = vaddq_u32(MSG0, C2);
750 MSG1 = vsha1su1q_u32(MSG1, MSG0);
751 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
752
753 // Rounds 44-47
754 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
755 ABCD = vsha1mq_u32(ABCD, E1, TMP1);
756 TMP1 = vaddq_u32(MSG1, C2);
757 MSG2 = vsha1su1q_u32(MSG2, MSG1);
758 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
759
760 // Rounds 48-51
761 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
762 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
763 TMP0 = vaddq_u32(MSG2, C2);
764 MSG3 = vsha1su1q_u32(MSG3, MSG2);
765 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
766
767 // Rounds 52-55
768 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
769 ABCD = vsha1mq_u32(ABCD, E1, TMP1);
770 TMP1 = vaddq_u32(MSG3, C3);
771 MSG0 = vsha1su1q_u32(MSG0, MSG3);
772 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
773
774 // Rounds 56-59
775 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
776 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
777 TMP0 = vaddq_u32(MSG0, C3);
778 MSG1 = vsha1su1q_u32(MSG1, MSG0);
779 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
780
781 // Rounds 60-63
782 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
783 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
784 TMP1 = vaddq_u32(MSG1, C3);
785 MSG2 = vsha1su1q_u32(MSG2, MSG1);
786 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
787
788 // Rounds 64-67
789 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
790 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
791 TMP0 = vaddq_u32(MSG2, C3);
792 MSG3 = vsha1su1q_u32(MSG3, MSG2);
793 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
794
795 // Rounds 68-71
796 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
797 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
798 TMP1 = vaddq_u32(MSG3, C3);
799 MSG0 = vsha1su1q_u32(MSG0, MSG3);
800
801 // Rounds 72-75
802 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
803 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
804
805 // Rounds 76-79
806 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
807 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
808
809 E0 += E0_SAVED;
810 ABCD = vaddq_u32(ABCD_SAVED, ABCD);
811
812 data += SHA1::BLOCKSIZE/sizeof(word32);
813 length -= SHA1::BLOCKSIZE;
814 }
815
816 // Save state
817 vst1q_u32(&state[0], ABCD);
818 state[4] = E0;
819}
820#endif // CRYPTOPP_ARM_SHA1_AVAILABLE
821
822#if CRYPTOPP_ARM_SHA2_AVAILABLE
823void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
824{
825 CRYPTOPP_ASSERT(state);
826 CRYPTOPP_ASSERT(data);
827 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
828
829 uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
830 uint32x4_t MSG0, MSG1, MSG2, MSG3;
831 uint32x4_t TMP0, TMP1, TMP2;
832
833 // Load initial values
834 STATE0 = vld1q_u32(&state[0]);
835 STATE1 = vld1q_u32(&state[4]);
836
837 while (length >= SHA256::BLOCKSIZE)
838 {
839 // Save current hash
840 ABEF_SAVE = STATE0;
841 CDGH_SAVE = STATE1;
842
843 // Load message
844 MSG0 = vld1q_u32(data + 0);
845 MSG1 = vld1q_u32(data + 4);
846 MSG2 = vld1q_u32(data + 8);
847 MSG3 = vld1q_u32(data + 12);
848
849 if (order == BIG_ENDIAN_ORDER) // Data arrangement
850 {
851 MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
852 MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
853 MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
854 MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
855 }
856
857 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
858
859 // Rounds 0-3
860 MSG0 = vsha256su0q_u32(MSG0, MSG1);
861 TMP2 = STATE0;
862 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
863 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
864 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
865        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
866
867 // Rounds 4-7
868 MSG1 = vsha256su0q_u32(MSG1, MSG2);
869 TMP2 = STATE0;
870 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
871 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
872 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
873        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
874
875 // Rounds 8-11
876 MSG2 = vsha256su0q_u32(MSG2, MSG3);
877 TMP2 = STATE0;
878 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
879 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
880 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
881        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
882
883 // Rounds 12-15
884 MSG3 = vsha256su0q_u32(MSG3, MSG0);
885 TMP2 = STATE0;
886 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
887 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
888 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
889        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
890
891 // Rounds 16-19
892 MSG0 = vsha256su0q_u32(MSG0, MSG1);
893 TMP2 = STATE0;
894 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
895 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
896 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
897        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
898
899 // Rounds 20-23
900 MSG1 = vsha256su0q_u32(MSG1, MSG2);
901 TMP2 = STATE0;
902 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
903 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
904 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
905        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
906
907 // Rounds 24-27
908 MSG2 = vsha256su0q_u32(MSG2, MSG3);
909 TMP2 = STATE0;
910 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
911 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
912 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
913        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
914
915 // Rounds 28-31
916 MSG3 = vsha256su0q_u32(MSG3, MSG0);
917 TMP2 = STATE0;
918 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
919 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
920 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
921        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
922
923 // Rounds 32-35
924 MSG0 = vsha256su0q_u32(MSG0, MSG1);
925 TMP2 = STATE0;
926 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
927 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
928 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
929        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
930
931 // Rounds 36-39
932 MSG1 = vsha256su0q_u32(MSG1, MSG2);
933 TMP2 = STATE0;
934 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
935 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
936 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
937        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
938
939 // Rounds 40-43
940 MSG2 = vsha256su0q_u32(MSG2, MSG3);
941 TMP2 = STATE0;
942 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
943 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
944 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
945        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
946
947 // Rounds 44-47
948 MSG3 = vsha256su0q_u32(MSG3, MSG0);
949 TMP2 = STATE0;
950 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
951 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
952 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
953        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
954
955 // Rounds 48-51
956 TMP2 = STATE0;
957 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
958 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
959        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
960
961 // Rounds 52-55
962 TMP2 = STATE0;
963 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
964 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
965        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
966
967 // Rounds 56-59
968 TMP2 = STATE0;
969 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
970 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
971        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
972
973 // Rounds 60-63
974 TMP2 = STATE0;
975 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
976        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
977
978 // Add back to state
979 STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
980 STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
981
982 data += SHA256::BLOCKSIZE/sizeof(word32);
983 length -= SHA256::BLOCKSIZE;
984 }
985
986 // Save state
987 vst1q_u32(&state[0], STATE0);
988 vst1q_u32(&state[4], STATE1);
989}
990#endif // CRYPTOPP_ARM_SHA2_AVAILABLE
991
992///////////////////////////////////////////////////////////
993// end of Walton, Schneiders, O'Rourke and Hovsmith code //
994///////////////////////////////////////////////////////////
995
996// ***************** Power8 SHA ********************
997
998//////////////////////////////////////////////////
999// start Gustavo, Serra, Scalet and Walton code //
1000//////////////////////////////////////////////////
1001
1002#if CRYPTOPP_POWER8_SHA_AVAILABLE
1003
1004// Indexes into the S[] array
1005enum {A=0, B=1, C, D, E, F, G, H};
1006
1007inline
1008uint32x4_p VecLoad32(const word32* data, int offset)
1009{
1010#if (CRYPTOPP_LITTLE_ENDIAN)
1011 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1012 const uint32x4_p val = VecLoad(offset, data);
1013 return (uint32x4_p)VecPermute(val, val, mask);
1014#else
1015 return VecLoad(offset, data);
1016#endif
1017}
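// Explanatory note: on little-endian POWER8 the permute undoes the byte order of
// the load so that each 32-bit lane holds the big-endian SHA-256 message word as
// written; on big-endian targets the plain VecLoad already has that property.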
1018
1019template<class T> inline
1020void VecStore32(const T data, word32 dest[4])
1021{
1022 VecStore(data, dest);
1023}
1024
1025inline
1026uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1027{
1028 // The trick below is due to Andy Polyakov and Jack Lloyd
1029 return vec_sel(z,y,x);
1030}
1031
1032inline
1033uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1034{
1035 // The trick below is due to Andy Polyakov and Jack Lloyd
1036 return vec_sel(y, z, VecXor(x, y));
1037}
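// Explanatory note on the vec_sel trick: Ch(x,y,z) = (x & y) ^ (~x & z) picks bits
// of y where x is 1 and bits of z where x is 0, which is exactly vec_sel(z, y, x).
// Maj(x,y,z) equals x (== y) wherever x and y agree and z wherever they differ,
// which is vec_sel(y, z, x ^ y).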
1038
1039inline
1040uint32x4_p Vector_sigma0(const uint32x4_p val)
1041{
1042 return VecSHA256<0,0>(val);
1043}
1044
1045inline
1046uint32x4_p Vector_sigma1(const uint32x4_p val)
1047{
1048 return VecSHA256<0,0xf>(val);
1049}
1050
1051inline
1052uint32x4_p VectorSigma0(const uint32x4_p val)
1053{
1054 return VecSHA256<1,0>(val);
1055}
1056
1057inline
1058uint32x4_p VectorSigma1(const uint32x4_p val)
1059{
1060 return VecSHA256<1,0xf>(val);
1061}
1062
1063inline
1064uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1065 const uint32x4_p c, const uint32x4_p d)
1066{
1067 const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1068 const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1069 return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1070}
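// Explanatory note: VectorPack gathers the leading 32-bit element of each of a, b,
// c and d into a single vector, so {S[A],S[B],S[C],S[D]} and {S[E],S[F],S[G],S[H]}
// can be folded back into the packed abcd/efgh state after a block is processed.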
1071
1072template <unsigned int R> inline
1073void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1074{
1075 uint32x4_p T1, T2;
1076
1077 W[R] = M;
1078 T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1079 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1080
1081 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1082 S[E] = S[D] + T1;
1083 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1084 S[A] = T1 + T2;
1085}
1086
1087template <unsigned int R> inline
1088void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1089{
1090 // Indexes into the W[] array
1091 enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1092
1093 const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1094 const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1095
1096 uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1097 T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1098 uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1099
1100 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1101 S[E] = S[D] + T1;
1102 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1103 S[A] = T1 + T2;
1104}
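// Explanatory note: SHA256_ROUND2 realizes the message schedule recurrence
//     W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
// on a 16-entry circular buffer: IDX14 plays W[t-2], IDX9 plays W[t-7], IDX1 plays
// W[t-15] and IDX0 plays W[t-16], with all indices reduced modulo 16.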
1105
1106void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1107{
1108 CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1109 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1110 CRYPTOPP_UNUSED(order);
1111
1112 const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1113 const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1114
1115 uint32x4_p abcd = VecLoad(state+0);
1116 uint32x4_p efgh = VecLoad(state+4);
1117 uint32x4_p W[16], S[8], vm, vk;
1118
1119 size_t blocks = length / SHA256::BLOCKSIZE;
1120 while (blocks--)
1121 {
1122 unsigned int offset=0;
1123
1124 S[A] = abcd; S[E] = efgh;
1125 S[B] = VecShiftLeftOctet<4>(S[A]);
1126 S[F] = VecShiftLeftOctet<4>(S[E]);
1127 S[C] = VecShiftLeftOctet<4>(S[B]);
1128 S[G] = VecShiftLeftOctet<4>(S[F]);
1129 S[D] = VecShiftLeftOctet<4>(S[C]);
1130 S[H] = VecShiftLeftOctet<4>(S[G]);
1131
1132        // Rounds 0-15
1133 vk = VecLoad(offset, k);
1134 vm = VecLoad32(m, offset);
1135 SHA256_ROUND1<0>(W,S, vk,vm);
1136 offset+=16;
1137
1138 vk = VecShiftLeftOctet<4>(vk);
1139 vm = VecShiftLeftOctet<4>(vm);
1140 SHA256_ROUND1<1>(W,S, vk,vm);
1141
1142 vk = VecShiftLeftOctet<4>(vk);
1143 vm = VecShiftLeftOctet<4>(vm);
1144 SHA256_ROUND1<2>(W,S, vk,vm);
1145
1146 vk = VecShiftLeftOctet<4>(vk);
1147 vm = VecShiftLeftOctet<4>(vm);
1148 SHA256_ROUND1<3>(W,S, vk,vm);
1149
1150 vk = VecLoad(offset, k);
1151 vm = VecLoad32(m, offset);
1152 SHA256_ROUND1<4>(W,S, vk,vm);
1153 offset+=16;
1154
1155 vk = VecShiftLeftOctet<4>(vk);
1156 vm = VecShiftLeftOctet<4>(vm);
1157 SHA256_ROUND1<5>(W,S, vk,vm);
1158
1159 vk = VecShiftLeftOctet<4>(vk);
1160 vm = VecShiftLeftOctet<4>(vm);
1161 SHA256_ROUND1<6>(W,S, vk,vm);
1162
1163 vk = VecShiftLeftOctet<4>(vk);
1164 vm = VecShiftLeftOctet<4>(vm);
1165 SHA256_ROUND1<7>(W,S, vk,vm);
1166
1167 vk = VecLoad(offset, k);
1168 vm = VecLoad32(m, offset);
1169 SHA256_ROUND1<8>(W,S, vk,vm);
1170 offset+=16;
1171
1172 vk = VecShiftLeftOctet<4>(vk);
1173 vm = VecShiftLeftOctet<4>(vm);
1174 SHA256_ROUND1<9>(W,S, vk,vm);
1175
1176 vk = VecShiftLeftOctet<4>(vk);
1177 vm = VecShiftLeftOctet<4>(vm);
1178 SHA256_ROUND1<10>(W,S, vk,vm);
1179
1180 vk = VecShiftLeftOctet<4>(vk);
1181 vm = VecShiftLeftOctet<4>(vm);
1182 SHA256_ROUND1<11>(W,S, vk,vm);
1183
1184 vk = VecLoad(offset, k);
1185 vm = VecLoad32(m, offset);
1186 SHA256_ROUND1<12>(W,S, vk,vm);
1187 offset+=16;
1188
1189 vk = VecShiftLeftOctet<4>(vk);
1190 vm = VecShiftLeftOctet<4>(vm);
1191 SHA256_ROUND1<13>(W,S, vk,vm);
1192
1193 vk = VecShiftLeftOctet<4>(vk);
1194 vm = VecShiftLeftOctet<4>(vm);
1195 SHA256_ROUND1<14>(W,S, vk,vm);
1196
1197 vk = VecShiftLeftOctet<4>(vk);
1198 vm = VecShiftLeftOctet<4>(vm);
1199 SHA256_ROUND1<15>(W,S, vk,vm);
1200
1201 m += 16; // 32-bit words, not bytes
1202
1203        // Rounds 16-63
1204 for (unsigned int i=16; i<64; i+=16)
1205 {
1206 vk = VecLoad(offset, k);
1207 SHA256_ROUND2<0>(W,S, vk);
1208 SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1209 SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1210 SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1211 offset+=16;
1212
1213 vk = VecLoad(offset, k);
1214 SHA256_ROUND2<4>(W,S, vk);
1215 SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1216 SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1217 SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1218 offset+=16;
1219
1220 vk = VecLoad(offset, k);
1221 SHA256_ROUND2<8>(W,S, vk);
1222 SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1223 SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1224 SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1225 offset+=16;
1226
1227 vk = VecLoad(offset, k);
1228 SHA256_ROUND2<12>(W,S, vk);
1229 SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1230 SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1231 SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1232 offset+=16;
1233 }
1234
1235 abcd += VectorPack(S[A],S[B],S[C],S[D]);
1236 efgh += VectorPack(S[E],S[F],S[G],S[H]);
1237 }
1238
1239 VecStore32(abcd, state+0);
1240 VecStore32(efgh, state+4);
1241}
1242
1243inline
1244void VecStore64(const uint64x2_p val, word64* data)
1245{
1246 VecStore(val, data);
1247}
1248
1249inline
1250uint64x2_p VecLoad64(const word64* data, int offset)
1251{
1252#if (CRYPTOPP_LITTLE_ENDIAN)
1253 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1254 return VecPermute(VecLoad(offset, data), mask);
1255#else
1256 return VecLoad(offset, data);
1257#endif
1258}
1259
1260inline
1261uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1262{
1263 // The trick below is due to Andy Polyakov and Jack Lloyd
1264 return vec_sel(z,y,x);
1265}
1266
1267inline
1268uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1269{
1270 // The trick below is due to Andy Polyakov and Jack Lloyd
1271 return vec_sel(y, z, VecXor(x, y));
1272}
1273
1274inline
1275uint64x2_p Vector_sigma0(const uint64x2_p val)
1276{
1277 return VecSHA512<0,0>(val);
1278}
1279
1280inline
1281uint64x2_p Vector_sigma1(const uint64x2_p val)
1282{
1283 return VecSHA512<0,0xf>(val);
1284}
1285
1286inline
1287uint64x2_p VectorSigma0(const uint64x2_p val)
1288{
1289 return VecSHA512<1,0>(val);
1290}
1291
1292inline
1293uint64x2_p VectorSigma1(const uint64x2_p val)
1294{
1295 return VecSHA512<1,0xf>(val);
1296}
1297
1298inline
1299uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1300{
1301 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1302 return VecPermute(x,y,m);
1303}
1304
1305template <unsigned int R> inline
1306void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1307{
1308 uint64x2_p T1, T2;
1309
1310 W[R] = M;
1311 T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1312 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1313
1314 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1315 S[E] = S[D] + T1;
1316 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1317 S[A] = T1 + T2;
1318}
1319
1320template <unsigned int R> inline
1321void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1322{
1323 // Indexes into the W[] array
1324 enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1325
1326 const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1327 const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1328
1329 uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1330 T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1331 uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1332
1333 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1334 S[E] = S[D] + T1;
1335 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1336 S[A] = T1 + T2;
1337}
1338
1339void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1340{
1341 CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1342 CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1343 CRYPTOPP_UNUSED(order);
1344
1345 const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1346 const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1347
1348 uint64x2_p ab = VecLoad(state+0);
1349 uint64x2_p cd = VecLoad(state+2);
1350 uint64x2_p ef = VecLoad(state+4);
1351 uint64x2_p gh = VecLoad(state+6);
1352 uint64x2_p W[16], S[8], vm, vk;
1353
1354 size_t blocks = length / SHA512::BLOCKSIZE;
1355 while (blocks--)
1356 {
1357 unsigned int offset=0;
1358
1359 S[A] = ab; S[C] = cd;
1360 S[E] = ef; S[G] = gh;
1361 S[B] = VecShiftLeftOctet<8>(S[A]);
1362 S[D] = VecShiftLeftOctet<8>(S[C]);
1363 S[F] = VecShiftLeftOctet<8>(S[E]);
1364 S[H] = VecShiftLeftOctet<8>(S[G]);
1365
1366        // Rounds 0-15
1367 vk = VecLoad(offset, k);
1368 vm = VecLoad64(m, offset);
1369 SHA512_ROUND1<0>(W,S, vk,vm);
1370 offset+=16;
1371
1372 vk = VecShiftLeftOctet<8>(vk);
1373 vm = VecShiftLeftOctet<8>(vm);
1374 SHA512_ROUND1<1>(W,S, vk,vm);
1375
1376 vk = VecLoad(offset, k);
1377 vm = VecLoad64(m, offset);
1378 SHA512_ROUND1<2>(W,S, vk,vm);
1379 offset+=16;
1380
1381 vk = VecShiftLeftOctet<8>(vk);
1382 vm = VecShiftLeftOctet<8>(vm);
1383 SHA512_ROUND1<3>(W,S, vk,vm);
1384
1385 vk = VecLoad(offset, k);
1386 vm = VecLoad64(m, offset);
1387 SHA512_ROUND1<4>(W,S, vk,vm);
1388 offset+=16;
1389
1390 vk = VecShiftLeftOctet<8>(vk);
1391 vm = VecShiftLeftOctet<8>(vm);
1392 SHA512_ROUND1<5>(W,S, vk,vm);
1393
1394 vk = VecLoad(offset, k);
1395 vm = VecLoad64(m, offset);
1396 SHA512_ROUND1<6>(W,S, vk,vm);
1397 offset+=16;
1398
1399 vk = VecShiftLeftOctet<8>(vk);
1400 vm = VecShiftLeftOctet<8>(vm);
1401 SHA512_ROUND1<7>(W,S, vk,vm);
1402
1403 vk = VecLoad(offset, k);
1404 vm = VecLoad64(m, offset);
1405 SHA512_ROUND1<8>(W,S, vk,vm);
1406 offset+=16;
1407
1408 vk = VecShiftLeftOctet<8>(vk);
1409 vm = VecShiftLeftOctet<8>(vm);
1410 SHA512_ROUND1<9>(W,S, vk,vm);
1411
1412 vk = VecLoad(offset, k);
1413 vm = VecLoad64(m, offset);
1414 SHA512_ROUND1<10>(W,S, vk,vm);
1415 offset+=16;
1416
1417 vk = VecShiftLeftOctet<8>(vk);
1418 vm = VecShiftLeftOctet<8>(vm);
1419 SHA512_ROUND1<11>(W,S, vk,vm);
1420
1421 vk = VecLoad(offset, k);
1422 vm = VecLoad64(m, offset);
1423 SHA512_ROUND1<12>(W,S, vk,vm);
1424 offset+=16;
1425
1426 vk = VecShiftLeftOctet<8>(vk);
1427 vm = VecShiftLeftOctet<8>(vm);
1428 SHA512_ROUND1<13>(W,S, vk,vm);
1429
1430 vk = VecLoad(offset, k);
1431 vm = VecLoad64(m, offset);
1432 SHA512_ROUND1<14>(W,S, vk,vm);
1433 offset+=16;
1434
1435 vk = VecShiftLeftOctet<8>(vk);
1436 vm = VecShiftLeftOctet<8>(vm);
1437 SHA512_ROUND1<15>(W,S, vk,vm);
1438
1439 m += 16; // 64-bit words, not bytes
1440
1441        // Rounds 16-79
1442 for (unsigned int i=16; i<80; i+=16)
1443 {
1444 vk = VecLoad(offset, k);
1445 SHA512_ROUND2<0>(W,S, vk);
1446 SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1447 offset+=16;
1448
1449 vk = VecLoad(offset, k);
1450 SHA512_ROUND2<2>(W,S, vk);
1451 SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1452 offset+=16;
1453
1454 vk = VecLoad(offset, k);
1455 SHA512_ROUND2<4>(W,S, vk);
1456 SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1457 offset+=16;
1458
1459 vk = VecLoad(offset, k);
1460 SHA512_ROUND2<6>(W,S, vk);
1461 SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1462 offset+=16;
1463
1464 vk = VecLoad(offset, k);
1465 SHA512_ROUND2<8>(W,S, vk);
1466 SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1467 offset+=16;
1468
1469 vk = VecLoad(offset, k);
1470 SHA512_ROUND2<10>(W,S, vk);
1471 SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1472 offset+=16;
1473
1474 vk = VecLoad(offset, k);
1475 SHA512_ROUND2<12>(W,S, vk);
1476 SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1477 offset+=16;
1478
1479 vk = VecLoad(offset, k);
1480 SHA512_ROUND2<14>(W,S, vk);
1481 SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1482 offset+=16;
1483 }
1484
1485 ab += VectorPack(S[A],S[B]);
1486 cd += VectorPack(S[C],S[D]);
1487 ef += VectorPack(S[E],S[F]);
1488 gh += VectorPack(S[G],S[H]);
1489 }
1490
1491 VecStore64(ab, state+0);
1492 VecStore64(cd, state+2);
1493 VecStore64(ef, state+4);
1494 VecStore64(gh, state+6);
1495}
1496
1497#endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1498
1499////////////////////////////////////////////////
1500// end Gustavo, Serra, Scalet and Walton code //
1501////////////////////////////////////////////////
1502
1503NAMESPACE_END