cham_simd.cpp
1// cham_simd.cpp - written and placed in the public domain by Jeffrey Walton
2//
3// This source file uses intrinsics and built-ins to gain access to
4// SSSE3 instructions. A separate source file is needed because additional
5// CXXFLAGS are required to enable the appropriate instruction sets in some
6// build configurations.
7
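// For example, with GCC or Clang on x86 this translation unit is typically
// built with an extra "-mssse3" so the intrinsics below compile; the exact
// flags depend on the build system (illustrative command only):
//
//   g++ -DNDEBUG -O2 -mssse3 -c cham_simd.cpp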
8#include "pch.h"
9#include "config.h"
10
11#include "cham.h"
12#include "misc.h"
13
14// Uncomment for benchmarking C++ against SSE or NEON.
15// Do so in both cham.cpp and cham_simd.cpp.
16// #undef CRYPTOPP_SSSE3_AVAILABLE
17// #undef CRYPTOPP_ARM_NEON_AVAILABLE
18
19#if (CRYPTOPP_SSSE3_AVAILABLE)
20#include "adv_simd.h"
21# include <pmmintrin.h>
22# include <tmmintrin.h>
23#endif
24
25#if defined(__XOP__)
26# include <ammintrin.h>
27#endif
28
29#if defined(__AVX512F__)
30# define CRYPTOPP_AVX512_ROTATE 1
31# include <immintrin.h>
32#endif
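// Note: the 128-bit rotate intrinsics used under CRYPTOPP_AVX512_ROTATE
// (_mm_rol_epi32/_mm_ror_epi32) are documented as AVX512F+AVX512VL forms,
// so a toolchain that defines only __AVX512F__ may still reject them.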
33
34// Squash MS LNK4221 and libtool warnings
35extern const char CHAM_SIMD_FNAME[] = __FILE__;
36
37ANONYMOUS_NAMESPACE_BEGIN
38
39using CryptoPP::word16;
40using CryptoPP::word32;
41
42#if (CRYPTOPP_SSSE3_AVAILABLE)
43
44//////////////////////////////////////////////////////////////////////////
45
46NAMESPACE_BEGIN(W16) // CHAM64, 16-bit word size
47
48template <unsigned int R>
49inline __m128i RotateLeft16(const __m128i& val)
50{
51#if defined(__XOP__)
52 return _mm_roti_epi16(val, R);
53#else
54 return _mm_or_si128(
55 _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
56#endif
57}
58
59template <unsigned int R>
60inline __m128i RotateRight16(const __m128i& val)
61{
62#if defined(__XOP__)
63 return _mm_roti_epi16(val, 16-R);
64#else
65 return _mm_or_si128(
66 _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
67#endif
68}
69
70template <>
71inline __m128i RotateLeft16<8>(const __m128i& val)
72{
73#if defined(__XOP__)
74 return _mm_roti_epi16(val, 8);
75#else
76 const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
77 return _mm_shuffle_epi8(val, mask);
78#endif
79}
80
81template <>
82inline __m128i RotateRight16<8>(const __m128i& val)
83{
84#if defined(__XOP__)
85 return _mm_roti_epi16(val, 16-8);
86#else
87 const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
88 return _mm_shuffle_epi8(val, mask);
89#endif
90}
91
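// A plain scalar picture of the vector rotates above (illustrative sketch
// only; RotateLeft16_Ref is a local helper, not a library API). Rotating a
// 16-bit word by 8 swaps its two bytes, which is why the <8> specializations
// can use a single _mm_shuffle_epi8 instead of two shifts and an OR.
inline word16 RotateLeft16_Ref(word16 v, unsigned int r)
{
    // r is expected to be in [1,15]; e.g. RotateLeft16_Ref(0xABCD, 8) == 0xCDAB.
    return static_cast<word16>((v << r) | (v >> (16u - r)));
}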
92template <unsigned int IDX>
93inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
94 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
95{
96 // Should not be instantiated
97 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
98 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
99 CRYPTOPP_UNUSED(e); CRYPTOPP_UNUSED(f);
100 CRYPTOPP_UNUSED(g); CRYPTOPP_UNUSED(h);
101 CRYPTOPP_ASSERT(0);
102 return _mm_setzero_si128();
103}
104
105template <>
106inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
107 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
108{
109 // The shuffle converts to and from little-endian for SSE. A specialized
110 // CHAM implementation can avoid the shuffle by framing the data for
111 // encryption, decryption and benchmarks. The library cannot take the
112 // speed-up because of the byte oriented API.
113 const __m128i r1 = _mm_unpacklo_epi16(a, b);
114 const __m128i r2 = _mm_unpacklo_epi16(c, d);
115 const __m128i r3 = _mm_unpacklo_epi16(e, f);
116 const __m128i r4 = _mm_unpacklo_epi16(g, h);
117
118 const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
119 const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
120 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
121 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
122}
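// In other words, UnpackXMM<IDX> gathers 16-bit word IDX from each of the
// eight inputs a..h into one vector and byte-swaps every word, i.e. roughly
// result[j] = ByteReverse(word IDX of input j) for j = 0..7. The <1>..<7>
// specializations below differ only in which unpack halves they combine.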
123
124template <>
125inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
126 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
127{
128 // The shuffle converts to and from little-endian for SSE. A specialized
129 // CHAM implementation can avoid the shuffle by framing the data for
130 // encryption, decryption and benchmarks. The library cannot take the
131 // speed-up because of the byte oriented API.
132 const __m128i r1 = _mm_unpacklo_epi16(a, b);
133 const __m128i r2 = _mm_unpacklo_epi16(c, d);
134 const __m128i r3 = _mm_unpacklo_epi16(e, f);
135 const __m128i r4 = _mm_unpacklo_epi16(g, h);
136
137 const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
138 const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
139 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
140 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
141}
142
143template <>
144inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
145 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
146{
147 // The shuffle converts to and from little-endian for SSE. A specialized
148 // CHAM implementation can avoid the shuffle by framing the data for
149 // encryption, decryption and benchmarks. The library cannot take the
150 // speed-up because of the byte oriented API.
151 const __m128i r1 = _mm_unpacklo_epi16(a, b);
152 const __m128i r2 = _mm_unpacklo_epi16(c, d);
153 const __m128i r3 = _mm_unpacklo_epi16(e, f);
154 const __m128i r4 = _mm_unpacklo_epi16(g, h);
155
156 const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
157 const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
158 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
159 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
160}
161
162template <>
163inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
164 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
165{
166 // The shuffle converts to and from little-endian for SSE. A specialized
167 // CHAM implementation can avoid the shuffle by framing the data for
168 // encryption, decryption and benchmarks. The library cannot take the
169 // speed-up because of the byte oriented API.
170 const __m128i r1 = _mm_unpacklo_epi16(a, b);
171 const __m128i r2 = _mm_unpacklo_epi16(c, d);
172 const __m128i r3 = _mm_unpacklo_epi16(e, f);
173 const __m128i r4 = _mm_unpacklo_epi16(g, h);
174
175 const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
176 const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
177 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
178 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
179}
180
181template <>
182inline __m128i UnpackXMM<4>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
183 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
184{
185 // The shuffle converts to and from little-endian for SSE. A specialized
186 // CHAM implementation can avoid the shuffle by framing the data for
187 // encryption, decryption and benchmarks. The library cannot take the
188 // speed-up because of the byte oriented API.
189 const __m128i r1 = _mm_unpackhi_epi16(a, b);
190 const __m128i r2 = _mm_unpackhi_epi16(c, d);
191 const __m128i r3 = _mm_unpackhi_epi16(e, f);
192 const __m128i r4 = _mm_unpackhi_epi16(g, h);
193
194 const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
195 const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
196 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
197 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
198}
199
200template <>
201inline __m128i UnpackXMM<5>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
202 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
203{
204 // The shuffle converts to and from little-endian for SSE. A specialized
205 // CHAM implementation can avoid the shuffle by framing the data for
206 // encryption, decryption and benchmarks. The library cannot take the
207 // speed-up because of the byte oriented API.
208 const __m128i r1 = _mm_unpackhi_epi16(a, b);
209 const __m128i r2 = _mm_unpackhi_epi16(c, d);
210 const __m128i r3 = _mm_unpackhi_epi16(e, f);
211 const __m128i r4 = _mm_unpackhi_epi16(g, h);
212
213 const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
214 const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
215 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
216 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
217}
218
219template <>
220inline __m128i UnpackXMM<6>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
221 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
222{
223 // The shuffle converts to and from little-endian for SSE. A specialized
224 // CHAM implementation can avoid the shuffle by framing the data for
225 // encryption, decryption and benchmarks. The library cannot take the
226 // speed-up because of the byte oriented API.
227 const __m128i r1 = _mm_unpackhi_epi16(a, b);
228 const __m128i r2 = _mm_unpackhi_epi16(c, d);
229 const __m128i r3 = _mm_unpackhi_epi16(e, f);
230 const __m128i r4 = _mm_unpackhi_epi16(g, h);
231
232 const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
233 const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
234 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
235 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
236}
237
238template <>
239inline __m128i UnpackXMM<7>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
240 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
241{
242 // The shuffle converts to and from little-endian for SSE. A specialized
243 // CHAM implementation can avoid the shuffle by framing the data for
244 // encryption, decryption and benchmarks. The library cannot take the
245 // speed-up because of the byte oriented API.
246 const __m128i r1 = _mm_unpackhi_epi16(a, b);
247 const __m128i r2 = _mm_unpackhi_epi16(c, d);
248 const __m128i r3 = _mm_unpackhi_epi16(e, f);
249 const __m128i r4 = _mm_unpackhi_epi16(g, h);
250
251 const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
252 const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
253 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
254 _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
255}
256
257template <unsigned int IDX>
258inline __m128i UnpackXMM(const __m128i& v)
259{
260 // Should not be instantiated
261 CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
262
263 return _mm_setzero_si128();
264}
265
266template <>
267inline __m128i UnpackXMM<0>(const __m128i& v)
268{
269 return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
270}
271
272template <>
273inline __m128i UnpackXMM<1>(const __m128i& v)
274{
275 return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
276}
277
278template <>
279inline __m128i UnpackXMM<2>(const __m128i& v)
280{
281 return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
282}
283
284template <>
285inline __m128i UnpackXMM<3>(const __m128i& v)
286{
287 return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
288}
289
290template <>
291inline __m128i UnpackXMM<4>(const __m128i& v)
292{
293 return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
294}
295
296template <>
297inline __m128i UnpackXMM<5>(const __m128i& v)
298{
299 return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
300}
301
302template <>
303inline __m128i UnpackXMM<6>(const __m128i& v)
304{
305 return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
306}
307
308template <>
309inline __m128i UnpackXMM<7>(const __m128i& v)
310{
311 return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
312}
313
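// The single-argument overloads above broadcast 16-bit word IDX of one block
// (with a byte swap) into every lane, so the same round code can be reused
// when only a single block is being processed.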
314template <unsigned int IDX>
315inline __m128i UnpackXMM(const __m128i& a, const __m128i& b)
316{
317 const __m128i& z = _mm_setzero_si128();
318 return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
319}
320
321template <unsigned int IDX>
322inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d,
323 const __m128i& e, const __m128i& f, const __m128i& g, const __m128i& h)
324{
325 return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
326}
327
328template <unsigned int IDX>
329inline __m128i RepackXMM(const __m128i& v)
330{
331 return UnpackXMM<IDX>(v);
332}
333
334inline void CHAM64_Enc_Block(__m128i &block0,
335 const word16 *subkeys, unsigned int /*rounds*/)
336{
337 // Rearrange the data for vectorization. UnpackXMM includes a
338 // little-endian swap for SSE. Thanks to Peter Cordes for help
339 // with packing and unpacking.
340 // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
341 __m128i a = UnpackXMM<0>(block0);
342 __m128i b = UnpackXMM<1>(block0);
343 __m128i c = UnpackXMM<2>(block0);
344 __m128i d = UnpackXMM<3>(block0);
345 __m128i e = UnpackXMM<4>(block0);
346 __m128i f = UnpackXMM<5>(block0);
347 __m128i g = UnpackXMM<6>(block0);
348 __m128i h = UnpackXMM<7>(block0);
349
350 const unsigned int rounds = 80;
351 __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
352 __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
353
354 const unsigned int MASK = 15;
355 for (int i=0; i<static_cast<int>(rounds); i+=4)
356 {
357 __m128i k, kr, t1, t2, t3, t4;
358 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
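        // The _mm_load_sd/_mm_castpd_si128 pair is just an unaligned 64-bit
        // load; it pulls four consecutive word16 subkeys into the low half
        // of an XMM register, with no floating point math involved.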
359
360 // Shuffle out key
361 kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
362
363 t1 = _mm_xor_si128(a, counter);
364 t3 = _mm_xor_si128(e, counter);
365 t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
366 t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
367 a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
368 e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
369
370 counter = _mm_add_epi16(counter, increment);
371 kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
372
373 t1 = _mm_xor_si128(b, counter);
374 t3 = _mm_xor_si128(f, counter);
375 t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
376 t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
377 b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
378 f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
379
380 counter = _mm_add_epi16(counter, increment);
381 kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
382
383 t1 = _mm_xor_si128(c, counter);
384 t3 = _mm_xor_si128(g, counter);
385 t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
386 t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
387 c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
388 g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
389
390 counter = _mm_add_epi16(counter, increment);
391 kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
392
393 t1 = _mm_xor_si128(d, counter);
394 t3 = _mm_xor_si128(h, counter);
395 t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
396 t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
397 d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
398 h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
399
400 counter = _mm_add_epi16(counter, increment);
401 }
402
403 // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
404 block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
405}
406
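// For orientation, one scalar CHAM-64 round looks roughly like the sketch
// below (illustrative only -- the library's real scalar rounds live in
// cham.cpp; CHAM64_Round_Ref is a local helper that reuses the
// RotateLeft16_Ref sketch above, not a library API). The vector code above
// applies exactly this update to every lane at once, alternating the
// <1>/<8> rotation pattern between even and odd rounds.
inline word16 CHAM64_Round_Ref(word16 x0, word16 x1, word16 rk, word16 round)
{
    if (round % 2 == 0)   // even round: x0 ^ i, ROL1(x1) ^ rk, then ROL8
        return RotateLeft16_Ref(static_cast<word16>((x0 ^ round) + (RotateLeft16_Ref(x1, 1) ^ rk)), 8);
    else                  // odd round:  x0 ^ i, ROL8(x1) ^ rk, then ROL1
        return RotateLeft16_Ref(static_cast<word16>((x0 ^ round) + (RotateLeft16_Ref(x1, 8) ^ rk)), 1);
}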
407inline void CHAM64_Dec_Block(__m128i &block0,
408 const word16 *subkeys, unsigned int /*rounds*/)
409{
410 // Rearrange the data for vectorization. UnpackXMM includes a
411 // little-endian swap for SSE. Thanks to Peter Cordes for help
412 // with packing and unpacking.
413 // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
414 __m128i a = UnpackXMM<0>(block0);
415 __m128i b = UnpackXMM<1>(block0);
416 __m128i c = UnpackXMM<2>(block0);
417 __m128i d = UnpackXMM<3>(block0);
418 __m128i e = UnpackXMM<4>(block0);
419 __m128i f = UnpackXMM<5>(block0);
420 __m128i g = UnpackXMM<6>(block0);
421 __m128i h = UnpackXMM<7>(block0);
422
423 const unsigned int rounds = 80;
424 __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
425 __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
426
427 const unsigned int MASK = 15;
428 for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
429 {
430 __m128i k, kr, t1, t2, t3, t4;
431 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
432
433 // Shuffle out key
434 kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
435
436 // Odd round
437 t1 = RotateRight16<1>(d);
438 t3 = RotateRight16<1>(h);
439 t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
440 t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
441 d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
442 h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
443
444 counter = _mm_sub_epi16(counter, decrement);
445 kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
446
447 // Even round
448 t1 = RotateRight16<8>(c);
449 t3 = RotateRight16<8>(g);
450 t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
451 t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
452 c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
453 g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
454
455 counter = _mm_sub_epi16(counter, decrement);
456 kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
457
458 // Odd round
459 t1 = RotateRight16<1>(b);
460 t3 = RotateRight16<1>(f);
461 t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
462 t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
463 b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
464 f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
465
466 counter = _mm_sub_epi16(counter, decrement);
467 kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
468
469 // Even round
470 t1 = RotateRight16<8>(a);
471 t3 = RotateRight16<8>(e);
472 t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
473 t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
474 a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
475 e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
476
477 counter = _mm_sub_epi16(counter, decrement);
478 }
479
480 // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
481 block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
482}
483
484inline void CHAM64_Enc_2_Blocks(__m128i &block0,
485 __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
486{
487 // Rearrange the data for vectorization. UnpackXMM includes a
488 // little-endian swap for SSE. Thanks to Peter Cordes for help
489 // with packing and unpacking.
490 // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
491 __m128i a = UnpackXMM<0>(block0, block1);
492 __m128i b = UnpackXMM<1>(block0, block1);
493 __m128i c = UnpackXMM<2>(block0, block1);
494 __m128i d = UnpackXMM<3>(block0, block1);
495 __m128i e = UnpackXMM<4>(block0, block1);
496 __m128i f = UnpackXMM<5>(block0, block1);
497 __m128i g = UnpackXMM<6>(block0, block1);
498 __m128i h = UnpackXMM<7>(block0, block1);
499
500 const unsigned int rounds = 80;
501 __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
502 __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
503
504 const unsigned int MASK = 15;
505 for (int i=0; i<static_cast<int>(rounds); i+=4)
506 {
507 __m128i k, kr, t1, t2, t3, t4;
508 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[i & MASK])));
509
510 // Shuffle out key
511 kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
512
513 t1 = _mm_xor_si128(a, counter);
514 t3 = _mm_xor_si128(e, counter);
515 t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
516 t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
517 a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
518 e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
519
520 counter = _mm_add_epi16(counter, increment);
521 kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
522
523 t1 = _mm_xor_si128(b, counter);
524 t3 = _mm_xor_si128(f, counter);
525 t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
526 t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
527 b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
528 f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
529
530 counter = _mm_add_epi16(counter, increment);
531 kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
532
533 t1 = _mm_xor_si128(c, counter);
534 t3 = _mm_xor_si128(g, counter);
535 t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
536 t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
537 c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
538 g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
539
540 counter = _mm_add_epi16(counter, increment);
541 kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
542
543 t1 = _mm_xor_si128(d, counter);
544 t3 = _mm_xor_si128(h, counter);
545 t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
546 t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
547 d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
548 h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
549
550 counter = _mm_add_epi16(counter, increment);
551 }
552
553 // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
554 block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
555 block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
556}
557
558inline void CHAM64_Dec_2_Blocks(__m128i &block0,
559 __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
560{
561 // Rearrange the data for vectorization. UnpackXMM includes a
562 // little-endian swap for SSE. Thanks to Peter Cordes for help
563 // with packing and unpacking.
564 // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
565 __m128i a = UnpackXMM<0>(block0, block1);
566 __m128i b = UnpackXMM<1>(block0, block1);
567 __m128i c = UnpackXMM<2>(block0, block1);
568 __m128i d = UnpackXMM<3>(block0, block1);
569 __m128i e = UnpackXMM<4>(block0, block1);
570 __m128i f = UnpackXMM<5>(block0, block1);
571 __m128i g = UnpackXMM<6>(block0, block1);
572 __m128i h = UnpackXMM<7>(block0, block1);
573
574 const unsigned int rounds = 80;
575 __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
576 __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
577
578 const unsigned int MASK = 15;
579 for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
580 {
581 __m128i k, kr, t1, t2, t3, t4;
582 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
583
584 // Shuffle out key
585 kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
586
587 // Odd round
588 t1 = RotateRight16<1>(d);
589 t3 = RotateRight16<1>(h);
590 t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
591 t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
592 d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
593 h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
594
595 counter = _mm_sub_epi16(counter, decrement);
596 kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
597
598 // Even round
599 t1 = RotateRight16<8>(c);
600 t3 = RotateRight16<8>(g);
601 t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
602 t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
603 c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
604 g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
605
606 counter = _mm_sub_epi16(counter, decrement);
607 kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
608
609 // Odd round
610 t1 = RotateRight16<1>(b);
611 t3 = RotateRight16<1>(f);
612 t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
613 t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
614 b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
615 f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
616
617 counter = _mm_sub_epi16(counter, decrement);
618 kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
619
620 // Even round
621 t1 = RotateRight16<8>(a);
622 t3 = RotateRight16<8>(e);
623 t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
624 t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
625 a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
626 e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
627
628 counter = _mm_sub_epi16(counter, decrement);
629 }
630
631 // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
632 block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
633 block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
634}
635
636NAMESPACE_END // W16
637
638//////////////////////////////////////////////////////////////////////////
639
640NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
641
642template <unsigned int R>
643inline __m128i RotateLeft32(const __m128i& val)
644{
645#if defined(CRYPTOPP_AVX512_ROTATE)
646 return _mm_rol_epi32(val, R);
647#elif defined(__XOP__)
648 return _mm_roti_epi32(val, R);
649#else
650 return _mm_or_si128(
651 _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
652#endif
653}
654
655template <unsigned int R>
656inline __m128i RotateRight32(const __m128i& val)
657{
658#if defined(CRYPTOPP_AVX512_ROTATE)
659 return _mm_ror_epi32(val, R);
660#elif defined(__XOP__)
661 return _mm_roti_epi32(val, 32-R);
662#else
663 return _mm_or_si128(
664 _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
665#endif
666}
667
668// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
669template <>
670inline __m128i RotateLeft32<8>(const __m128i& val)
671{
672#if defined(__XOP__)
673 return _mm_roti_epi32(val, 8);
674#else
675 const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
676 return _mm_shuffle_epi8(val, mask);
677#endif
678}
679
680// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
681template <>
682inline __m128i RotateRight32<8>(const __m128i& val)
683{
684#if defined(__XOP__)
685 return _mm_roti_epi32(val, 32-8);
686#else
687 const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
688 return _mm_shuffle_epi8(val, mask);
689#endif
690}
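// The two masks above simply move each byte one position within its 32-bit
// lane (left or right, with wrap-around), which is exactly a rotate by 8;
// hence a single pshufb replaces the shift/shift/or sequence.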
691
692template <unsigned int IDX>
693inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
694{
695 // Should not be instantiated
696 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
697 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
698 CRYPTOPP_ASSERT(0);
699 return _mm_setzero_si128();
700}
701
702template <>
703inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
704{
705 // The shuffle converts to and from little-endian for SSE. A specialized
706 // CHAM implementation can avoid the shuffle by framing the data for
707 // encryption, decryption and benchmarks. The library cannot take the
708 // speed-up because of the byte oriented API.
709 const __m128i r1 = _mm_unpacklo_epi32(a, b);
710 const __m128i r2 = _mm_unpacklo_epi32(c, d);
711 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
712 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
713}
714
715template <>
716inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
717{
718 // The shuffle converts to and from little-endian for SSE. A specialized
719 // CHAM implementation can avoid the shuffle by framing the data for
720 // encryption, decryption and benchmarks. The library cannot take the
721 // speed-up because of the byte oriented API.
722 const __m128i r1 = _mm_unpacklo_epi32(a, b);
723 const __m128i r2 = _mm_unpacklo_epi32(c, d);
724 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
725 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
726}
727
728template <>
729inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
730{
731 // The shuffle converts to and from little-endian for SSE. A specialized
732 // CHAM implementation can avoid the shuffle by framing the data for
733 // encryption, decryption and benchmarks. The library cannot take the
734 // speed-up because of the byte oriented API.
735 const __m128i r1 = _mm_unpackhi_epi32(a, b);
736 const __m128i r2 = _mm_unpackhi_epi32(c, d);
737 return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
738 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
739}
740
741template <>
742inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
743{
744 // The shuffle converts to and from little-endian for SSE. A specialized
745 // CHAM implementation can avoid the shuffle by framing the data for
746 // encryption, decryption and benchmarks. The library cannot take the
747 // speed-up because of the byte oriented API.
748 const __m128i r1 = _mm_unpackhi_epi32(a, b);
749 const __m128i r2 = _mm_unpackhi_epi32(c, d);
750 return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
751 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
752}
753
754template <unsigned int IDX>
755inline __m128i UnpackXMM(const __m128i& v)
756{
757 // Should not be instantiated
758 CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
759 return _mm_setzero_si128();
760}
761
762template <>
763inline __m128i UnpackXMM<0>(const __m128i& v)
764{
765 return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
766}
767
768template <>
769inline __m128i UnpackXMM<1>(const __m128i& v)
770{
771 return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
772}
773
774template <>
775inline __m128i UnpackXMM<2>(const __m128i& v)
776{
777 return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
778}
779
780template <>
781inline __m128i UnpackXMM<3>(const __m128i& v)
782{
783 return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
784}
785
786template <unsigned int IDX>
787inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
788{
789 return UnpackXMM<IDX>(a, b, c, d);
790}
791
792template <unsigned int IDX>
793inline __m128i RepackXMM(const __m128i& v)
794{
795 return UnpackXMM<IDX>(v);
796}
797
798inline void CHAM128_Enc_Block(__m128i &block0,
799 const word32 *subkeys, unsigned int rounds)
800{
801 // Rearrange the data for vectorization. UnpackXMM includes a
802 // little-endian swap for SSE. Thanks to Peter Cordes for help
803 // with packing and unpacking.
804 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
805 __m128i a = UnpackXMM<0>(block0);
806 __m128i b = UnpackXMM<1>(block0);
807 __m128i c = UnpackXMM<2>(block0);
808 __m128i d = UnpackXMM<3>(block0);
809
810 __m128i counter = _mm_set_epi32(0,0,0,0);
811 __m128i increment = _mm_set_epi32(1,1,1,1);
812
813 const unsigned int MASK = (rounds == 80 ? 7 : 15);
814 for (int i=0; i<static_cast<int>(rounds); i+=4)
815 {
816 __m128i k, k1, k2, t1, t2;
817 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
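        // As in the 16-bit code, this is just an unaligned 64-bit load: it
        // brings two consecutive word32 subkeys into the low half of an XMM
        // register before they are splatted with _mm_shuffle_epi8.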
818
819 // Shuffle out two subkeys
820 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
821 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
822
823 t1 = _mm_xor_si128(a, counter);
824 t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
825 a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
826
827 counter = _mm_add_epi32(counter, increment);
828
829 t1 = _mm_xor_si128(b, counter);
830 t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
831 b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
832
833 counter = _mm_add_epi32(counter, increment);
834
835 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));
836
837 // Shuffle out two subkeys
838 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
839 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
840
841 t1 = _mm_xor_si128(c, counter);
842 t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
843 c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
844
845 counter = _mm_add_epi32(counter, increment);
846
847 t1 = _mm_xor_si128(d, counter);
848 t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
849 d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
850
851 counter = _mm_add_epi32(counter, increment);
852 }
853
854 // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
855 block0 = RepackXMM<0>(a,b,c,d);
856}
857
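// The corresponding scalar CHAM-128 round, for reference (illustrative
// sketch only; RotL32_Ref and CHAM128_Round_Ref are local helpers, not
// library APIs). The vector loop above performs this update lane-wise,
// consuming two word32 subkeys per 64-bit key load.
inline word32 RotL32_Ref(word32 v, unsigned int r)
{
    // r is expected to be in [1,31].
    return static_cast<word32>((v << r) | (v >> (32u - r)));
}
inline word32 CHAM128_Round_Ref(word32 x0, word32 x1, word32 rk, word32 round)
{
    if (round % 2 == 0)   // even round: ROL1 on x1, ROL8 on the sum
        return RotL32_Ref((x0 ^ round) + (RotL32_Ref(x1, 1) ^ rk), 8);
    else                  // odd round:  ROL8 on x1, ROL1 on the sum
        return RotL32_Ref((x0 ^ round) + (RotL32_Ref(x1, 8) ^ rk), 1);
}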
858inline void CHAM128_Dec_Block(__m128i &block0,
859 const word32 *subkeys, unsigned int rounds)
860{
861 // Rearrange the data for vectorization. UnpackXMM includes a
862 // little-endian swap for SSE. Thanks to Peter Cordes for help
863 // with packing and unpacking.
864 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
865 __m128i a = UnpackXMM<0>(block0);
866 __m128i b = UnpackXMM<1>(block0);
867 __m128i c = UnpackXMM<2>(block0);
868 __m128i d = UnpackXMM<3>(block0);
869
870 __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
871 __m128i decrement = _mm_set_epi32(1,1,1,1);
872
873 const unsigned int MASK = (rounds == 80 ? 7 : 15);
874 for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
875 {
876 __m128i k, k1, k2, t1, t2;
877 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));
878
879 // Shuffle out two subkeys
880 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
881 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
882
883 // Odd round
884 t1 = RotateRight32<1>(d);
885 t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
886 d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
887
888 counter = _mm_sub_epi32(counter, decrement);
889
890 // Even round
891 t1 = RotateRight32<8>(c);
892 t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
893 c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
894
895 counter = _mm_sub_epi32(counter, decrement);
896 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
897
898 // Shuffle out two subkeys
899 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
900 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
901
902 // Odd round
903 t1 = RotateRight32<1>(b);
904 t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
905 b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
906
907 counter = _mm_sub_epi32(counter, decrement);
908
909 // Even round
910 t1 = RotateRight32<8>(a);
911 t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
912 a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
913
914 counter = _mm_sub_epi32(counter, decrement);
915 }
916
917 // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
918 block0 = RepackXMM<0>(a,b,c,d);
919}
920
921inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
922 __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
923{
924 // Rearrange the data for vectorization. UnpackXMM includes a
925 // little-endian swap for SSE. Thanks to Peter Cordes for help
926 // with packing and unpacking.
927 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
928 __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
929 __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
930 __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
931 __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
932
933 __m128i counter = _mm_set_epi32(0,0,0,0);
934 __m128i increment = _mm_set_epi32(1,1,1,1);
935
936 const unsigned int MASK = (rounds == 80 ? 7 : 15);
937 for (int i=0; i<static_cast<int>(rounds); i+=4)
938 {
939 __m128i k, k1, k2, t1, t2;
940 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+0) & MASK])));
941
942 // Shuffle out two subkeys
943 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
944 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
945
946 t1 = _mm_xor_si128(a, counter);
947 t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
948 a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
949
950 counter = _mm_add_epi32(counter, increment);
951
952 t1 = _mm_xor_si128(b, counter);
953 t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
954 b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
955
956 counter = _mm_add_epi32(counter, increment);
957 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i+2) & MASK])));
958
959 // Shuffle out two subkeys
960 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
961 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
962
963 t1 = _mm_xor_si128(c, counter);
964 t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
965 c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
966
967 counter = _mm_add_epi32(counter, increment);
968
969 t1 = _mm_xor_si128(d, counter);
970 t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
971 d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
972
973 counter = _mm_add_epi32(counter, increment);
974 }
975
976 // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
977 block0 = RepackXMM<0>(a,b,c,d);
978 block1 = RepackXMM<1>(a,b,c,d);
979 block2 = RepackXMM<2>(a,b,c,d);
980 block3 = RepackXMM<3>(a,b,c,d);
981}
982
983inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
984 __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
985{
986 // Rearrange the data for vectorization. UnpackXMM includes a
987 // little-endian swap for SSE. Thanks to Peter Cordes for help
988 // with packing and unpacking.
989 // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
990 __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
991 __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
992 __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
993 __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
994
995 __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
996 __m128i decrement = _mm_set_epi32(1,1,1,1);
997
998 const unsigned int MASK = (rounds == 80 ? 7 : 15);
999 for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
1000 {
1001 __m128i k, k1, k2, t1, t2;
1002 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-1) & MASK])));
1003
1004 // Shuffle out two subkeys
1005 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
1006 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
1007
1008 // Odd round
1009 t1 = RotateRight32<1>(d);
1010 t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
1011 d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1012
1013 counter = _mm_sub_epi32(counter, decrement);
1014
1015 // Even round
1016 t1 = RotateRight32<8>(c);
1017 t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
1018 c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1019
1020 counter = _mm_sub_epi32(counter, decrement);
1021 k = _mm_castpd_si128(_mm_load_sd((const double*)(&subkeys[(i-3) & MASK])));
1022
1023 // Shuffle out two subkeys
1024 k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
1025 k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
1026
1027 // Odd round
1028 t1 = RotateRight32<1>(b);
1029 t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
1030 b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1031
1032 counter = _mm_sub_epi32(counter, decrement);
1033
1034 // Even round
1035 t1 = RotateRight32<8>(a);
1036 t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
1037 a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
1038
1039 counter = _mm_sub_epi32(counter, decrement);
1040 }
1041
1042 // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
1043 block0 = RepackXMM<0>(a,b,c,d);
1044 block1 = RepackXMM<1>(a,b,c,d);
1045 block2 = RepackXMM<2>(a,b,c,d);
1046 block3 = RepackXMM<3>(a,b,c,d);
1047}
1048
1049//////////////////////////////////////////////////////////////////////////
1050
1051NAMESPACE_END // W32
1052
1053#endif // CRYPTOPP_SSSE3_AVAILABLE
1054
1055ANONYMOUS_NAMESPACE_END
1056
1057NAMESPACE_BEGIN(CryptoPP)
1058
1059#if defined(CRYPTOPP_SSSE3_AVAILABLE)
1060size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
1061 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1062{
1063 return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
1064 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1065}
1066
1067size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
1068 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1069{
1070 return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
1071 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1072}
1073
1074size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1075 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1076{
1077 return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
1078 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1079}
1080
1081size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1082 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1083{
1084 return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
1085 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1086}
1087#endif // CRYPTOPP_SSSE3_AVAILABLE
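// These wrappers are not called directly by applications; cham.cpp routes
// BlockTransformation::AdvancedProcessBlocks here when SSSE3 is detected at
// runtime. At the application level the cipher is driven through the usual
// interfaces, roughly like this (illustrative sketch only):
//
//   #include "cryptlib.h"
//   #include "modes.h"
//   #include "cham.h"
//
//   CryptoPP::byte key[16] = {0}, iv[16] = {0};
//   CryptoPP::CTR_Mode<CryptoPP::CHAM128>::Encryption enc;
//   enc.SetKeyWithIV(key, sizeof(key), iv);
//   enc.ProcessData(out, in, len);  // bulk calls reach the SSSE3 code paths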
1088
1089NAMESPACE_END