Crypto++ 8.2
Free C++ class library of cryptographic schemes
salsa.cpp
1// salsa.cpp - originally written and placed in the public domain by Wei Dai
2
3// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
4
5#include "pch.h"
6#include "config.h"
7
8#ifndef CRYPTOPP_GENERATE_X64_MASM
9
10#include "salsa.h"
11#include "argnames.h"
12#include "misc.h"
13#include "cpu.h"
14
15#if CRYPTOPP_MSC_VERSION
16# pragma warning(disable: 4702 4740)
17#endif
18
19// Clang due to "Inline assembly operands don't work with .intel_syntax"
20// https://llvm.org/bugs/show_bug.cgi?id=24232
21#if defined(CRYPTOPP_DISABLE_SALSA_ASM)
22# undef CRYPTOPP_X86_ASM_AVAILABLE
23# undef CRYPTOPP_X32_ASM_AVAILABLE
24# undef CRYPTOPP_X64_ASM_AVAILABLE
25# undef CRYPTOPP_SSE2_ASM_AVAILABLE
26# undef CRYPTOPP_SSSE3_ASM_AVAILABLE
27#endif
28
29NAMESPACE_BEGIN(CryptoPP)
30
// Debug-only helper that forces template instantiation of the Salsa20
// classes so compile errors surface here rather than in client code.
// NOTE(review): the statements inside the braces (upstream lines 34-35,
// presumably Salsa20::Encryption / XSalsa20::Encryption instantiations)
// are missing from this extraction -- restore from upstream.
31#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
32void Salsa20_TestInstantiations()
33{
36}
37#endif
38
39void Salsa20_Core(word32* data, unsigned int rounds)
40{
41 CRYPTOPP_ASSERT(data != NULLPTR);
42 CRYPTOPP_ASSERT(rounds % 2 == 0);
43
44 CRYPTOPP_ALIGN_DATA(16) word32 x[16];
45
46 for (size_t i = 0; i < 16; ++i)
47 x[i] = data[i];
48
49 // Rounds must be even
50 for (size_t i = 0; i < rounds; i += 2)
51 {
52 x[ 4] ^= rotlConstant< 7>(x[ 0]+x[12]);
53 x[ 8] ^= rotlConstant< 9>(x[ 4]+x[ 0]);
54 x[12] ^= rotlConstant<13>(x[ 8]+x[ 4]);
55 x[ 0] ^= rotlConstant<18>(x[12]+x[ 8]);
56
57 x[ 9] ^= rotlConstant< 7>(x[ 5]+x[ 1]);
58 x[13] ^= rotlConstant< 9>(x[ 9]+x[ 5]);
59 x[ 1] ^= rotlConstant<13>(x[13]+x[ 9]);
60 x[ 5] ^= rotlConstant<18>(x[ 1]+x[13]);
61
62 x[14] ^= rotlConstant< 7>(x[10]+x[ 6]);
63 x[ 2] ^= rotlConstant< 9>(x[14]+x[10]);
64 x[ 6] ^= rotlConstant<13>(x[ 2]+x[14]);
65 x[10] ^= rotlConstant<18>(x[ 6]+x[ 2]);
66
67 x[ 3] ^= rotlConstant< 7>(x[15]+x[11]);
68 x[ 7] ^= rotlConstant< 9>(x[ 3]+x[15]);
69 x[11] ^= rotlConstant<13>(x[ 7]+x[ 3]);
70 x[15] ^= rotlConstant<18>(x[11]+x[ 7]);
71
72 x[ 1] ^= rotlConstant< 7>(x[ 0]+x[ 3]);
73 x[ 2] ^= rotlConstant< 9>(x[ 1]+x[ 0]);
74 x[ 3] ^= rotlConstant<13>(x[ 2]+x[ 1]);
75 x[ 0] ^= rotlConstant<18>(x[ 3]+x[ 2]);
76
77 x[ 6] ^= rotlConstant< 7>(x[ 5]+x[ 4]);
78 x[ 7] ^= rotlConstant< 9>(x[ 6]+x[ 5]);
79 x[ 4] ^= rotlConstant<13>(x[ 7]+x[ 6]);
80 x[ 5] ^= rotlConstant<18>(x[ 4]+x[ 7]);
81
82 x[11] ^= rotlConstant< 7>(x[10]+x[ 9]);
83 x[ 8] ^= rotlConstant< 9>(x[11]+x[10]);
84 x[ 9] ^= rotlConstant<13>(x[ 8]+x[11]);
85 x[10] ^= rotlConstant<18>(x[ 9]+x[ 8]);
86
87 x[12] ^= rotlConstant< 7>(x[15]+x[14]);
88 x[13] ^= rotlConstant< 9>(x[12]+x[15]);
89 x[14] ^= rotlConstant<13>(x[13]+x[12]);
90 x[15] ^= rotlConstant<18>(x[14]+x[13]);
91 }
92
93// OpenMP 4.0 released July 2013.
94#if _OPENMP >= 201307
95 #pragma omp simd
96 for (size_t i = 0; i < 16; ++i)
97 data[i] += x[i];
98#else
99 for (size_t i = 0; i < 16; ++i)
100 data[i] += x[i];
101#endif
102}
103
104std::string Salsa20_Policy::AlgorithmProvider() const
105{
106#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_SALSA_ASM)
107 if (HasSSE2())
108 return "SSE2";
109#endif
110 return "C++";
111}
112
// Keys the cipher: validates the Rounds parameter (8, 12 or 20), loads the
// key into m_state in the SSE2-friendly word order, and installs the sigma
// ("expand 32-byte k") or tau ("expand 16-byte k") constants.
// NOTE(review): upstream line 124, presumably
// `GetBlock<word32, LittleEndian> get1(key);`, is missing from this
// extraction -- without it `get1` below is undeclared; restore from upstream.
113void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
114{
115 // Use previous rounds as the default value
116 int rounds = params.GetIntValueWithDefault(Name::Rounds(), m_rounds);
117 if (rounds != 20 && rounds != 12 && rounds != 8)
118 throw InvalidRounds(Salsa20::StaticAlgorithmName(), rounds);
119
120 // Latch a good value
121 m_rounds = rounds;
122
123 // m_state is reordered for SSE2
125 get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
// Second half of the key (or the same 16 bytes again for a 16-byte key,
// since key + length - 16 == key when length == 16).
126 GetBlock<word32, LittleEndian> get2(key + length - 16);
127 get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
128
129 // "expand 16-byte k" or "expand 32-byte k"
130 m_state[0] = 0x61707865;
131 m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
132 m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
133 m_state[3] = 0x6b206574;
134}
135
// Loads a fresh 8-byte IV into state words 14 and 11 and resets the 64-bit
// block counter (words 8 low / 5 high) to zero.
// NOTE(review): upstream line 141, presumably
// `GetBlock<word32, LittleEndian> get(IV);`, is missing from this
// extraction -- without it `get` below is undeclared; restore from upstream.
136void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
137{
138 CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
139 CRYPTOPP_ASSERT(length==8);
140
142 get(m_state[14])(m_state[11]);
143 m_state[8] = m_state[5] = 0;
144}
145
146void Salsa20_Policy::SeekToIteration(lword iterationCount)
147{
148 m_state[8] = (word32)iterationCount;
149 m_state[5] = (word32)SafeRightShift<32>(iterationCount);
150}
151
152#if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
153unsigned int Salsa20_Policy::GetAlignment() const
154{
155#if CRYPTOPP_SSE2_ASM_AVAILABLE
156 if (HasSSE2())
157 return 16;
158 else
159#endif
160 return GetAlignmentOf<word32>();
161}
162
163unsigned int Salsa20_Policy::GetOptimalBlockSize() const
164{
165#if CRYPTOPP_SSE2_ASM_AVAILABLE
166 if (HasSSE2())
167 return 4*BYTES_PER_ITERATION;
168 else
169#endif
170 return BYTES_PER_ITERATION;
171}
172#endif
173
174#ifdef CRYPTOPP_X64_MASM_AVAILABLE
175extern "C" {
176void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
177}
178#endif
179
180#if CRYPTOPP_MSC_VERSION
181# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
182#endif
183
// Produces iterationCount blocks of Salsa20 keystream (combined with input
// when present). This one body serves three builds:
//   - MASM generation (CRYPTOPP_GENERATE_X64_MASM): the AS*/ASL/ASJ macros
//     emit MASM text for the standalone Salsa20_OperateKeystream PROC.
//   - GCC-style SSE2 inline assembly (x86/x64), processing four blocks at a
//     time with a one-block scalar-SSE2 tail.
//   - Portable C++ (the final braced block).
// NOTE(review): the `operation` parameter is unused in the visible code; it
// is presumably consumed by the missing CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH
// line noted near SALSA_OUTPUT below -- confirm against upstream.
184void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
185{
186#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
187
188#ifdef CRYPTOPP_X64_MASM_AVAILABLE
189 Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
190 return;
191#endif
192
193#if CRYPTOPP_SSE2_ASM_AVAILABLE
194#ifdef CRYPTOPP_GENERATE_X64_MASM
195 ALIGN 8
196 Salsa20_OperateKeystream PROC FRAME
197 mov r10, [rsp + 5*8] ; state
198 alloc_stack(10*16 + 32*16 + 8)
199 save_xmm128 xmm6, 0200h
200 save_xmm128 xmm7, 0210h
201 save_xmm128 xmm8, 0220h
202 save_xmm128 xmm9, 0230h
203 save_xmm128 xmm10, 0240h
204 save_xmm128 xmm11, 0250h
205 save_xmm128 xmm12, 0260h
206 save_xmm128 xmm13, 0270h
207 save_xmm128 xmm14, 0280h
208 save_xmm128 xmm15, 0290h
209 .endprolog
210
211 #define REG_output rcx
212 #define REG_input rdx
213 #define REG_iterationCount r8
214 #define REG_state r10
// NOTE(review): "e9d" is not an x64 register; the fourth Win64 integer
// argument (rounds) arrives in r9, so this looks like a garbled "r9d" --
// verify against upstream before building the MASM path.
215 #define REG_rounds e9d
216 #define REG_roundsLeft eax
217 #define REG_temp32 r11d
218 #define REG_temp r11
219 #define SSE2_WORKSPACE rsp
220#else
221 if (HasSSE2())
222 {
223 #if CRYPTOPP_BOOL_X64
224 #define REG_output %1
225 #define REG_input %0
226 #define REG_iterationCount %2
227 #define REG_state %4 /* constant */
228 #define REG_rounds %3 /* constant */
229 #define REG_roundsLeft eax
230 #define REG_temp32 edx
231 #define REG_temp rdx
232 #define SSE2_WORKSPACE %5 /* constant */
233
234 CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
235 #else
236 #define REG_output edi
237 #define REG_input eax
238 #define REG_iterationCount ecx
239 #define REG_state esi
240 #define REG_rounds edx
241 #define REG_roundsLeft ebx
242 #define REG_temp32 ebp
243 #define REG_temp ebp
244 #define SSE2_WORKSPACE esp + WORD_SZ
245 #endif
246
247 #ifdef __GNUC__
248 __asm__ __volatile__
249 (
250 INTEL_NOPREFIX
251 AS_PUSH_IF86( bx)
252 #else
253 void *s = m_state.data();
254 word32 r = m_rounds;
255
256 AS2( mov REG_iterationCount, iterationCount)
257 AS2( mov REG_input, input)
258 AS2( mov REG_output, output)
259 AS2( mov REG_state, s)
260 AS2( mov REG_rounds, r)
261 #endif
262#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
263
// Fewer than 4 blocks requested? Jump to the single-block tail at label 5.
264 AS_PUSH_IF86( bp)
265 AS2( cmp REG_iterationCount, 4)
266 ASJ( jl, 5, f)
267
268#if CRYPTOPP_BOOL_X86
269 AS2( mov ebx, esp)
270 AS2( and esp, -16)
271 AS2( sub esp, 32*16)
272 AS1( push ebx)
273#endif
274
// Broadcast state word j of xmm_i across a 16-byte lane and park it in the
// upper half of the workspace (one lane per state word, for 4-way SIMD).
275#define SSE2_EXPAND_S(i, j) \
276 ASS( pshufd xmm4, xmm##i, j, j, j, j) \
277 AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
278
279 AS2( movdqa xmm0, [REG_state + 0*16])
280 AS2( movdqa xmm1, [REG_state + 1*16])
281 AS2( movdqa xmm2, [REG_state + 2*16])
282 AS2( movdqa xmm3, [REG_state + 3*16])
283 SSE2_EXPAND_S(0, 0)
284 SSE2_EXPAND_S(0, 1)
285 SSE2_EXPAND_S(0, 2)
286 SSE2_EXPAND_S(0, 3)
287 SSE2_EXPAND_S(1, 0)
288 SSE2_EXPAND_S(1, 2)
289 SSE2_EXPAND_S(1, 3)
290 SSE2_EXPAND_S(2, 1)
291 SSE2_EXPAND_S(2, 2)
292 SSE2_EXPAND_S(2, 3)
293 SSE2_EXPAND_S(3, 0)
294 SSE2_EXPAND_S(3, 1)
295 SSE2_EXPAND_S(3, 2)
296 SSE2_EXPAND_S(3, 3)
297
// State words 8 (low) and 5 (high) form the 64-bit block counter; give each
// of the four parallel blocks its own successive counter value (add/adc).
298#define SSE2_EXPAND_S85(i) \
299 AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
300 AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
301 AS2( add REG_roundsLeft, 1) \
302 AS2( adc REG_temp32, 0)
303
304 ASL(1)
305 AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
306 AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
307 SSE2_EXPAND_S85(0)
308 SSE2_EXPAND_S85(1)
309 SSE2_EXPAND_S85(2)
310 SSE2_EXPAND_S85(3)
311 AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
312 AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
313
// Quarter-round step b ^= (a + d) <<< i: AMD XOP has a native rotate
// (vprotd); plain SSE2 emulates it with a shift-left/shift-right/xor pair.
314#ifdef __XOP__
315#define SSE2_QUARTER_ROUND(a, b, d, i) \
316 AS2( movdqa xmm4, xmm##d) \
317 AS2( paddd xmm4, xmm##a) \
318 AS3( vprotd xmm4, xmm4, i) \
319 AS2( pxor xmm##b, xmm4)
320#else
321#define SSE2_QUARTER_ROUND(a, b, d, i) \
322 AS2( movdqa xmm4, xmm##d) \
323 AS2( paddd xmm4, xmm##a) \
324 AS2( movdqa xmm5, xmm4) \
325 AS2( pslld xmm4, i) \
326 AS2( psrld xmm5, 32-i) \
327 AS2( pxor xmm##b, xmm4) \
328 AS2( pxor xmm##b, xmm5)
329#endif
330
// L01..L32: one full quarter round broken into single instructions so that
// two (x86) or four (x64) independent streams can be interleaved below.
331#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
332#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
333#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
334
335#ifdef __XOP__
336#define L04(A,B,C,D,a,b,c,d,i)
337#define L05(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 7)
338#define L06(A,B,C,D,a,b,c,d,i)
339#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
340#define L08(A,B,C,D,a,b,c,d,i)
341#else
342#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
343#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
344#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
345#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
346#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
347#endif
348
349#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
350#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
351#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
352
353#ifdef __XOP__
354#define L12(A,B,C,D,a,b,c,d,i)
355#define L13(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 9)
356#define L14(A,B,C,D,a,b,c,d,i)
357#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
358#define L16(A,B,C,D,a,b,c,d,i)
359#else
360#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
361#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
362#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
363#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
364#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
365#endif
366
367#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
368#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
369#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
370
371#ifdef __XOP__
372#define L20(A,B,C,D,a,b,c,d,i)
373#define L21(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 13)
374#define L22(A,B,C,D,a,b,c,d,i)
375#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
376#define L24(A,B,C,D,a,b,c,d,i)
377#else
378#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
379#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
380#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
381#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
382#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
383#endif
384
385#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
386#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
387
388#ifdef __XOP__
389#define L27(A,B,C,D,a,b,c,d,i)
390#define L28(A,B,C,D,a,b,c,d,i) AS3( vprotd xmm##A, xmm##A, 18)
391#define L29(A,B,C,D,a,b,c,d,i)
392#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
393#define L31(A,B,C,D,a,b,c,d,i)
394#else
395#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
396#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
397#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
398#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
399#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
400#endif
401
402#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
403
// Interleave two quarter-round streams (x86: 8 xmm regs available).
404#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
405 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
406 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
407 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
408 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
409 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
410 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
411 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
412 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
413 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
414 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
415 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
416 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
417 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
418 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
419 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
420 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
421 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
422 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
423 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
424 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
425 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
426 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
427 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
428 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
429 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
430 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
431 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
432 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
433 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
434 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
435 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
436 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
437
// Interleave four quarter-round streams (x64: 16 xmm regs available).
438#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
439 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
440 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
441 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
442 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
443 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
444 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
445 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
446 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
447 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
448 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
449 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
450 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
451 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
452 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
453 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
454 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
455 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
456 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
457 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
458 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
459 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
460 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
461 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
462 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
463 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
464 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
465 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
466 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
467 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
468 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
469 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
470 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
471
472#if CRYPTOPP_BOOL_X64
473 SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
474#else
475 SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
476 SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
477#endif
478 AS2( mov REG_roundsLeft, REG_rounds)
479 ASJ( jmp, 2, f)
480
// Subroutine: transpose four per-lane results back to block order and emit
// them via AS_XMM_OUTPUT4 (which also xors with input when present).
481 ASL(SSE2_Salsa_Output)
482 AS2( movdqa xmm0, xmm4)
483 AS2( punpckldq xmm4, xmm5)
484 AS2( movdqa xmm1, xmm6)
485 AS2( punpckldq xmm6, xmm7)
486 AS2( movdqa xmm2, xmm4)
487 AS2( punpcklqdq xmm4, xmm6) // e
488 AS2( punpckhqdq xmm2, xmm6) // f
489 AS2( punpckhdq xmm0, xmm5)
490 AS2( punpckhdq xmm1, xmm7)
491 AS2( movdqa xmm6, xmm0)
492 AS2( punpcklqdq xmm0, xmm1) // g
493 AS2( punpckhqdq xmm6, xmm1) // h
494 AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
495 AS1( ret)
496
// Main 4-block round loop: two rounds per pass until roundsLeft hits zero.
497 ASL(6)
498#if CRYPTOPP_BOOL_X64
499 SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
500 ASL(2)
501 SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
502#else
503 SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
504 SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
505 ASL(2)
506 SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
507 SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
508#endif
509 AS2( sub REG_roundsLeft, 2)
510 ASJ( jnz, 6, b)
511
// Feed-forward: add the expanded input state (workspace + 256) back into
// each result lane, then hand four words to the output subroutine.
512#define SSE2_OUTPUT_4(a, b, c, d) \
513 AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
514 AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
515 AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
516 AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
517 AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
518 AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
519 AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
520 AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
521 ASC( call, SSE2_Salsa_Output)
522
523 SSE2_OUTPUT_4(0, 13, 10, 7)
524 SSE2_OUTPUT_4(4, 1, 14, 11)
525 SSE2_OUTPUT_4(8, 5, 2, 15)
526 SSE2_OUTPUT_4(12, 9, 6, 3)
527 AS2( test REG_input, REG_input)
528 ASJ( jz, 9, f)
529 AS2( add REG_input, 12*16)
530 ASL(9)
531 AS2( add REG_output, 12*16)
532 AS2( sub REG_iterationCount, 4)
533 AS2( cmp REG_iterationCount, 4)
534 ASJ( jge, 1, b)
535 AS_POP_IF86( sp)
536
// Tail: one block at a time, state kept in xmm0..xmm3 with pshufd
// re-alignment between the column and row half-rounds.
537 ASL(5)
538 AS2( sub REG_iterationCount, 1)
539 ASJ( jl, 4, f)
540 AS2( movdqa xmm0, [REG_state + 0*16])
541 AS2( movdqa xmm1, [REG_state + 1*16])
542 AS2( movdqa xmm2, [REG_state + 2*16])
543 AS2( movdqa xmm3, [REG_state + 3*16])
544 AS2( mov REG_roundsLeft, REG_rounds)
545
546 ASL(0)
547 SSE2_QUARTER_ROUND(0, 1, 3, 7)
548 SSE2_QUARTER_ROUND(1, 2, 0, 9)
549 SSE2_QUARTER_ROUND(2, 3, 1, 13)
550 SSE2_QUARTER_ROUND(3, 0, 2, 18)
551 ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
552 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
553 ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
554 SSE2_QUARTER_ROUND(0, 3, 1, 7)
555 SSE2_QUARTER_ROUND(3, 2, 0, 9)
556 SSE2_QUARTER_ROUND(2, 1, 3, 13)
557 SSE2_QUARTER_ROUND(1, 0, 2, 18)
558 ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
559 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
560 ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
561 AS2( sub REG_roundsLeft, 2)
562 ASJ( jnz, 0, b)
563
564 AS2( paddd xmm0, [REG_state + 0*16])
565 AS2( paddd xmm1, [REG_state + 1*16])
566 AS2( paddd xmm2, [REG_state + 2*16])
567 AS2( paddd xmm3, [REG_state + 3*16])
568
// Bump the 64-bit block counter (state words 8 low / 5 high).
569 AS2( add dword ptr [REG_state + 8*4], 1)
570 AS2( adc dword ptr [REG_state + 5*4], 0)
571
// Un-shuffle the SSE2 state layout back into canonical word order
// before writing the keystream.
572 AS2( pcmpeqb xmm6, xmm6) // all ones
573 AS2( psrlq xmm6, 32) // lo32 mask
574 ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
575 AS2( movdqa xmm4, xmm0)
576 AS2( movdqa xmm5, xmm3)
577 AS2( pand xmm0, xmm7)
578 AS2( pand xmm4, xmm6)
579 AS2( pand xmm3, xmm6)
580 AS2( pand xmm5, xmm7)
581 AS2( por xmm4, xmm5) // 0,13,2,15
582 AS2( movdqa xmm5, xmm1)
583 AS2( pand xmm1, xmm7)
584 AS2( pand xmm5, xmm6)
585 AS2( por xmm0, xmm5) // 4,1,6,3
586 AS2( pand xmm6, xmm2)
587 AS2( pand xmm2, xmm7)
588 AS2( por xmm1, xmm6) // 8,5,10,7
589 AS2( por xmm2, xmm3) // 12,9,14,11
590
591 AS2( movdqa xmm5, xmm4)
592 AS2( movdqa xmm6, xmm0)
593 AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
594 AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
595 AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
596 AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
597
598 // output keystream
599 AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
600 ASJ( jmp, 5, b)
601 ASL(4)
602
603 AS_POP_IF86( bp)
604#ifdef __GNUC__
605 AS_POP_IF86( bx)
606 ATT_PREFIX
607 #if CRYPTOPP_BOOL_X64
608 : "+r" (input), "+r" (output), "+r" (iterationCount)
609 : "r" (m_rounds), "r" (m_state.begin()), "r" (workspace)
610 : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
611 #else
612 : "+a" (input), "+D" (output), "+c" (iterationCount)
613 : "d" (m_rounds), "S" (m_state.begin())
614 : "memory", "cc"
615 #endif
616 );
617#endif
618#ifdef CRYPTOPP_GENERATE_X64_MASM
619 movdqa xmm6, [rsp + 0200h]
620 movdqa xmm7, [rsp + 0210h]
621 movdqa xmm8, [rsp + 0220h]
622 movdqa xmm9, [rsp + 0230h]
623 movdqa xmm10, [rsp + 0240h]
624 movdqa xmm11, [rsp + 0250h]
625 movdqa xmm12, [rsp + 0260h]
626 movdqa xmm13, [rsp + 0270h]
627 movdqa xmm14, [rsp + 0280h]
628 movdqa xmm15, [rsp + 0290h]
629 add rsp, 10*16 + 32*16 + 8
630 ret
631Salsa20_OperateKeystream ENDP
632#else
633 }
634 else
635#endif
636#endif
// Portable C++ path: one block per loop pass, working in locals x0..x15.
637#ifndef CRYPTOPP_GENERATE_X64_MASM
638 {
639 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
640
641 while (iterationCount--)
642 {
643 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
644 x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
645 x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
646 x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
647
648 for (int i=m_rounds; i>0; i-=2)
649 {
650 #define QUARTER_ROUND(a, b, c, d) \
651 b = b ^ rotlConstant<7>(a + d); \
652 c = c ^ rotlConstant<9>(b + a); \
653 d = d ^ rotlConstant<13>(c + b); \
654 a = a ^ rotlConstant<18>(d + c);
655
656 QUARTER_ROUND(x0, x4, x8, x12)
657 QUARTER_ROUND(x1, x5, x9, x13)
658 QUARTER_ROUND(x2, x6, x10, x14)
659 QUARTER_ROUND(x3, x7, x11, x15)
660
661 QUARTER_ROUND(x0, x13, x10, x7)
662 QUARTER_ROUND(x1, x14, x11, x4)
663 QUARTER_ROUND(x2, x15, x8, x5)
664 QUARTER_ROUND(x3, x12, x9, x6)
665 }
666
667#ifndef CRYPTOPP_DOXYGEN_PROCESSING
668 #define SALSA_OUTPUT(x) {\
669 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
670 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
671 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
672 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
673 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
674 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
675 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
676 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
677 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
678 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
679 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
680 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
681 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
682 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
683 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
684 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
685
// NOTE(review): the invocation
// `CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);`
// (upstream line 686) is missing from this extraction -- as shown,
// SALSA_OUTPUT is defined and immediately undefined, so the portable path
// would emit nothing. Restore from upstream.
687 #undef SALSA_OUTPUT
688#endif
689
690 if (++m_state[8] == 0)
691 ++m_state[5];
692 }
693 }
694} // see comment above if an internal compiler error occurs here
695
696void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
697{
698 m_rounds = params.GetIntValueWithDefault(Name::Rounds(), m_rounds);
699 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
700 throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
701
702 GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
703 if (length == 16)
704 memcpy(m_key.begin()+4, m_key.begin(), 16);
705
706 // "expand 32-byte k"
707 m_state[0] = 0x61707865;
708 m_state[1] = 0x3320646e;
709 m_state[2] = 0x79622d32;
710 m_state[3] = 0x6b206574;
711}
712
// XSalsa20 IV setup (HSalsa20): runs the round function over the sigma
// constants, the key and the first 16 IV bytes, then uses the resulting
// words as the Salsa20 subkey and the remaining 8 IV bytes as the nonce.
// Relies on the QUARTER_ROUND macro defined in OperateKeystream above.
// NOTE(review): upstream line 720, presumably
// `GetBlock<word32, LittleEndian> get(IV);`, is missing from this
// extraction -- without it `get` below is undeclared; restore from upstream.
713void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
714{
715 CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
716 CRYPTOPP_ASSERT(length==24);
717
718 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
719
721 get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
722
723 x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
724 x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
725 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
726
727 for (int i=m_rounds; i>0; i-=2)
728 {
729 QUARTER_ROUND(x0, x4, x8, x12)
730 QUARTER_ROUND(x1, x5, x9, x13)
731 QUARTER_ROUND(x2, x6, x10, x14)
732 QUARTER_ROUND(x3, x7, x11, x15)
733
734 QUARTER_ROUND(x0, x13, x10, x7)
735 QUARTER_ROUND(x1, x14, x11, x4)
736 QUARTER_ROUND(x2, x15, x8, x5)
737 QUARTER_ROUND(x3, x12, x9, x6)
738 }
739
// Store the derived subkey in the SSE2-friendly word order and zero the
// block counter (words 8 low / 5 high).
740 m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
741 m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
742 m_state[8] = m_state[5] = 0;
743}
744
745NAMESPACE_END
746
747#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
Standard names for retrieving values by name when working with NameValuePairs.
Access a block of memory.
Definition: misc.h:2455
Exception thrown when an invalid number of rounds is encountered.
Definition: simple.h:60
Interface for retrieving values given their names.
Definition: cryptlib.h:294
int GetIntValueWithDefault(const char *name, int defaultValue) const
Get a named value with type int, with default.
Definition: cryptlib.h:395
iterator begin()
Provides an iterator pointing to the first element in the memory block.
Definition: secblock.h:772
A::pointer data()
Provides a pointer to the first element in the memory block.
Definition: secblock.h:789
size_type size() const
Provides the count of elements in the SecBlock.
Definition: secblock.h:797
SymmetricCipher implementation.
Definition: strciphr.h:674
void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
Key the cipher.
Definition: salsa.cpp:696
void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
Resynchronize the cipher.
Definition: salsa.cpp:713
Library configuration file.
Functions for CPU features and intrinsics.
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:116
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:145
Utility functions for the Crypto++ library.
Crypto++ library namespace.
const char * Rounds()
int
Definition: argnames.h:24
Precompiled header file.
Classes for Salsa and Salsa20 stream ciphers.
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y)
Helper macro to implement OperateKeystream.
Definition: strciphr.h:268
KeystreamOperation
Keystream operation flags.
Definition: strciphr.h:88
virtual unsigned int GetAlignment() const
Provides data alignment requirements.
Definition: strciphr.h:112
virtual unsigned int GetOptimalBlockSize() const
Provides number of ideal bytes to process.
Definition: strciphr.h:123
static const int BYTES_PER_ITERATION
Number of bytes for an iteration.
Definition: strciphr.h:211
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69