Crypto++
salsa.cpp
1 // salsa.cpp - written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
4 
5 #include "pch.h"
6 
7 #ifndef CRYPTOPP_GENERATE_X64_MASM
8 
9 #include "salsa.h"
10 #include "misc.h"
11 #include "argnames.h"
12 #include "cpu.h"
13 
14 NAMESPACE_BEGIN(CryptoPP)
15 
16 void Salsa20_TestInstantiations()
17 {
19 }
20 
21 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
22 {
23  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
24 
25  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
26  throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
27 
28  // m_state is reordered for SSE2
30  get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
31  GetBlock<word32, LittleEndian> get2(key + length - 16);
32  get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
33 
34  // "expand 16-byte k" or "expand 32-byte k"
35  m_state[0] = 0x61707865;
36  m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
37  m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
38  m_state[3] = 0x6b206574;
39 }
40 
41 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
42 {
43  assert(length==8);
45  get(m_state[14])(m_state[11]);
46  m_state[8] = m_state[5] = 0;
47 }
48 
49 void Salsa20_Policy::SeekToIteration(lword iterationCount)
50 {
51  m_state[8] = (word32)iterationCount;
52  m_state[5] = (word32)SafeRightShift<32>(iterationCount);
53 }
54 
55 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
56 unsigned int Salsa20_Policy::GetAlignment() const
57 {
58 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
59  if (HasSSE2())
60  return 16;
61  else
62 #endif
63  return GetAlignmentOf<word32>();
64 }
65 
66 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
67 {
68 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
69  if (HasSSE2())
70  return 4*BYTES_PER_ITERATION;
71  else
72 #endif
73  return BYTES_PER_ITERATION;
74 }
75 #endif
76 
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// Keystream routine with C linkage; its body is the MASM procedure of the same
// name that this file emits when preprocessed with CRYPTOPP_GENERATE_X64_MASM.
extern "C" void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
#endif
82 
83 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
84 
85 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
86 {
87 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
88 
89 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
90  Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
91  return;
92 #endif
93 
94 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
95 #ifdef CRYPTOPP_GENERATE_X64_MASM
96  ALIGN 8
97  Salsa20_OperateKeystream PROC FRAME
98  mov r10, [rsp + 5*8] ; state
99  alloc_stack(10*16 + 32*16 + 8)
100  save_xmm128 xmm6, 0200h
101  save_xmm128 xmm7, 0210h
102  save_xmm128 xmm8, 0220h
103  save_xmm128 xmm9, 0230h
104  save_xmm128 xmm10, 0240h
105  save_xmm128 xmm11, 0250h
106  save_xmm128 xmm12, 0260h
107  save_xmm128 xmm13, 0270h
108  save_xmm128 xmm14, 0280h
109  save_xmm128 xmm15, 0290h
110  .endprolog
111 
112  #define REG_output rcx
113  #define REG_input rdx
114  #define REG_iterationCount r8
115  #define REG_state r10
116  #define REG_rounds e9d
117  #define REG_roundsLeft eax
118  #define REG_temp32 r11d
119  #define REG_temp r11
120  #define SSE2_WORKSPACE rsp
121 #else
122  if (HasSSE2())
123  {
124  #if CRYPTOPP_BOOL_X64
125  #define REG_output %4
126  #define REG_input %1
127  #define REG_iterationCount %2
128  #define REG_state %3
129  #define REG_rounds %0
130  #define REG_roundsLeft eax
131  #define REG_temp32 edx
132  #define REG_temp rdx
133  #define SSE2_WORKSPACE %5
134 
136  #else
137  #define REG_output edi
138  #define REG_input eax
139  #define REG_iterationCount ecx
140  #define REG_state esi
141  #define REG_rounds edx
142  #define REG_roundsLeft ebx
143  #define REG_temp32 ebp
144  #define REG_temp ebp
145  #define SSE2_WORKSPACE esp + WORD_SZ
146  #endif
147 
148  #ifdef __GNUC__
149  __asm__ __volatile__
150  (
151  ".intel_syntax noprefix;"
152  AS_PUSH_IF86( bx)
153  #else
154  void *s = m_state.data();
155  word32 r = m_rounds;
156 
157  AS2( mov REG_iterationCount, iterationCount)
158  AS2( mov REG_input, input)
159  AS2( mov REG_output, output)
160  AS2( mov REG_state, s)
161  AS2( mov REG_rounds, r)
162  #endif
163 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
164 
165  AS_PUSH_IF86( bp)
166  AS2( cmp REG_iterationCount, 4)
167  ASJ( jl, 5, f)
168 
169 #if CRYPTOPP_BOOL_X86
170  AS2( mov ebx, esp)
171  AS2( and esp, -16)
172  AS2( sub esp, 32*16)
173  AS1( push ebx)
174 #endif
175 
176 #define SSE2_EXPAND_S(i, j) \
177  ASS( pshufd xmm4, xmm##i, j, j, j, j) \
178  AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
179 
180  AS2( movdqa xmm0, [REG_state + 0*16])
181  AS2( movdqa xmm1, [REG_state + 1*16])
182  AS2( movdqa xmm2, [REG_state + 2*16])
183  AS2( movdqa xmm3, [REG_state + 3*16])
184  SSE2_EXPAND_S(0, 0)
185  SSE2_EXPAND_S(0, 1)
186  SSE2_EXPAND_S(0, 2)
187  SSE2_EXPAND_S(0, 3)
188  SSE2_EXPAND_S(1, 0)
189  SSE2_EXPAND_S(1, 2)
190  SSE2_EXPAND_S(1, 3)
191  SSE2_EXPAND_S(2, 1)
192  SSE2_EXPAND_S(2, 2)
193  SSE2_EXPAND_S(2, 3)
194  SSE2_EXPAND_S(3, 0)
195  SSE2_EXPAND_S(3, 1)
196  SSE2_EXPAND_S(3, 2)
197  SSE2_EXPAND_S(3, 3)
198 
199 #define SSE2_EXPAND_S85(i) \
200  AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
201  AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
202  AS2( add REG_roundsLeft, 1) \
203  AS2( adc REG_temp32, 0)
204 
205  ASL(1)
206  AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
207  AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
208  SSE2_EXPAND_S85(0)
209  SSE2_EXPAND_S85(1)
210  SSE2_EXPAND_S85(2)
211  SSE2_EXPAND_S85(3)
212  AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
213  AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
214 
215 #define SSE2_QUARTER_ROUND(a, b, d, i) \
216  AS2( movdqa xmm4, xmm##d) \
217  AS2( paddd xmm4, xmm##a) \
218  AS2( movdqa xmm5, xmm4) \
219  AS2( pslld xmm4, i) \
220  AS2( psrld xmm5, 32-i) \
221  AS2( pxor xmm##b, xmm4) \
222  AS2( pxor xmm##b, xmm5)
223 
224 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
225 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
226 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
227 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
228 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
229 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
230 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
231 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
232 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
233 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
234 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
235 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
236 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
237 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
238 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
239 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
240 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
241 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
242 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
243 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
244 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
245 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
246 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
247 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
248 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
249 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
250 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
251 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
252 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
253 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
254 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
255 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
256 
257 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
258  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
259  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
260  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
261  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
262  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
263  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
264  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
265  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
266  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
267  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
268  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
269  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
270  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
271  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
272  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
273  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
274  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
275  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
276  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
277  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
278  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
279  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
280  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
281  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
282  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
283  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
284  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
285  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
286  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
287  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
288  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
289  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
290 
291 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
292  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
293  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
294  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
295  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
296  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
297  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
298  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
299  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
300  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
301  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
302  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
303  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
304  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
305  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
306  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
307  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
308  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
309  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
310  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
311  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
312  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
313  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
314  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
315  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
316  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
317  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
318  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
319  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
320  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
321  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
322  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
323  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
324 
325 #if CRYPTOPP_BOOL_X64
326  SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
327 #else
328  SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
329  SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
330 #endif
331  AS2( mov REG_roundsLeft, REG_rounds)
332  ASJ( jmp, 2, f)
333 
334  ASL(SSE2_Salsa_Output)
335  AS2( movdqa xmm0, xmm4)
336  AS2( punpckldq xmm4, xmm5)
337  AS2( movdqa xmm1, xmm6)
338  AS2( punpckldq xmm6, xmm7)
339  AS2( movdqa xmm2, xmm4)
340  AS2( punpcklqdq xmm4, xmm6) // e
341  AS2( punpckhqdq xmm2, xmm6) // f
342  AS2( punpckhdq xmm0, xmm5)
343  AS2( punpckhdq xmm1, xmm7)
344  AS2( movdqa xmm6, xmm0)
345  AS2( punpcklqdq xmm0, xmm1) // g
346  AS2( punpckhqdq xmm6, xmm1) // h
347  AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
348  AS1( ret)
349 
350  ASL(6)
351 #if CRYPTOPP_BOOL_X64
352  SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
353  ASL(2)
354  SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
355 #else
356  SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
357  SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
358  ASL(2)
359  SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
360  SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
361 #endif
362  AS2( sub REG_roundsLeft, 2)
363  ASJ( jnz, 6, b)
364 
365 #define SSE2_OUTPUT_4(a, b, c, d) \
366  AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
367  AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
368  AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
369  AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
370  AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
371  AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
372  AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
373  AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
374  ASC( call, SSE2_Salsa_Output)
375 
376  SSE2_OUTPUT_4(0, 13, 10, 7)
377  SSE2_OUTPUT_4(4, 1, 14, 11)
378  SSE2_OUTPUT_4(8, 5, 2, 15)
379  SSE2_OUTPUT_4(12, 9, 6, 3)
380  AS2( test REG_input, REG_input)
381  ASJ( jz, 9, f)
382  AS2( add REG_input, 12*16)
383  ASL(9)
384  AS2( add REG_output, 12*16)
385  AS2( sub REG_iterationCount, 4)
386  AS2( cmp REG_iterationCount, 4)
387  ASJ( jge, 1, b)
388  AS_POP_IF86( sp)
389 
390  ASL(5)
391  AS2( sub REG_iterationCount, 1)
392  ASJ( jl, 4, f)
393  AS2( movdqa xmm0, [REG_state + 0*16])
394  AS2( movdqa xmm1, [REG_state + 1*16])
395  AS2( movdqa xmm2, [REG_state + 2*16])
396  AS2( movdqa xmm3, [REG_state + 3*16])
397  AS2( mov REG_roundsLeft, REG_rounds)
398 
399  ASL(0)
400  SSE2_QUARTER_ROUND(0, 1, 3, 7)
401  SSE2_QUARTER_ROUND(1, 2, 0, 9)
402  SSE2_QUARTER_ROUND(2, 3, 1, 13)
403  SSE2_QUARTER_ROUND(3, 0, 2, 18)
404  ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
405  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
406  ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
407  SSE2_QUARTER_ROUND(0, 3, 1, 7)
408  SSE2_QUARTER_ROUND(3, 2, 0, 9)
409  SSE2_QUARTER_ROUND(2, 1, 3, 13)
410  SSE2_QUARTER_ROUND(1, 0, 2, 18)
411  ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
412  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
413  ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
414  AS2( sub REG_roundsLeft, 2)
415  ASJ( jnz, 0, b)
416 
417  AS2( paddd xmm0, [REG_state + 0*16])
418  AS2( paddd xmm1, [REG_state + 1*16])
419  AS2( paddd xmm2, [REG_state + 2*16])
420  AS2( paddd xmm3, [REG_state + 3*16])
421 
422  AS2( add dword ptr [REG_state + 8*4], 1)
423  AS2( adc dword ptr [REG_state + 5*4], 0)
424 
425  AS2( pcmpeqb xmm6, xmm6) // all ones
426  AS2( psrlq xmm6, 32) // lo32 mask
427  ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
428  AS2( movdqa xmm4, xmm0)
429  AS2( movdqa xmm5, xmm3)
430  AS2( pand xmm0, xmm7)
431  AS2( pand xmm4, xmm6)
432  AS2( pand xmm3, xmm6)
433  AS2( pand xmm5, xmm7)
434  AS2( por xmm4, xmm5) // 0,13,2,15
435  AS2( movdqa xmm5, xmm1)
436  AS2( pand xmm1, xmm7)
437  AS2( pand xmm5, xmm6)
438  AS2( por xmm0, xmm5) // 4,1,6,3
439  AS2( pand xmm6, xmm2)
440  AS2( pand xmm2, xmm7)
441  AS2( por xmm1, xmm6) // 8,5,10,7
442  AS2( por xmm2, xmm3) // 12,9,14,11
443 
444  AS2( movdqa xmm5, xmm4)
445  AS2( movdqa xmm6, xmm0)
446  AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
447  AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
448  AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
449  AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
450 
451  // output keystream
452  AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
453  ASJ( jmp, 5, b)
454  ASL(4)
455 
456  AS_POP_IF86( bp)
457 #ifdef __GNUC__
458  AS_POP_IF86( bx)
459  ".att_syntax prefix;"
460  :
461  #if CRYPTOPP_BOOL_X64
462  : "r" (m_rounds), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace.m_ptr)
463  : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
464  #else
465  : "d" (m_rounds), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
466  : "memory", "cc"
467  #endif
468  );
469 #endif
470 #ifdef CRYPTOPP_GENERATE_X64_MASM
471  movdqa xmm6, [rsp + 0200h]
472  movdqa xmm7, [rsp + 0210h]
473  movdqa xmm8, [rsp + 0220h]
474  movdqa xmm9, [rsp + 0230h]
475  movdqa xmm10, [rsp + 0240h]
476  movdqa xmm11, [rsp + 0250h]
477  movdqa xmm12, [rsp + 0260h]
478  movdqa xmm13, [rsp + 0270h]
479  movdqa xmm14, [rsp + 0280h]
480  movdqa xmm15, [rsp + 0290h]
481  add rsp, 10*16 + 32*16 + 8
482  ret
483 Salsa20_OperateKeystream ENDP
484 #else
485  }
486  else
487 #endif
488 #endif
489 #ifndef CRYPTOPP_GENERATE_X64_MASM
490  {
491  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
492 
493  while (iterationCount--)
494  {
495  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
496  x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
497  x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
498  x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
499 
500  for (int i=m_rounds; i>0; i-=2)
501  {
502  #define QUARTER_ROUND(a, b, c, d) \
503  b = b ^ rotlFixed(a + d, 7); \
504  c = c ^ rotlFixed(b + a, 9); \
505  d = d ^ rotlFixed(c + b, 13); \
506  a = a ^ rotlFixed(d + c, 18);
507 
508  QUARTER_ROUND(x0, x4, x8, x12)
509  QUARTER_ROUND(x1, x5, x9, x13)
510  QUARTER_ROUND(x2, x6, x10, x14)
511  QUARTER_ROUND(x3, x7, x11, x15)
512 
513  QUARTER_ROUND(x0, x13, x10, x7)
514  QUARTER_ROUND(x1, x14, x11, x4)
515  QUARTER_ROUND(x2, x15, x8, x5)
516  QUARTER_ROUND(x3, x12, x9, x6)
517  }
518 
519  #define SALSA_OUTPUT(x) {\
520  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
521  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
522  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
523  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
524  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
525  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
526  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
527  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
528  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
529  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
530  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
531  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
532  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
533  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
534  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
535  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
536 
537 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
538  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
539 #endif
540 
541  if (++m_state[8] == 0)
542  ++m_state[5];
543  }
544  }
545 } // see comment above if an internal compiler error occurs here
546 
547 void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
548 {
549  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
550 
551  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
552  throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
553 
554  GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
555  if (length == 16)
556  memcpy(m_key.begin()+4, m_key.begin(), 16);
557 
558  // "expand 32-byte k"
559  m_state[0] = 0x61707865;
560  m_state[1] = 0x3320646e;
561  m_state[2] = 0x79622d32;
562  m_state[3] = 0x6b206574;
563 }
564 
565 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
566 {
567  assert(length==24);
568 
569  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
570 
572  get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
573 
574  x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
575  x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
576  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
577 
578  for (int i=m_rounds; i>0; i-=2)
579  {
580  QUARTER_ROUND(x0, x4, x8, x12)
581  QUARTER_ROUND(x1, x5, x9, x13)
582  QUARTER_ROUND(x2, x6, x10, x14)
583  QUARTER_ROUND(x3, x7, x11, x15)
584 
585  QUARTER_ROUND(x0, x13, x10, x7)
586  QUARTER_ROUND(x1, x14, x11, x4)
587  QUARTER_ROUND(x2, x15, x8, x5)
588  QUARTER_ROUND(x3, x12, x9, x6)
589  }
590 
591  m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
592  m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
593  m_state[8] = m_state[5] = 0;
594 }
595 
596 NAMESPACE_END
597 
598 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM