7 #ifndef CRYPTOPP_GENERATE_X64_MASM
14 NAMESPACE_BEGIN(CryptoPP)
16 void Salsa20_TestInstantiations()
21 void Salsa20_Policy::CipherSetKey(
const NameValuePairs ¶ms,
const byte *key,
size_t length)
25 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
26 throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
30 get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
32 get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
35 m_state[0] = 0x61707865;
36 m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
37 m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
38 m_state[3] = 0x6b206574;
41 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer,
const byte *IV,
size_t length)
45 get(m_state[14])(m_state[11]);
46 m_state[8] = m_state[5] = 0;
49 void Salsa20_Policy::SeekToIteration(lword iterationCount)
51 m_state[8] = (word32)iterationCount;
52 m_state[5] = (word32)SafeRightShift<32>(iterationCount);
55 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
56 unsigned int Salsa20_Policy::GetAlignment()
const
58 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
63 return GetAlignmentOf<word32>();
66 unsigned int Salsa20_Policy::GetOptimalBlockSize()
const
68 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
70 return 4*BYTES_PER_ITERATION;
73 return BYTES_PER_ITERATION;
// Forward declaration of the standalone x64 keystream routine, which is
// assembled separately from the MASM text this same file emits when
// preprocessed with CRYPTOPP_GENERATE_X64_MASM (see the PROC FRAME below).
// NOTE(review): upstream wraps this declaration in extern "C"; those
// wrapper lines appear to be missing from this copy -- verify.
77 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
79 void Salsa20_OperateKeystream(byte *output,
const byte *input,
size_t iterationCount,
int rounds,
void *state);
// ---------------------------------------------------------------------------
// Keystream generation for Salsa20.  Three implementation paths share this
// region of the file:
//   1. CRYPTOPP_X64_MASM_AVAILABLE: delegate everything to the separately
//      assembled MASM routine declared above.
//   2. CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE: inline SSE2 assembly; when at
//      least 4 iterations are requested it works four 64-byte blocks in
//      parallel (one block per 32-bit lane of the xmm registers).
//   3. A portable C++ reference loop at the bottom.
// NOTE(review): this copy of the file is a mangled extraction -- original
// line numbers are fused into the text and many lines (braces, #else/#endif
// pairs, labels) are missing.  Comments below describe the intent of what
// is visible; consult upstream Crypto++ salsa.cpp for the complete text.
// ---------------------------------------------------------------------------
83 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
85 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output,
const byte *input,
size_t iterationCount)
// When CRYPTOPP_GENERATE_X64_MASM is defined, this file is preprocessed to
// emit MASM source text, so the C++ function header above is skipped.
87 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
89 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
// x64 MASM build: the whole operation is performed by the assembled routine.
90 Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
94 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
95 #ifdef CRYPTOPP_GENERATE_X64_MASM
97 Salsa20_OperateKeystream PROC FRAME
98 mov r10, [rsp + 5*8] ; state
99 alloc_stack(10*16 + 32*16 + 8)
100 save_xmm128 xmm6, 0200h
101 save_xmm128 xmm7, 0210h
102 save_xmm128 xmm8, 0220h
103 save_xmm128 xmm9, 0230h
104 save_xmm128 xmm10, 0240h
105 save_xmm128 xmm11, 0250h
106 save_xmm128 xmm12, 0260h
107 save_xmm128 xmm13, 0270h
108 save_xmm128 xmm14, 0280h
109 save_xmm128 xmm15, 0290h
// Register roles for the Win64 MASM build (arguments arrive in rcx, rdx,
// r8, r9 per the Microsoft x64 calling convention; state was loaded from
// the stack into r10 above).
112 #define REG_output rcx
113 #define REG_input rdx
114 #define REG_iterationCount r8
115 #define REG_state r10
// NOTE(review): "e9d" is not an x86-64 register name; upstream Crypto++
// defines REG_rounds as r9d here.  This looks like a transcription error
// in this copy -- verify against the original file before building.
116 #define REG_rounds e9d
117 #define REG_roundsLeft eax
118 #define REG_temp32 r11d
120 #define SSE2_WORKSPACE rsp
// GNU inline-assembly builds: on x64 the operands are the asm arguments
// %0..%5 bound in the operand lists near the end; on x86 fixed registers
// are used instead.
124 #if CRYPTOPP_BOOL_X64
125 #define REG_output %4
127 #define REG_iterationCount %2
129 #define REG_rounds %0
130 #define REG_roundsLeft eax
131 #define REG_temp32 edx
133 #define SSE2_WORKSPACE %5
137 #define REG_output edi
138 #define REG_input eax
139 #define REG_iterationCount ecx
140 #define REG_state esi
141 #define REG_rounds edx
142 #define REG_roundsLeft ebx
143 #define REG_temp32 ebp
145 #define SSE2_WORKSPACE esp + WORD_SZ
151 ".intel_syntax noprefix;"
154 void *s = m_state.data();
// Load the C++ arguments into the chosen registers (GNU inline asm path).
157 AS2( mov REG_iterationCount, iterationCount)
158 AS2( mov REG_input, input)
159 AS2( mov REG_output, output)
160 AS2( mov REG_state, s)
161 AS2( mov REG_rounds, r)
163 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
// If at least 4 blocks were requested, take the 4-blocks-in-parallel path.
166 AS2( cmp REG_iterationCount, 4)
169 #if CRYPTOPP_BOOL_X86
// SSE2_EXPAND_S(i, j): broadcast state word i*4+j to all four 32-bit lanes
// of a workspace slot (at offset +256) so each of the 4 parallel blocks
// starts from the same input word.
176 #define SSE2_EXPAND_S(i, j) \
177 ASS( pshufd xmm4, xmm##i, j, j, j, j) \
178 AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
180 AS2( movdqa xmm0, [REG_state + 0*16])
181 AS2( movdqa xmm1, [REG_state + 1*16])
182 AS2( movdqa xmm2, [REG_state + 2*16])
183 AS2( movdqa xmm3, [REG_state + 3*16])
// State words 8 and 5 hold the 64-bit block counter (see SeekToIteration):
// store the current counter into lane i of both workspace slots, then
// increment it with carry for the next lane.
199 #define SSE2_EXPAND_S85(i) \
200 AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
201 AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
202 AS2( add REG_roundsLeft, 1) \
203 AS2( adc REG_temp32, 0)
206 AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
207 AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
// Write the advanced counter back into the state for the next call.
212 AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
213 AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
// Single-block quarter-round step: b ^= rotl32(a + d, i), built from a
// left shift and a right shift combined with two xors (xmm4/xmm5 scratch).
215 #define SSE2_QUARTER_ROUND(a, b, d, i) \
216 AS2( movdqa xmm4, xmm##d) \
217 AS2( paddd xmm4, xmm##a) \
218 AS2( movdqa xmm5, xmm4) \
219 AS2( pslld xmm4, i) \
220 AS2( psrld xmm5, 32-i) \
221 AS2( pxor xmm##b, xmm4) \
222 AS2( pxor xmm##b, xmm5)
// L01..L32: one full Salsa20 quarter-round (rotations 7, 9, 13, 18)
// against the 4-way workspace, broken into single instructions so that
// two (X8) or four (X16) independent quarter-rounds can be interleaved
// below for instruction-level parallelism.  i selects whether the source
// words come from the expanded input copies at +256 (first doubleround)
// or the working copies at +0.
224 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
225 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256])
226 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
227 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
228 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
229 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
230 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
231 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
232 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
233 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
234 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
235 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
236 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
237 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
238 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
239 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
240 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
241 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
242 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B)
243 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
244 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
245 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
246 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
247 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
248 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
249 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D)
250 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
251 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
252 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
253 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C)
254 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
255 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
// Interleave two quarter-rounds using 8 xmm registers (x86 path).
257 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
258 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
259 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
260 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
261 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
262 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
263 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
264 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
265 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
266 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
267 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
268 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
269 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
270 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
271 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
272 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
273 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
274 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
275 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
276 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
277 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
278 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
279 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
280 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
281 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
282 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
283 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
284 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
285 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
286 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
287 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
288 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
289 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
// Interleave four quarter-rounds using all 16 xmm registers (x64 path).
291 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
292 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
293 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
294 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
295 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
296 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
297 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
298 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
299 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
300 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
301 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
302 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
303 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
304 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
305 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
306 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
307 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
308 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
309 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
310 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
311 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
312 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
313 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
314 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
315 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
316 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
317 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
318 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
319 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
320 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
321 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
322 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
323 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
// First doubleround: i=1 reads from the expanded input copies at +256.
325 #if CRYPTOPP_BOOL_X64
326 SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
328 SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
329 SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
331 AS2( mov REG_roundsLeft, REG_rounds)
// Subroutine: transpose the 4x4 block of 32-bit words held in xmm4..xmm7
// (one word per parallel block) back into per-block order and write it
// out via AS_XMM_OUTPUT4 (which XORs with input when input is non-null).
334 ASL(SSE2_Salsa_Output)
335 AS2( movdqa xmm0, xmm4)
336 AS2( punpckldq xmm4, xmm5)
337 AS2( movdqa xmm1, xmm6)
338 AS2( punpckldq xmm6, xmm7)
339 AS2( movdqa xmm2, xmm4)
340 AS2( punpcklqdq xmm4, xmm6)
341 AS2( punpckhqdq xmm2, xmm6)
342 AS2( punpckhdq xmm0, xmm5)
343 AS2( punpckhdq xmm1, xmm7)
344 AS2( movdqa xmm6, xmm0)
345 AS2( punpcklqdq xmm0, xmm1)
346 AS2( punpckhqdq xmm6, xmm1)
347 AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
// Main doubleround loop body: i=0 reads the working copies at +0.  The
// second X16/X8 pair covers the row-round word pattern (0,13,10,7 / ...).
351 #if CRYPTOPP_BOOL_X64
352 SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
354 SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
356 SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
357 SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
359 SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
360 SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
// Each pass through the loop performs two of the cipher's rounds.
362 AS2( sub REG_roundsLeft, 2)
// SSE2_OUTPUT_4: feed-forward -- add the saved input words (workspace+256)
// to the working words, then transpose and emit via SSE2_Salsa_Output.
// The (a,b,c,d) argument orders below undo the permuted state layout.
365 #define SSE2_OUTPUT_4(a, b, c, d) \
366 AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
367 AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
368 AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
369 AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
370 AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
371 AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
372 AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
373 AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
374 ASC( call, SSE2_Salsa_Output)
376 SSE2_OUTPUT_4(0, 13, 10, 7)
377 SSE2_OUTPUT_4(4, 1, 14, 11)
378 SSE2_OUTPUT_4(8, 5, 2, 15)
379 SSE2_OUTPUT_4(12, 9, 6, 3)
// Advance pointers past the 4 blocks just produced and loop while at
// least 4 iterations remain.
380 AS2( test REG_input, REG_input)
382 AS2( add REG_input, 12*16)
384 AS2( add REG_output, 12*16)
385 AS2( sub REG_iterationCount, 4)
386 AS2( cmp REG_iterationCount, 4)
// Single-block path: process one 64-byte block per pass.
391 AS2( sub REG_iterationCount, 1)
393 AS2( movdqa xmm0, [REG_state + 0*16])
394 AS2( movdqa xmm1, [REG_state + 1*16])
395 AS2( movdqa xmm2, [REG_state + 2*16])
396 AS2( movdqa xmm3, [REG_state + 3*16])
397 AS2( mov REG_roundsLeft, REG_rounds)
// One doubleround on the diagonally-stored state: four quarter-rounds,
// pshufd rotations to line up the other diagonal, four more, rotate back.
400 SSE2_QUARTER_ROUND(0, 1, 3, 7)
401 SSE2_QUARTER_ROUND(1, 2, 0, 9)
402 SSE2_QUARTER_ROUND(2, 3, 1, 13)
403 SSE2_QUARTER_ROUND(3, 0, 2, 18)
404 ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
405 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
406 ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
407 SSE2_QUARTER_ROUND(0, 3, 1, 7)
408 SSE2_QUARTER_ROUND(3, 2, 0, 9)
409 SSE2_QUARTER_ROUND(2, 1, 3, 13)
410 SSE2_QUARTER_ROUND(1, 0, 2, 18)
411 ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
412 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
413 ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
414 AS2( sub REG_roundsLeft, 2)
// Feed-forward: add the original input state.
417 AS2( paddd xmm0, [REG_state + 0*16])
418 AS2( paddd xmm1, [REG_state + 1*16])
419 AS2( paddd xmm2, [REG_state + 2*16])
420 AS2( paddd xmm3, [REG_state + 3*16])
// Advance the 64-bit block counter kept in state words 8 (low) / 5 (high).
422 AS2( add dword ptr [REG_state + 8*4], 1)
423 AS2( adc dword ptr [REG_state + 5*4], 0)
// Build masks (pcmpeqb gives all-ones) and blend/shuffle the diagonally-
// stored words back into standard output order before writing.
// NOTE(review): intervening lines are missing from this copy; see the
// upstream file for the complete mask/shuffle sequence.
425 AS2( pcmpeqb xmm6, xmm6)
427 ASS( pshufd xmm7, xmm6, 0, 1, 2, 3)
428 AS2( movdqa xmm4, xmm0)
429 AS2( movdqa xmm5, xmm3)
430 AS2( pand xmm0, xmm7)
431 AS2( pand xmm4, xmm6)
432 AS2( pand xmm3, xmm6)
433 AS2( pand xmm5, xmm7)
435 AS2( movdqa xmm5, xmm1)
436 AS2( pand xmm1, xmm7)
437 AS2( pand xmm5, xmm6)
439 AS2( pand xmm6, xmm2)
440 AS2( pand xmm2, xmm7)
444 AS2( movdqa xmm5, xmm4)
445 AS2( movdqa xmm6, xmm0)
446 AS3( shufpd xmm4, xmm1, 2)
447 AS3( shufpd xmm0, xmm2, 2)
448 AS3( shufpd xmm1, xmm5, 2)
449 AS3( shufpd xmm2, xmm6, 2)
// Emit the single block (XOR with input when input is non-null).
452 AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
459 ".att_syntax prefix;"
// GNU inline-asm operand and clobber lists: x64 variant binds %0..%5
// (rounds, input, iterationCount, state, output, workspace); the x86
// variant pins the fixed registers chosen above.
461 #if CRYPTOPP_BOOL_X64
462 :
"r" (m_rounds),
"r" (input),
"r" (iterationCount),
"r" (m_state.data()),
"r" (output),
"r" (workspace.m_ptr)
463 :
"%eax",
"%edx",
"memory",
"cc",
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",
"%xmm6",
"%xmm7",
"%xmm8",
"%xmm9",
"%xmm10",
"%xmm11",
"%xmm12",
"%xmm13",
"%xmm14",
"%xmm15"
465 :
"d" (m_rounds),
"a" (input),
"c" (iterationCount),
"S" (m_state.data()),
"D" (output)
// MASM epilogue: restore the callee-saved xmm registers saved in the
// prologue and release the stack frame.
470 #ifdef CRYPTOPP_GENERATE_X64_MASM
471 movdqa xmm6, [rsp + 0200h]
472 movdqa xmm7, [rsp + 0210h]
473 movdqa xmm8, [rsp + 0220h]
474 movdqa xmm9, [rsp + 0230h]
475 movdqa xmm10, [rsp + 0240h]
476 movdqa xmm11, [rsp + 0250h]
477 movdqa xmm12, [rsp + 0260h]
478 movdqa xmm13, [rsp + 0270h]
479 movdqa xmm14, [rsp + 0280h]
480 movdqa xmm15, [rsp + 0290h]
481 add rsp, 10*16 + 32*16 + 8
483 Salsa20_OperateKeystream ENDP
// Portable C++ reference implementation (also the runtime fallback when
// SSE2 is unavailable).
489 #ifndef CRYPTOPP_GENERATE_X64_MASM
491 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
493 while (iterationCount--)
// Copy the (permuted-layout) state into working variables.
495 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
496 x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
497 x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
498 x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
// Two cipher rounds per loop iteration.
500 for (
int i=m_rounds; i>0; i-=2)
// The Salsa20 quarter-round: four add-rotate-xor steps with rotation
// amounts 7, 9, 13, 18.
502 #define QUARTER_ROUND(a, b, c, d) \
503 b = b ^ rotlFixed(a + d, 7); \
504 c = c ^ rotlFixed(b + a, 9); \
505 d = d ^ rotlFixed(c + b, 13); \
506 a = a ^ rotlFixed(d + c, 18);
508 QUARTER_ROUND(x0, x4, x8, x12)
509 QUARTER_ROUND(x1, x5, x9, x13)
510 QUARTER_ROUND(x2, x6, x10, x14)
511 QUARTER_ROUND(x3, x7, x11, x15)
513 QUARTER_ROUND(x0, x13, x10, x7)
514 QUARTER_ROUND(x1, x14, x11, x4)
515 QUARTER_ROUND(x2, x15, x8, x5)
516 QUARTER_ROUND(x3, x12, x9, x6)
// Feed-forward and output: the output word order below undoes the
// permuted m_state layout established in CipherSetKey.
519 #define SALSA_OUTPUT(x) {\
520 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
521 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
522 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
523 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
524 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
525 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
526 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
527 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
528 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
529 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
530 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
531 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
532 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
533 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
534 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
535 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
537 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
538 CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
// Carry the block counter overflow from word 8 into word 5.  NOTE(review):
// the increment of m_state[5] is on a line missing from this copy.
541 if (++m_state[8] == 0)
547 void XSalsa20_Policy::CipherSetKey(
const NameValuePairs ¶ms,
const byte *key,
size_t length)
551 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
552 throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
554 GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
556 memcpy(m_key.begin()+4, m_key.begin(), 16);
559 m_state[0] = 0x61707865;
560 m_state[1] = 0x3320646e;
561 m_state[2] = 0x79622d32;
562 m_state[3] = 0x6b206574;
565 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer,
const byte *IV,
size_t length)
569 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
572 get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
574 x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
575 x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
576 x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
578 for (
int i=m_rounds; i>0; i-=2)
580 QUARTER_ROUND(x0, x4, x8, x12)
581 QUARTER_ROUND(x1, x5, x9, x13)
582 QUARTER_ROUND(x2, x6, x10, x14)
583 QUARTER_ROUND(x3, x7, x11, x15)
585 QUARTER_ROUND(x0, x13, x10, x7)
586 QUARTER_ROUND(x1, x14, x11, x4)
587 QUARTER_ROUND(x2, x15, x8, x5)
588 QUARTER_ROUND(x3, x12, x9, x6)
591 m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
592 m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
593 m_state[8] = m_state[5] = 0;
598 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM