33 #include "m4ri_config.h"
36 #include <emmintrin.h>
// Fragment of the 8-source combine routine. The signature, loop heads, the
// store back into c and the closing braces are not part of this excerpt.
// NOTE(review): presumably this is _mzd_combine8 (the name used by _MZD_COMBINE) - confirm.
// Reinterpret the destination c and the sources t1..t8 as 128-bit SSE2 values.
54 __m128i *__c = (__m128i*)c;
55 __m128i *__t1 = (__m128i*)t1;
56 __m128i *__t2 = (__m128i*)t2;
57 __m128i *__t3 = (__m128i*)t3;
58 __m128i *__t4 = (__m128i*)t4;
59 __m128i *__t5 = (__m128i*)t5;
60 __m128i *__t6 = (__m128i*)t6;
61 __m128i *__t7 = (__m128i*)t7;
62 __m128i *__t8 = (__m128i*)t8;
// eof is the address c + wide with its low four bits cleared, i.e. rounded
// down to a 16-byte boundary.
// NOTE(review): unsigned long is narrower than a pointer on LLP64 targets;
// uintptr_t would be the portable integer type for this cast - confirm intent.
63 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
// XOR the current 128-bit value of c with one value from each source,
// post-incrementing every source pointer. What happens to xmm1 afterwards and
// how __c advances is not visible here.
67 xmm1 = _mm_xor_si128(*__c, *__t1++);
68 xmm1 = _mm_xor_si128(xmm1, *__t2++);
69 xmm1 = _mm_xor_si128(xmm1, *__t3++);
70 xmm1 = _mm_xor_si128(xmm1, *__t4++);
71 xmm1 = _mm_xor_si128(xmm1, *__t5++);
72 xmm1 = _mm_xor_si128(xmm1, *__t6++);
73 xmm1 = _mm_xor_si128(xmm1, *__t7++);
74 xmm1 = _mm_xor_si128(xmm1, *__t8++);
// Replace wide by the byte length modulo 16, converted back to a word count:
// the words that do not fill a complete 16-byte chunk.
86 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
// Scalar loop over the remaining words, XORing all eight sources into c.
// NOTE(review): assumes c and t1..t8 were repositioned after the SSE2 part on
// lines missing from this excerpt - confirm.
89 for(
wi_t i = 0; i < wide; ++i) {
90 c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
// __M4RI_DD_RAWROW and wide_in are defined outside this excerpt.
93 __M4RI_DD_RAWROW(c, wide_in);
// Fragment of the 4-source combine routine. The signature, loop/switch heads,
// the store back into c and the closing braces are not part of this excerpt.
// NOTE(review): presumably this is _mzd_combine4 (the name used by _MZD_COMBINE) - confirm.
// Reinterpret the destination c and the sources t1..t4 as 128-bit SSE2 values.
106 __m128i *__c = (__m128i*)c;
107 __m128i *__t1 = (__m128i*)t1;
108 __m128i *__t2 = (__m128i*)t2;
109 __m128i *__t3 = (__m128i*)t3;
110 __m128i *__t4 = (__m128i*)t4;
// eof is the address c + wide with its low four bits cleared (16-byte boundary).
// NOTE(review): unsigned long is narrower than a pointer on LLP64 targets;
// uintptr_t would be the portable integer type for this cast - confirm intent.
111 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
// XOR the current 128-bit value of c with one value from each source,
// post-incrementing every source pointer.
115 xmm1 = _mm_xor_si128(*__c, *__t1++);
116 xmm1 = _mm_xor_si128(xmm1, *__t2++);
117 xmm1 = _mm_xor_si128(xmm1, *__t3++);
118 xmm1 = _mm_xor_si128(xmm1, *__t4++);
// Replace wide by the number of words left over after whole 16-byte chunks.
126 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
// __M4RI_DD_RAWROW and wide_in are defined outside this excerpt; the condition
// guarding this call, if any, is not visible.
129 __M4RI_DD_RAWROW(c, wide_in);
// End of the SSE2-only section (the matching conditional is not in this excerpt).
132 #endif // __M4RI_HAVE_SSE2
// n is wide divided by 8, rounded up: the pass count for the 8-way unrolled
// body below.
133 wi_t n = (wide + 7) / 8;
// Case labels interleaved with a do-loop body; each statement XORs one word of
// the four sources into *c and advances all five pointers.
// NOTE(review): looks like Duff's-device unrolling; the switch head and the
// loop condition are missing from this excerpt - confirm.
135 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
136 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
137 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
138 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
139 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
140 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
141 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
142 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
// Note that c has been post-incremented by the statements above before this call.
145 __M4RI_DD_RAWROW(c, wide_in);
// Fragment of the 3-source combine routine. The signature, loop/switch heads,
// the store back into c and the closing braces are not part of this excerpt.
// NOTE(review): function name not visible here; presumably _mzd_combine3 - confirm.
// Reinterpret the destination c and the sources t1..t3 as 128-bit SSE2 values.
158 __m128i *__c = (__m128i*)c;
159 __m128i *__t1 = (__m128i*)t1;
160 __m128i *__t2 = (__m128i*)t2;
161 __m128i *__t3 = (__m128i*)t3;
// eof is the address c + wide with its low four bits cleared (16-byte boundary).
// NOTE(review): unsigned long is narrower than a pointer on LLP64 targets;
// uintptr_t would be the portable integer type for this cast - confirm intent.
162 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
// XOR the current 128-bit value of c with one value from each source,
// post-incrementing every source pointer.
166 xmm1 = _mm_xor_si128(*__c, *__t1++);
167 xmm1 = _mm_xor_si128(xmm1, *__t2++);
168 xmm1 = _mm_xor_si128(xmm1, *__t3++);
// Replace wide by the number of words left over after whole 16-byte chunks.
175 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
// __M4RI_DD_RAWROW and wide_in are defined outside this excerpt; the condition
// guarding this call, if any, is not visible.
178 __M4RI_DD_RAWROW(c, wide_in);
// End of the SSE2-only section (the matching conditional is not in this excerpt).
181 #endif // __M4RI_HAVE_SSE2
// n is wide divided by 8, rounded up: the pass count for the 8-way unrolled
// body below.
182 wi_t n = (wide + 7) / 8;
// Case labels interleaved with a do-loop body; each statement XORs one word of
// the three sources into *c and advances all four pointers.
// NOTE(review): looks like Duff's-device unrolling; the switch head and the
// loop condition are missing from this excerpt - confirm.
184 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
185 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
186 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
187 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
188 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
189 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
190 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
191 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
// Note that c has been post-incremented by the statements above before this call.
194 __M4RI_DD_RAWROW(c, wide_in);
// Fragment of the 2-source combine routine. The signature, loop/switch heads,
// the store back into c and the closing braces are not part of this excerpt.
// NOTE(review): function name not visible here; presumably _mzd_combine2 - confirm.
// Reinterpret the destination c and the sources t1, t2 as 128-bit SSE2 values.
208 __m128i *__c = (__m128i*)c;
209 __m128i *__t1 = (__m128i*)t1;
210 __m128i *__t2 = (__m128i*)t2;
// eof is the address c + wide with its low four bits cleared (16-byte boundary).
// NOTE(review): unsigned long is narrower than a pointer on LLP64 targets;
// uintptr_t would be the portable integer type for this cast - confirm intent.
211 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
// XOR the current 128-bit value of c with one value from each source,
// post-incrementing both source pointers.
215 xmm1 = _mm_xor_si128(*__c, *__t1++);
216 xmm1 = _mm_xor_si128(xmm1, *__t2++);
// Replace wide by the number of words left over after whole 16-byte chunks.
222 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
// __M4RI_DD_RAWROW and wide_in are defined outside this excerpt; the condition
// guarding this call, if any, is not visible.
225 __M4RI_DD_RAWROW(c, wide_in);
// End of the SSE2-only section (the matching conditional is not in this excerpt).
228 #endif // __M4RI_HAVE_SSE2
// n is wide divided by 8, rounded up: the pass count for the 8-way unrolled
// body below.
229 wi_t n = (wide + 7) / 8;
// Case labels interleaved with a do-loop body; each statement XORs one word of
// the two sources into *c and advances all three pointers.
// NOTE(review): looks like Duff's-device unrolling; the switch head and the
// loop condition are missing from this excerpt - confirm.
231 case 0:
do { *c++ ^= *t1++ ^ *t2++;
232 case 7: *c++ ^= *t1++ ^ *t2++;
233 case 6: *c++ ^= *t1++ ^ *t2++;
234 case 5: *c++ ^= *t1++ ^ *t2++;
235 case 4: *c++ ^= *t1++ ^ *t2++;
236 case 3: *c++ ^= *t1++ ^ *t2++;
237 case 2: *c++ ^= *t1++ ^ *t2++;
238 case 1: *c++ ^= *t1++ ^ *t2++;
// Note that c has been post-incremented by the statements above before this call.
241 __M4RI_DD_RAWROW(c, wide_in);
// Fragment of the single-source combine routine. The signature, loop/switch
// heads, the store back into c and the closing braces are not part of this excerpt.
// NOTE(review): function name not visible here - confirm against the full file.
// Reinterpret the destination c and the source t1 as 128-bit SSE2 values.
259 __m128i *__c = (__m128i*)c;
260 __m128i *__t1 = (__m128i*)t1;
// eof is the address c + wide with its low four bits cleared (16-byte boundary).
// NOTE(review): unsigned long is narrower than a pointer on LLP64 targets;
// uintptr_t would be the portable integer type for this cast - confirm intent.
261 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
// The same XOR of *__c with the next source value appears three times.
// NOTE(review): presumably an unrolled main loop plus a remainder step; the
// surrounding loop heads and stores are missing from this excerpt - confirm.
266 xmm1 = _mm_xor_si128(*__c, *__t1++);
268 xmm1 = _mm_xor_si128(*__c, *__t1++);
273 xmm1 = _mm_xor_si128(*__c, *__t1++);
// Replace wide by the number of words left over after whole 16-byte chunks.
279 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
// __M4RI_DD_RAWROW and wide_in are defined outside this excerpt; the condition
// guarding this call, if any, is not visible.
282 __M4RI_DD_RAWROW(c, wide_in);
// End of the SSE2-only section (the matching conditional is not in this excerpt).
285 #endif // __M4RI_HAVE_SSE2
// n is wide divided by 8, rounded up: the pass count for the 8-way unrolled
// body below.
287 wi_t n = (wide + 7) / 8;
// Case labels interleaved with a do-loop body; each statement XORs one word of
// t1 into *c and advances both pointers.
// NOTE(review): looks like Duff's-device unrolling; the switch head and the
// loop condition are missing from this excerpt - confirm.
289 case 0:
do { *c++ ^= *t1++;
290 case 7: *c++ ^= *t1++;
291 case 6: *c++ ^= *t1++;
292 case 5: *c++ ^= *t1++;
293 case 4: *c++ ^= *t1++;
294 case 3: *c++ ^= *t1++;
295 case 2: *c++ ^= *t1++;
296 case 1: *c++ ^= *t1++;
// Note that c has been post-incremented by the statements above before this call.
299 __M4RI_DD_RAWROW(c, wide_in);
// Choose what _MZD_COMBINE expands to: a call to _mzd_combine8 with eight
// sources when __M4RI_M4RM_GRAY8 is defined, otherwise a call to _mzd_combine4
// with four sources.
// The macro takes no arguments; its expansion names c, t1, t2, ... and wide
// directly, so identifiers with exactly those names must be in scope wherever
// it is used.
303 #ifdef __M4RI_M4RM_GRAY8
304 #define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide)
305 #else // __M4RI_M4RM_GRAY8
306 #define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide)
307 #endif // __M4RI_M4RM_GRAY8