root / lab4 / .minix-src / include / clang-3.6 / avx2intrin.h @ 14
History | View | Annotate | Download (42.2 KB)
1 |
/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
|
---|---|
2 |
*
|
3 |
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
4 |
* of this software and associated documentation files (the "Software"), to deal
|
5 |
* in the Software without restriction, including without limitation the rights
|
6 |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7 |
* copies of the Software, and to permit persons to whom the Software is
|
8 |
* furnished to do so, subject to the following conditions:
|
9 |
*
|
10 |
* The above copyright notice and this permission notice shall be included in
|
11 |
* all copies or substantial portions of the Software.
|
12 |
*
|
13 |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14 |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15 |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16 |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17 |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18 |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19 |
* THE SOFTWARE.
|
20 |
*
|
21 |
*===-----------------------------------------------------------------------===
|
22 |
*/
|
23 |
|
24 |
#ifndef __IMMINTRIN_H
|
25 |
#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." |
26 |
#endif
|
27 |
|
28 |
#ifndef __AVX2INTRIN_H
|
29 |
#define __AVX2INTRIN_H
|
30 |
|
31 |
/* SSE4 Multiple Packed Sums of Absolute Difference. */
|
32 |
#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
|
33 |
|
34 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
35 |
_mm256_abs_epi8(__m256i __a) |
36 |
{ |
37 |
return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
|
38 |
} |
39 |
|
40 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
41 |
_mm256_abs_epi16(__m256i __a) |
42 |
{ |
43 |
return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
|
44 |
} |
45 |
|
46 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
47 |
_mm256_abs_epi32(__m256i __a) |
48 |
{ |
49 |
return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
|
50 |
} |
51 |
|
52 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
53 |
_mm256_packs_epi16(__m256i __a, __m256i __b) |
54 |
{ |
55 |
return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
|
56 |
} |
57 |
|
58 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
59 |
_mm256_packs_epi32(__m256i __a, __m256i __b) |
60 |
{ |
61 |
return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
|
62 |
} |
63 |
|
64 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
65 |
_mm256_packus_epi16(__m256i __a, __m256i __b) |
66 |
{ |
67 |
return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
|
68 |
} |
69 |
|
70 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
71 |
_mm256_packus_epi32(__m256i __V1, __m256i __V2) |
72 |
{ |
73 |
return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
|
74 |
} |
75 |
|
76 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
77 |
_mm256_add_epi8(__m256i __a, __m256i __b) |
78 |
{ |
79 |
return (__m256i)((__v32qi)__a + (__v32qi)__b);
|
80 |
} |
81 |
|
82 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
83 |
_mm256_add_epi16(__m256i __a, __m256i __b) |
84 |
{ |
85 |
return (__m256i)((__v16hi)__a + (__v16hi)__b);
|
86 |
} |
87 |
|
88 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
89 |
_mm256_add_epi32(__m256i __a, __m256i __b) |
90 |
{ |
91 |
return (__m256i)((__v8si)__a + (__v8si)__b);
|
92 |
} |
93 |
|
94 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
95 |
_mm256_add_epi64(__m256i __a, __m256i __b) |
96 |
{ |
97 |
return __a + __b;
|
98 |
} |
99 |
|
100 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
101 |
_mm256_adds_epi8(__m256i __a, __m256i __b) |
102 |
{ |
103 |
return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
|
104 |
} |
105 |
|
106 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
107 |
_mm256_adds_epi16(__m256i __a, __m256i __b) |
108 |
{ |
109 |
return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
|
110 |
} |
111 |
|
112 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
113 |
_mm256_adds_epu8(__m256i __a, __m256i __b) |
114 |
{ |
115 |
return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
|
116 |
} |
117 |
|
118 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
119 |
_mm256_adds_epu16(__m256i __a, __m256i __b) |
120 |
{ |
121 |
return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
|
122 |
} |
123 |
|
124 |
#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
|
125 |
__m256i __a = (a); \ |
126 |
__m256i __b = (b); \ |
127 |
(__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); }) |
128 |
|
129 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
130 |
_mm256_and_si256(__m256i __a, __m256i __b) |
131 |
{ |
132 |
return __a & __b;
|
133 |
} |
134 |
|
135 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
136 |
_mm256_andnot_si256(__m256i __a, __m256i __b) |
137 |
{ |
138 |
return ~__a & __b;
|
139 |
} |
140 |
|
141 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
142 |
_mm256_avg_epu8(__m256i __a, __m256i __b) |
143 |
{ |
144 |
return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
|
145 |
} |
146 |
|
147 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
148 |
_mm256_avg_epu16(__m256i __a, __m256i __b) |
149 |
{ |
150 |
return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
|
151 |
} |
152 |
|
153 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
154 |
_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) |
155 |
{ |
156 |
return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
|
157 |
(__v32qi)__M); |
158 |
} |
159 |
|
160 |
#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
|
161 |
__m256i __V1 = (V1); \ |
162 |
__m256i __V2 = (V2); \ |
163 |
(__m256d)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \ |
164 |
(((M) & 0x01) ? 16 : 0), \ |
165 |
(((M) & 0x02) ? 17 : 1), \ |
166 |
(((M) & 0x04) ? 18 : 2), \ |
167 |
(((M) & 0x08) ? 19 : 3), \ |
168 |
(((M) & 0x10) ? 20 : 4), \ |
169 |
(((M) & 0x20) ? 21 : 5), \ |
170 |
(((M) & 0x40) ? 22 : 6), \ |
171 |
(((M) & 0x80) ? 23 : 7), \ |
172 |
(((M) & 0x01) ? 24 : 8), \ |
173 |
(((M) & 0x02) ? 25 : 9), \ |
174 |
(((M) & 0x04) ? 26 : 10), \ |
175 |
(((M) & 0x08) ? 27 : 11), \ |
176 |
(((M) & 0x10) ? 28 : 12), \ |
177 |
(((M) & 0x20) ? 29 : 13), \ |
178 |
(((M) & 0x40) ? 30 : 14), \ |
179 |
(((M) & 0x80) ? 31 : 15)); }) |
180 |
|
181 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
182 |
_mm256_cmpeq_epi8(__m256i __a, __m256i __b) |
183 |
{ |
184 |
return (__m256i)((__v32qi)__a == (__v32qi)__b);
|
185 |
} |
186 |
|
187 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
188 |
_mm256_cmpeq_epi16(__m256i __a, __m256i __b) |
189 |
{ |
190 |
return (__m256i)((__v16hi)__a == (__v16hi)__b);
|
191 |
} |
192 |
|
193 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
194 |
_mm256_cmpeq_epi32(__m256i __a, __m256i __b) |
195 |
{ |
196 |
return (__m256i)((__v8si)__a == (__v8si)__b);
|
197 |
} |
198 |
|
199 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
200 |
_mm256_cmpeq_epi64(__m256i __a, __m256i __b) |
201 |
{ |
202 |
return (__m256i)(__a == __b);
|
203 |
} |
204 |
|
205 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
206 |
_mm256_cmpgt_epi8(__m256i __a, __m256i __b) |
207 |
{ |
208 |
return (__m256i)((__v32qi)__a > (__v32qi)__b);
|
209 |
} |
210 |
|
211 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
212 |
_mm256_cmpgt_epi16(__m256i __a, __m256i __b) |
213 |
{ |
214 |
return (__m256i)((__v16hi)__a > (__v16hi)__b);
|
215 |
} |
216 |
|
217 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
218 |
_mm256_cmpgt_epi32(__m256i __a, __m256i __b) |
219 |
{ |
220 |
return (__m256i)((__v8si)__a > (__v8si)__b);
|
221 |
} |
222 |
|
223 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
224 |
_mm256_cmpgt_epi64(__m256i __a, __m256i __b) |
225 |
{ |
226 |
return (__m256i)(__a > __b);
|
227 |
} |
228 |
|
229 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
230 |
_mm256_hadd_epi16(__m256i __a, __m256i __b) |
231 |
{ |
232 |
return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
|
233 |
} |
234 |
|
235 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
236 |
_mm256_hadd_epi32(__m256i __a, __m256i __b) |
237 |
{ |
238 |
return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
|
239 |
} |
240 |
|
241 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
242 |
_mm256_hadds_epi16(__m256i __a, __m256i __b) |
243 |
{ |
244 |
return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
|
245 |
} |
246 |
|
247 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
248 |
_mm256_hsub_epi16(__m256i __a, __m256i __b) |
249 |
{ |
250 |
return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
|
251 |
} |
252 |
|
253 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
254 |
_mm256_hsub_epi32(__m256i __a, __m256i __b) |
255 |
{ |
256 |
return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
|
257 |
} |
258 |
|
259 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
260 |
_mm256_hsubs_epi16(__m256i __a, __m256i __b) |
261 |
{ |
262 |
return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
|
263 |
} |
264 |
|
265 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
266 |
_mm256_maddubs_epi16(__m256i __a, __m256i __b) |
267 |
{ |
268 |
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
|
269 |
} |
270 |
|
271 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
272 |
_mm256_madd_epi16(__m256i __a, __m256i __b) |
273 |
{ |
274 |
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
|
275 |
} |
276 |
|
277 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
278 |
_mm256_max_epi8(__m256i __a, __m256i __b) |
279 |
{ |
280 |
return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
|
281 |
} |
282 |
|
283 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
284 |
_mm256_max_epi16(__m256i __a, __m256i __b) |
285 |
{ |
286 |
return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
|
287 |
} |
288 |
|
289 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
290 |
_mm256_max_epi32(__m256i __a, __m256i __b) |
291 |
{ |
292 |
return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
|
293 |
} |
294 |
|
295 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
296 |
_mm256_max_epu8(__m256i __a, __m256i __b) |
297 |
{ |
298 |
return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
|
299 |
} |
300 |
|
301 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
302 |
_mm256_max_epu16(__m256i __a, __m256i __b) |
303 |
{ |
304 |
return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
|
305 |
} |
306 |
|
307 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
308 |
_mm256_max_epu32(__m256i __a, __m256i __b) |
309 |
{ |
310 |
return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
|
311 |
} |
312 |
|
313 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
314 |
_mm256_min_epi8(__m256i __a, __m256i __b) |
315 |
{ |
316 |
return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
|
317 |
} |
318 |
|
319 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
320 |
_mm256_min_epi16(__m256i __a, __m256i __b) |
321 |
{ |
322 |
return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
|
323 |
} |
324 |
|
325 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
326 |
_mm256_min_epi32(__m256i __a, __m256i __b) |
327 |
{ |
328 |
return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
|
329 |
} |
330 |
|
331 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
332 |
_mm256_min_epu8(__m256i __a, __m256i __b) |
333 |
{ |
334 |
return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
|
335 |
} |
336 |
|
337 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
338 |
_mm256_min_epu16(__m256i __a, __m256i __b) |
339 |
{ |
340 |
return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
|
341 |
} |
342 |
|
343 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
344 |
_mm256_min_epu32(__m256i __a, __m256i __b) |
345 |
{ |
346 |
return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
|
347 |
} |
348 |
|
349 |
static __inline__ int __attribute__((__always_inline__, __nodebug__)) |
350 |
_mm256_movemask_epi8(__m256i __a) |
351 |
{ |
352 |
return __builtin_ia32_pmovmskb256((__v32qi)__a);
|
353 |
} |
354 |
|
355 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
356 |
_mm256_cvtepi8_epi16(__m128i __V) |
357 |
{ |
358 |
return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
|
359 |
} |
360 |
|
361 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
362 |
_mm256_cvtepi8_epi32(__m128i __V) |
363 |
{ |
364 |
return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
|
365 |
} |
366 |
|
367 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
368 |
_mm256_cvtepi8_epi64(__m128i __V) |
369 |
{ |
370 |
return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
|
371 |
} |
372 |
|
373 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
374 |
_mm256_cvtepi16_epi32(__m128i __V) |
375 |
{ |
376 |
return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
|
377 |
} |
378 |
|
379 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
380 |
_mm256_cvtepi16_epi64(__m128i __V) |
381 |
{ |
382 |
return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
|
383 |
} |
384 |
|
385 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
386 |
_mm256_cvtepi32_epi64(__m128i __V) |
387 |
{ |
388 |
return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
|
389 |
} |
390 |
|
391 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
392 |
_mm256_cvtepu8_epi16(__m128i __V) |
393 |
{ |
394 |
return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
|
395 |
} |
396 |
|
397 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
398 |
_mm256_cvtepu8_epi32(__m128i __V) |
399 |
{ |
400 |
return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
|
401 |
} |
402 |
|
403 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
404 |
_mm256_cvtepu8_epi64(__m128i __V) |
405 |
{ |
406 |
return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
|
407 |
} |
408 |
|
409 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
410 |
_mm256_cvtepu16_epi32(__m128i __V) |
411 |
{ |
412 |
return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
|
413 |
} |
414 |
|
415 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
416 |
_mm256_cvtepu16_epi64(__m128i __V) |
417 |
{ |
418 |
return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
|
419 |
} |
420 |
|
421 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
422 |
_mm256_cvtepu32_epi64(__m128i __V) |
423 |
{ |
424 |
return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
|
425 |
} |
426 |
|
427 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
428 |
_mm256_mul_epi32(__m256i __a, __m256i __b) |
429 |
{ |
430 |
return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
|
431 |
} |
432 |
|
433 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
434 |
_mm256_mulhrs_epi16(__m256i __a, __m256i __b) |
435 |
{ |
436 |
return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
|
437 |
} |
438 |
|
439 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
440 |
_mm256_mulhi_epu16(__m256i __a, __m256i __b) |
441 |
{ |
442 |
return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
|
443 |
} |
444 |
|
445 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
446 |
_mm256_mulhi_epi16(__m256i __a, __m256i __b) |
447 |
{ |
448 |
return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
|
449 |
} |
450 |
|
451 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
452 |
_mm256_mullo_epi16(__m256i __a, __m256i __b) |
453 |
{ |
454 |
return (__m256i)((__v16hi)__a * (__v16hi)__b);
|
455 |
} |
456 |
|
457 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
458 |
_mm256_mullo_epi32 (__m256i __a, __m256i __b) |
459 |
{ |
460 |
return (__m256i)((__v8si)__a * (__v8si)__b);
|
461 |
} |
462 |
|
463 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
464 |
_mm256_mul_epu32(__m256i __a, __m256i __b) |
465 |
{ |
466 |
return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
|
467 |
} |
468 |
|
469 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
470 |
_mm256_or_si256(__m256i __a, __m256i __b) |
471 |
{ |
472 |
return __a | __b;
|
473 |
} |
474 |
|
475 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
476 |
_mm256_sad_epu8(__m256i __a, __m256i __b) |
477 |
{ |
478 |
return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
|
479 |
} |
480 |
|
481 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
482 |
_mm256_shuffle_epi8(__m256i __a, __m256i __b) |
483 |
{ |
484 |
return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
|
485 |
} |
486 |
|
487 |
#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
|
488 |
__m256i __a = (a); \ |
489 |
(__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)_mm256_set1_epi32(0), \
|
490 |
(imm) & 0x3, ((imm) & 0xc) >> 2, \ |
491 |
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ |
492 |
4 + (((imm) & 0x03) >> 0), \ |
493 |
4 + (((imm) & 0x0c) >> 2), \ |
494 |
4 + (((imm) & 0x30) >> 4), \ |
495 |
4 + (((imm) & 0xc0) >> 6)); }) |
496 |
|
497 |
#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
|
498 |
__m256i __a = (a); \ |
499 |
(__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
|
500 |
0, 1, 2, 3, \ |
501 |
4 + (((imm) & 0x03) >> 0), \ |
502 |
4 + (((imm) & 0x0c) >> 2), \ |
503 |
4 + (((imm) & 0x30) >> 4), \ |
504 |
4 + (((imm) & 0xc0) >> 6), \ |
505 |
8, 9, 10, 11, \ |
506 |
12 + (((imm) & 0x03) >> 0), \ |
507 |
12 + (((imm) & 0x0c) >> 2), \ |
508 |
12 + (((imm) & 0x30) >> 4), \ |
509 |
12 + (((imm) & 0xc0) >> 6)); }) |
510 |
|
511 |
#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
|
512 |
__m256i __a = (a); \ |
513 |
(__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
|
514 |
(imm) & 0x3,((imm) & 0xc) >> 2, \ |
515 |
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ |
516 |
4, 5, 6, 7, \ |
517 |
8 + (((imm) & 0x03) >> 0), \ |
518 |
8 + (((imm) & 0x0c) >> 2), \ |
519 |
8 + (((imm) & 0x30) >> 4), \ |
520 |
8 + (((imm) & 0xc0) >> 6), \ |
521 |
12, 13, 14, 15); }) |
522 |
|
523 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
524 |
_mm256_sign_epi8(__m256i __a, __m256i __b) |
525 |
{ |
526 |
return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
|
527 |
} |
528 |
|
529 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
530 |
_mm256_sign_epi16(__m256i __a, __m256i __b) |
531 |
{ |
532 |
return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
|
533 |
} |
534 |
|
535 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
536 |
_mm256_sign_epi32(__m256i __a, __m256i __b) |
537 |
{ |
538 |
return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
|
539 |
} |
540 |
|
541 |
#define _mm256_slli_si256(a, count) __extension__ ({ \
|
542 |
__m256i __a = (a); \ |
543 |
(__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })
|
544 |
|
545 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
546 |
_mm256_slli_epi16(__m256i __a, int __count)
|
547 |
{ |
548 |
return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
|
549 |
} |
550 |
|
551 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
552 |
_mm256_sll_epi16(__m256i __a, __m128i __count) |
553 |
{ |
554 |
return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
|
555 |
} |
556 |
|
557 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
558 |
_mm256_slli_epi32(__m256i __a, int __count)
|
559 |
{ |
560 |
return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
|
561 |
} |
562 |
|
563 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
564 |
_mm256_sll_epi32(__m256i __a, __m128i __count) |
565 |
{ |
566 |
return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
|
567 |
} |
568 |
|
569 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
570 |
_mm256_slli_epi64(__m256i __a, int __count)
|
571 |
{ |
572 |
return __builtin_ia32_psllqi256(__a, __count);
|
573 |
} |
574 |
|
575 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
576 |
_mm256_sll_epi64(__m256i __a, __m128i __count) |
577 |
{ |
578 |
return __builtin_ia32_psllq256(__a, __count);
|
579 |
} |
580 |
|
581 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
582 |
_mm256_srai_epi16(__m256i __a, int __count)
|
583 |
{ |
584 |
return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
|
585 |
} |
586 |
|
587 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
588 |
_mm256_sra_epi16(__m256i __a, __m128i __count) |
589 |
{ |
590 |
return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
|
591 |
} |
592 |
|
593 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
594 |
_mm256_srai_epi32(__m256i __a, int __count)
|
595 |
{ |
596 |
return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
|
597 |
} |
598 |
|
599 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
600 |
_mm256_sra_epi32(__m256i __a, __m128i __count) |
601 |
{ |
602 |
return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
|
603 |
} |
604 |
|
605 |
#define _mm256_srli_si256(a, count) __extension__ ({ \
|
606 |
__m256i __a = (a); \ |
607 |
(__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })
|
608 |
|
609 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
610 |
_mm256_srli_epi16(__m256i __a, int __count)
|
611 |
{ |
612 |
return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
|
613 |
} |
614 |
|
615 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
616 |
_mm256_srl_epi16(__m256i __a, __m128i __count) |
617 |
{ |
618 |
return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
|
619 |
} |
620 |
|
621 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
622 |
_mm256_srli_epi32(__m256i __a, int __count)
|
623 |
{ |
624 |
return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
|
625 |
} |
626 |
|
627 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
628 |
_mm256_srl_epi32(__m256i __a, __m128i __count) |
629 |
{ |
630 |
return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
|
631 |
} |
632 |
|
633 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
634 |
_mm256_srli_epi64(__m256i __a, int __count)
|
635 |
{ |
636 |
return __builtin_ia32_psrlqi256(__a, __count);
|
637 |
} |
638 |
|
639 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
640 |
_mm256_srl_epi64(__m256i __a, __m128i __count) |
641 |
{ |
642 |
return __builtin_ia32_psrlq256(__a, __count);
|
643 |
} |
644 |
|
645 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
646 |
_mm256_sub_epi8(__m256i __a, __m256i __b) |
647 |
{ |
648 |
return (__m256i)((__v32qi)__a - (__v32qi)__b);
|
649 |
} |
650 |
|
651 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
652 |
_mm256_sub_epi16(__m256i __a, __m256i __b) |
653 |
{ |
654 |
return (__m256i)((__v16hi)__a - (__v16hi)__b);
|
655 |
} |
656 |
|
657 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
658 |
_mm256_sub_epi32(__m256i __a, __m256i __b) |
659 |
{ |
660 |
return (__m256i)((__v8si)__a - (__v8si)__b);
|
661 |
} |
662 |
|
663 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
664 |
_mm256_sub_epi64(__m256i __a, __m256i __b) |
665 |
{ |
666 |
return __a - __b;
|
667 |
} |
668 |
|
669 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
670 |
_mm256_subs_epi8(__m256i __a, __m256i __b) |
671 |
{ |
672 |
return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
|
673 |
} |
674 |
|
675 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
676 |
_mm256_subs_epi16(__m256i __a, __m256i __b) |
677 |
{ |
678 |
return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
|
679 |
} |
680 |
|
681 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
682 |
_mm256_subs_epu8(__m256i __a, __m256i __b) |
683 |
{ |
684 |
return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
|
685 |
} |
686 |
|
687 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
688 |
_mm256_subs_epu16(__m256i __a, __m256i __b) |
689 |
{ |
690 |
return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
|
691 |
} |
692 |
|
693 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
694 |
_mm256_unpackhi_epi8(__m256i __a, __m256i __b) |
695 |
{ |
696 |
return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); |
697 |
} |
698 |
|
699 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
700 |
_mm256_unpackhi_epi16(__m256i __a, __m256i __b) |
701 |
{ |
702 |
return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); |
703 |
} |
704 |
|
705 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
706 |
_mm256_unpackhi_epi32(__m256i __a, __m256i __b) |
707 |
{ |
708 |
return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); |
709 |
} |
710 |
|
711 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
712 |
_mm256_unpackhi_epi64(__m256i __a, __m256i __b) |
713 |
{ |
714 |
return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3); |
715 |
} |
716 |
|
717 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
718 |
_mm256_unpacklo_epi8(__m256i __a, __m256i __b) |
719 |
{ |
720 |
return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); |
721 |
} |
722 |
|
723 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
724 |
_mm256_unpacklo_epi16(__m256i __a, __m256i __b) |
725 |
{ |
726 |
return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); |
727 |
} |
728 |
|
729 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
730 |
_mm256_unpacklo_epi32(__m256i __a, __m256i __b) |
731 |
{ |
732 |
return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); |
733 |
} |
734 |
|
735 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
736 |
_mm256_unpacklo_epi64(__m256i __a, __m256i __b) |
737 |
{ |
738 |
return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2); |
739 |
} |
740 |
|
741 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
742 |
_mm256_xor_si256(__m256i __a, __m256i __b) |
743 |
{ |
744 |
return __a ^ __b;
|
745 |
} |
746 |
|
747 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
748 |
_mm256_stream_load_si256(__m256i *__V) |
749 |
{ |
750 |
return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__V);
|
751 |
} |
752 |
|
753 |
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
|
754 |
_mm_broadcastss_ps(__m128 __X) |
755 |
{ |
756 |
return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
|
757 |
} |
758 |
|
759 |
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
|
760 |
_mm256_broadcastss_ps(__m128 __X) |
761 |
{ |
762 |
return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
|
763 |
} |
764 |
|
765 |
static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
|
766 |
_mm256_broadcastsd_pd(__m128d __X) |
767 |
{ |
768 |
return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
|
769 |
} |
770 |
|
771 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
772 |
_mm256_broadcastsi128_si256(__m128i __X) |
773 |
{ |
774 |
return (__m256i)__builtin_ia32_vbroadcastsi256(__X);
|
775 |
} |
776 |
|
777 |
#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
|
778 |
__m128i __V1 = (V1); \ |
779 |
__m128i __V2 = (V2); \ |
780 |
(__m128i)__builtin_shufflevector((__v4si)__V1, (__v4si)__V2, \ |
781 |
(((M) & 0x01) ? 4 : 0), \ |
782 |
(((M) & 0x02) ? 5 : 1), \ |
783 |
(((M) & 0x04) ? 6 : 2), \ |
784 |
(((M) & 0x08) ? 7 : 3)); }) |
785 |
|
786 |
#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
|
787 |
__m256i __V1 = (V1); \ |
788 |
__m256i __V2 = (V2); \ |
789 |
(__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \ |
790 |
(((M) & 0x01) ? 8 : 0), \ |
791 |
(((M) & 0x02) ? 9 : 1), \ |
792 |
(((M) & 0x04) ? 10 : 2), \ |
793 |
(((M) & 0x08) ? 11 : 3), \ |
794 |
(((M) & 0x10) ? 12 : 4), \ |
795 |
(((M) & 0x20) ? 13 : 5), \ |
796 |
(((M) & 0x40) ? 14 : 6), \ |
797 |
(((M) & 0x80) ? 15 : 7)); }) |
798 |
|
799 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
800 |
_mm256_broadcastb_epi8(__m128i __X) |
801 |
{ |
802 |
return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
|
803 |
} |
804 |
|
805 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
806 |
_mm256_broadcastw_epi16(__m128i __X) |
807 |
{ |
808 |
return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
|
809 |
} |
810 |
|
811 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
812 |
_mm256_broadcastd_epi32(__m128i __X) |
813 |
{ |
814 |
return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
|
815 |
} |
816 |
|
817 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
818 |
_mm256_broadcastq_epi64(__m128i __X) |
819 |
{ |
820 |
return (__m256i)__builtin_ia32_pbroadcastq256(__X);
|
821 |
} |
822 |
|
823 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
824 |
_mm_broadcastb_epi8(__m128i __X) |
825 |
{ |
826 |
return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
|
827 |
} |
828 |
|
829 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
830 |
_mm_broadcastw_epi16(__m128i __X) |
831 |
{ |
832 |
return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
|
833 |
} |
834 |
|
835 |
|
836 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
837 |
_mm_broadcastd_epi32(__m128i __X) |
838 |
{ |
839 |
return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
|
840 |
} |
841 |
|
842 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
843 |
_mm_broadcastq_epi64(__m128i __X) |
844 |
{ |
845 |
return (__m128i)__builtin_ia32_pbroadcastq128(__X);
|
846 |
} |
847 |
|
848 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
849 |
_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) |
850 |
{ |
851 |
return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
|
852 |
} |
853 |
|
854 |
#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
|
855 |
__m256d __V = (V); \ |
856 |
(__m256d)__builtin_shufflevector((__v4df)__V, (__v4df) _mm256_setzero_pd(), \ |
857 |
(M) & 0x3, ((M) & 0xc) >> 2, \ |
858 |
((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); }) |
859 |
|
860 |
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
|
861 |
_mm256_permutevar8x32_ps(__m256 __a, __m256 __b) |
862 |
{ |
863 |
return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8sf)__b);
|
864 |
} |
865 |
|
866 |
#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
|
867 |
__m256i __V = (V); \ |
868 |
(__m256i)__builtin_shufflevector((__v4di)__V, (__v4di) _mm256_setzero_si256(), \ |
869 |
(M) & 0x3, ((M) & 0xc) >> 2, \ |
870 |
((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); }) |
871 |
|
872 |
#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
|
873 |
__m256i __V1 = (V1); \ |
874 |
__m256i __V2 = (V2); \ |
875 |
(__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); }) |
876 |
|
877 |
#define _mm256_extracti128_si256(A, O) __extension__ ({ \
|
878 |
__m256i __A = (A); \ |
879 |
(__m128i)__builtin_ia32_extract128i256(__A, (O)); }) |
880 |
|
881 |
#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \
|
882 |
__m256i __V1 = (V1); \ |
883 |
__m128i __V2 = (V2); \ |
884 |
(__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); }) |
885 |
|
886 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
887 |
_mm256_maskload_epi32(int const *__X, __m256i __M) |
888 |
{ |
889 |
return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); |
890 |
} |
891 |
|
892 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
893 |
_mm256_maskload_epi64(long long const *__X, __m256i __M) |
894 |
{ |
895 |
return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M); |
896 |
} |
897 |
|
898 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
899 |
_mm_maskload_epi32(int const *__X, __m128i __M) |
900 |
{ |
901 |
return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); |
902 |
} |
903 |
|
904 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
905 |
_mm_maskload_epi64(long long const *__X, __m128i __M) |
906 |
{ |
907 |
return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); |
908 |
} |
909 |
|
910 |
static __inline__ void __attribute__((__always_inline__, __nodebug__)) |
911 |
_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
|
912 |
{ |
913 |
__builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); |
914 |
} |
915 |
|
916 |
static __inline__ void __attribute__((__always_inline__, __nodebug__)) |
917 |
_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) |
918 |
{ |
919 |
__builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y); |
920 |
} |
921 |
|
922 |
static __inline__ void __attribute__((__always_inline__, __nodebug__)) |
923 |
_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
|
924 |
{ |
925 |
__builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); |
926 |
} |
927 |
|
928 |
static __inline__ void __attribute__((__always_inline__, __nodebug__)) |
929 |
_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) |
930 |
{ |
931 |
__builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y); |
932 |
} |
933 |
|
934 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
935 |
_mm256_sllv_epi32(__m256i __X, __m256i __Y) |
936 |
{ |
937 |
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
|
938 |
} |
939 |
|
940 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
941 |
_mm_sllv_epi32(__m128i __X, __m128i __Y) |
942 |
{ |
943 |
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
|
944 |
} |
945 |
|
946 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
947 |
_mm256_sllv_epi64(__m256i __X, __m256i __Y) |
948 |
{ |
949 |
return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
|
950 |
} |
951 |
|
952 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
953 |
_mm_sllv_epi64(__m128i __X, __m128i __Y) |
954 |
{ |
955 |
return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
|
956 |
} |
957 |
|
958 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
959 |
_mm256_srav_epi32(__m256i __X, __m256i __Y) |
960 |
{ |
961 |
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
|
962 |
} |
963 |
|
964 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
965 |
_mm_srav_epi32(__m128i __X, __m128i __Y) |
966 |
{ |
967 |
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
|
968 |
} |
969 |
|
970 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
971 |
_mm256_srlv_epi32(__m256i __X, __m256i __Y) |
972 |
{ |
973 |
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
|
974 |
} |
975 |
|
976 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
977 |
_mm_srlv_epi32(__m128i __X, __m128i __Y) |
978 |
{ |
979 |
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
|
980 |
} |
981 |
|
982 |
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
|
983 |
_mm256_srlv_epi64(__m256i __X, __m256i __Y) |
984 |
{ |
985 |
return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
|
986 |
} |
987 |
|
988 |
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
989 |
_mm_srlv_epi64(__m128i __X, __m128i __Y) |
990 |
{ |
991 |
return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
|
992 |
} |
993 |
|
994 |
#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
|
995 |
__m128d __a = (a); \ |
996 |
double const *__m = (m); \ |
997 |
__m128i __i = (i); \ |
998 |
__m128d __mask = (mask); \ |
999 |
(__m128d)__builtin_ia32_gatherd_pd((__v2df)__a, (const __v2df *)__m, \
|
1000 |
(__v4si)__i, (__v2df)__mask, (s)); }) |
1001 |
|
1002 |
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
|
1003 |
__m256d __a = (a); \ |
1004 |
double const *__m = (m); \ |
1005 |
__m128i __i = (i); \ |
1006 |
__m256d __mask = (mask); \ |
1007 |
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)__a, (const __v4df *)__m, \
|
1008 |
(__v4si)__i, (__v4df)__mask, (s)); }) |
1009 |
|
1010 |
#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
|
1011 |
__m128d __a = (a); \ |
1012 |
double const *__m = (m); \ |
1013 |
__m128i __i = (i); \ |
1014 |
__m128d __mask = (mask); \ |
1015 |
(__m128d)__builtin_ia32_gatherq_pd((__v2df)__a, (const __v2df *)__m, \
|
1016 |
(__v2di)__i, (__v2df)__mask, (s)); }) |
1017 |
|
1018 |
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
|
1019 |
__m256d __a = (a); \ |
1020 |
double const *__m = (m); \ |
1021 |
__m256i __i = (i); \ |
1022 |
__m256d __mask = (mask); \ |
1023 |
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)__a, (const __v4df *)__m, \
|
1024 |
(__v4di)__i, (__v4df)__mask, (s)); }) |
1025 |
|
1026 |
#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
|
1027 |
__m128 __a = (a); \ |
1028 |
float const *__m = (m); \ |
1029 |
__m128i __i = (i); \ |
1030 |
__m128 __mask = (mask); \ |
1031 |
(__m128)__builtin_ia32_gatherd_ps((__v4sf)__a, (const __v4sf *)__m, \
|
1032 |
(__v4si)__i, (__v4sf)__mask, (s)); }) |
1033 |
|
1034 |
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
|
1035 |
__m256 __a = (a); \ |
1036 |
float const *__m = (m); \ |
1037 |
__m256i __i = (i); \ |
1038 |
__m256 __mask = (mask); \ |
1039 |
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)__a, (const __v8sf *)__m, \
|
1040 |
(__v8si)__i, (__v8sf)__mask, (s)); }) |
1041 |
|
1042 |
#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
|
1043 |
__m128 __a = (a); \ |
1044 |
float const *__m = (m); \ |
1045 |
__m128i __i = (i); \ |
1046 |
__m128 __mask = (mask); \ |
1047 |
(__m128)__builtin_ia32_gatherq_ps((__v4sf)__a, (const __v4sf *)__m, \
|
1048 |
(__v2di)__i, (__v4sf)__mask, (s)); }) |
1049 |
|
1050 |
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
|
1051 |
__m128 __a = (a); \ |
1052 |
float const *__m = (m); \ |
1053 |
__m256i __i = (i); \ |
1054 |
__m128 __mask = (mask); \ |
1055 |
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)__a, (const __v4sf *)__m, \
|
1056 |
(__v4di)__i, (__v4sf)__mask, (s)); }) |
1057 |
|
1058 |
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
|
1059 |
__m128i __a = (a); \ |
1060 |
int const *__m = (m); \ |
1061 |
__m128i __i = (i); \ |
1062 |
__m128i __mask = (mask); \ |
1063 |
(__m128i)__builtin_ia32_gatherd_d((__v4si)__a, (const __v4si *)__m, \
|
1064 |
(__v4si)__i, (__v4si)__mask, (s)); }) |
1065 |
|
1066 |
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
|
1067 |
__m256i __a = (a); \ |
1068 |
int const *__m = (m); \ |
1069 |
__m256i __i = (i); \ |
1070 |
__m256i __mask = (mask); \ |
1071 |
(__m256i)__builtin_ia32_gatherd_d256((__v8si)__a, (const __v8si *)__m, \
|
1072 |
(__v8si)__i, (__v8si)__mask, (s)); }) |
1073 |
|
1074 |
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
|
1075 |
__m128i __a = (a); \ |
1076 |
int const *__m = (m); \ |
1077 |
__m128i __i = (i); \ |
1078 |
__m128i __mask = (mask); \ |
1079 |
(__m128i)__builtin_ia32_gatherq_d((__v4si)__a, (const __v4si *)__m, \
|
1080 |
(__v2di)__i, (__v4si)__mask, (s)); }) |
1081 |
|
1082 |
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
|
1083 |
__m128i __a = (a); \ |
1084 |
int const *__m = (m); \ |
1085 |
__m256i __i = (i); \ |
1086 |
__m128i __mask = (mask); \ |
1087 |
(__m128i)__builtin_ia32_gatherq_d256((__v4si)__a, (const __v4si *)__m, \
|
1088 |
(__v4di)__i, (__v4si)__mask, (s)); }) |
1089 |
|
1090 |
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
|
1091 |
__m128i __a = (a); \ |
1092 |
long long const *__m = (m); \ |
1093 |
__m128i __i = (i); \ |
1094 |
__m128i __mask = (mask); \ |
1095 |
(__m128i)__builtin_ia32_gatherd_q((__v2di)__a, (const __v2di *)__m, \
|
1096 |
(__v4si)__i, (__v2di)__mask, (s)); }) |
1097 |
|
1098 |
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
|
1099 |
__m256i __a = (a); \ |
1100 |
long long const *__m = (m); \ |
1101 |
__m128i __i = (i); \ |
1102 |
__m256i __mask = (mask); \ |
1103 |
(__m256i)__builtin_ia32_gatherd_q256((__v4di)__a, (const __v4di *)__m, \
|
1104 |
(__v4si)__i, (__v4di)__mask, (s)); }) |
1105 |
|
1106 |
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
|
1107 |
__m128i __a = (a); \ |
1108 |
long long const *__m = (m); \ |
1109 |
__m128i __i = (i); \ |
1110 |
__m128i __mask = (mask); \ |
1111 |
(__m128i)__builtin_ia32_gatherq_q((__v2di)__a, (const __v2di *)__m, \
|
1112 |
(__v2di)__i, (__v2di)__mask, (s)); }) |
1113 |
|
1114 |
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
|
1115 |
__m256i __a = (a); \ |
1116 |
long long const *__m = (m); \ |
1117 |
__m256i __i = (i); \ |
1118 |
__m256i __mask = (mask); \ |
1119 |
(__m256i)__builtin_ia32_gatherq_q256((__v4di)__a, (const __v4di *)__m, \
|
1120 |
(__v4di)__i, (__v4di)__mask, (s)); }) |
1121 |
|
1122 |
#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
|
1123 |
double const *__m = (m); \ |
1124 |
__m128i __i = (i); \ |
1125 |
(__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_setzero_pd(), \ |
1126 |
(const __v2df *)__m, (__v4si)__i, \
|
1127 |
(__v2df)_mm_set1_pd((double)(long long int)-1), (s)); }) |
1128 |
|
1129 |
#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
|
1130 |
double const *__m = (m); \ |
1131 |
__m128i __i = (i); \ |
1132 |
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_setzero_pd(), \ |
1133 |
(const __v4df *)__m, (__v4si)__i, \
|
1134 |
(__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); }) |
1135 |
|
1136 |
#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
|
1137 |
double const *__m = (m); \ |
1138 |
__m128i __i = (i); \ |
1139 |
(__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_setzero_pd(), \ |
1140 |
(const __v2df *)__m, (__v2di)__i, \
|
1141 |
(__v2df)_mm_set1_pd((double)(long long int)-1), (s)); }) |
1142 |
|
1143 |
#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
|
1144 |
double const *__m = (m); \ |
1145 |
__m256i __i = (i); \ |
1146 |
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_setzero_pd(), \ |
1147 |
(const __v4df *)__m, (__v4di)__i, \
|
1148 |
(__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); }) |
1149 |
|
1150 |
#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
|
1151 |
float const *__m = (m); \ |
1152 |
__m128i __i = (i); \ |
1153 |
(__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_setzero_ps(), \ |
1154 |
(const __v4sf *)__m, (__v4si)__i, \
|
1155 |
(__v4sf)_mm_set1_ps((float)(int)-1), (s)); }) |
1156 |
|
1157 |
#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
|
1158 |
float const *__m = (m); \ |
1159 |
__m256i __i = (i); \ |
1160 |
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_setzero_ps(), \ |
1161 |
(const __v8sf *)__m, (__v8si)__i, \
|
1162 |
(__v8sf)_mm256_set1_ps((float)(int)-1), (s)); }) |
1163 |
|
1164 |
#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
|
1165 |
float const *__m = (m); \ |
1166 |
__m128i __i = (i); \ |
1167 |
(__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_setzero_ps(), \ |
1168 |
(const __v4sf *)__m, (__v2di)__i, \
|
1169 |
(__v4sf)_mm_set1_ps((float)(int)-1), (s)); }) |
1170 |
|
1171 |
#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
|
1172 |
float const *__m = (m); \ |
1173 |
__m256i __i = (i); \ |
1174 |
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_setzero_ps(), \ |
1175 |
(const __v4sf *)__m, (__v4di)__i, \
|
1176 |
(__v4sf)_mm_set1_ps((float)(int)-1), (s)); }) |
1177 |
|
1178 |
#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
|
1179 |
int const *__m = (m); \ |
1180 |
__m128i __i = (i); \ |
1181 |
(__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_setzero_si128(), \ |
1182 |
(const __v4si *)__m, (__v4si)__i, \
|
1183 |
(__v4si)_mm_set1_epi32(-1), (s)); })
|
1184 |
|
1185 |
#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
|
1186 |
int const *__m = (m); \ |
1187 |
__m256i __i = (i); \ |
1188 |
(__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_setzero_si256(), \ |
1189 |
(const __v8si *)__m, (__v8si)__i, \
|
1190 |
(__v8si)_mm256_set1_epi32(-1), (s)); })
|
1191 |
|
1192 |
#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
|
1193 |
int const *__m = (m); \ |
1194 |
__m128i __i = (i); \ |
1195 |
(__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_setzero_si128(), \ |
1196 |
(const __v4si *)__m, (__v2di)__i, \
|
1197 |
(__v4si)_mm_set1_epi32(-1), (s)); })
|
1198 |
|
1199 |
#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
|
1200 |
int const *__m = (m); \ |
1201 |
__m256i __i = (i); \ |
1202 |
(__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_setzero_si128(), \ |
1203 |
(const __v4si *)__m, (__v4di)__i, \
|
1204 |
(__v4si)_mm_set1_epi32(-1), (s)); })
|
1205 |
|
1206 |
#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
|
1207 |
long long const *__m = (m); \ |
1208 |
__m128i __i = (i); \ |
1209 |
(__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_setzero_si128(), \ |
1210 |
(const __v2di *)__m, (__v4si)__i, \
|
1211 |
(__v2di)_mm_set1_epi64x(-1), (s)); })
|
1212 |
|
1213 |
#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
|
1214 |
long long const *__m = (m); \ |
1215 |
__m128i __i = (i); \ |
1216 |
(__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_setzero_si256(), \ |
1217 |
(const __v4di *)__m, (__v4si)__i, \
|
1218 |
(__v4di)_mm256_set1_epi64x(-1), (s)); })
|
1219 |
|
1220 |
#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
|
1221 |
long long const *__m = (m); \ |
1222 |
__m128i __i = (i); \ |
1223 |
(__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_setzero_si128(), \ |
1224 |
(const __v2di *)__m, (__v2di)__i, \
|
1225 |
(__v2di)_mm_set1_epi64x(-1), (s)); })
|
1226 |
|
1227 |
#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
|
1228 |
long long const *__m = (m); \ |
1229 |
__m256i __i = (i); \ |
1230 |
(__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_setzero_si256(), \ |
1231 |
(const __v4di *)__m, (__v4di)__i, \
|
1232 |
(__v4di)_mm256_set1_epi64x(-1), (s)); })
|
1233 |
|
1234 |
#endif /* __AVX2INTRIN_H */ |