/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

#ifndef __SSE4_1__
#error "SSE4.1 instruction set not enabled"
#else

#include <tmmintrin.h>

/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

#define _mm_round_ps(X, M) __extension__ ({ \
  __m128 __X = (X); \
  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })

#define _mm_round_ss(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })

#define _mm_round_pd(X, M) __extension__ ({ \
  __m128d __X = (X); \
  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })

#define _mm_round_sd(X, Y, M) __extension__ ({ \
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })

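/* Usage sketch (illustrative, assumes SSE4.1 is enabled and this header is
 * included): the _mm_floor_* and _mm_ceil_* helpers expand to _mm_round_*
 * with the matching _MM_FROUND_* mode, e.g.
 *
 *   __m128 v = _mm_set_ps(3.7f, -1.2f, 0.5f, 2.0f);
 *   __m128 f = _mm_floor_ps(v);   // each lane rounded toward -infinity
 *   __m128 n = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */
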
/* SSE4 Packed Blending Intrinsics. */
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  __m128d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
                                   (((M) & 0x01) ? 2 : 0), \
                                   (((M) & 0x02) ? 3 : 1)); })

#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  __m128 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
                                  (((M) & 0x01) ? 4 : 0), \
                                  (((M) & 0x02) ? 5 : 1), \
                                  (((M) & 0x04) ? 6 : 2), \
                                  (((M) & 0x08) ? 7 : 3)); })

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
                                            (__v2df)__M);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
                                           (__v4sf)__M);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
                                               (__v16qi)__M);
}

#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
  __m128i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
                                   (((M) & 0x08) ? 11 : 3), \
                                   (((M) & 0x10) ? 12 : 4), \
                                   (((M) & 0x20) ? 13 : 5), \
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })

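/* Usage sketch (illustrative): in the _mm_blend_* forms, bit i of the
 * immediate M selects lane i from V2 (bit set) or V1 (bit clear); the
 * _mm_blendv_* forms select per lane at runtime from the high/sign bit of
 * the mask operand, e.g.
 *
 *   __m128i r = _mm_blend_epi16(a, b, 0x0F);          // low 4 words from b
 *   __m128  s = _mm_blendv_ps(x, y, mask);            // sign of mask per lane
 */
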
/* SSE4 Dword Multiply Instructions. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
}

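/* Note (informal): _mm_mullo_epi32 keeps the low 32 bits of each of the four
 * 32-bit products, while _mm_mul_epi32 multiplies only the even-numbered
 * (0 and 2) signed 32-bit lanes and returns the two full 64-bit products. */
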
/* SSE4 Floating Point Dot Product Instructions. */
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })

#define _mm_dp_pd(X, Y, M) __extension__ ({\
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })

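/* Usage sketch (illustrative): in the 8-bit immediate M, the high nibble
 * selects which lanes enter the dot product and the low nibble selects which
 * result lanes receive the sum (the others are zeroed), e.g.
 *
 *   __m128 d = _mm_dp_ps(a, b, 0xF1);   // full 4-lane dot product in lane 0
 */
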
/* SSE4 Streaming Load Hint Instruction. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_stream_load_si128 (__m128i *__V)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
}

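/* Note (informal): this maps to MOVNTDQA, a non-temporal load hint. The
 * source address must be 16-byte aligned, and the hint is mainly expected to
 * help when reading from write-combining (e.g. uncached/device) memory. */
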
/* SSE4 Packed Integer Min/Max Instructions. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
}

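/* Note (informal): these fill in the element-size/signedness combinations
 * missing from SSE2 (which only has min/max for unsigned bytes and signed
 * 16-bit words). Each returns the per-lane minimum or maximum, e.g.
 * _mm_min_epi32(a, b) yields min(a[i], b[i]) for each of the four lanes. */
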
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__                      \
                              ({ union { int __i; float __f; } __t;  \
                                 __v4sf __a = (__v4sf)(X);       \
                                 __t.__f = __a[(N) & 3];         \
                                 __t.__i;}))

/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                     (D) = __a[N]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

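/* Usage sketch (illustrative): _MM_MK_INSERTPS_NDX builds the INSERTPS
 * immediate: bits 7:6 pick the source lane of the second operand, bits 5:4
 * pick the destination lane, and the low 4 bits zero destination lanes, e.g.
 *
 *   __m128 r = _mm_insert_ps(x, y, _MM_MK_INSERTPS_NDX(2, 0, 0));
 *   // copies lane 2 of y into lane 0 of x, zeroing nothing
 */
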
/* Insert int into packed integer array at index. */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                   __a[(N) & 15] = (I);        \
                                                   __a;}))
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                    __a[(N) & 3] = (I);       \
                                                    __a;}))
#ifdef __x86_64__
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                    __a[(N) & 1] = (I);       \
                                                    __a;}))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                 (int)(unsigned char)        \
                                                     __a[(N) & 15];}))
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                  __a[(N) & 3];}))
#ifdef __x86_64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                  __a[(N) & 1];}))
#endif /* __x86_64 */

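/* Usage sketch (illustrative):
 *
 *   __m128i v = _mm_set1_epi32(7);
 *   __m128i w = _mm_insert_epi32(v, 42, 1);   // lane 1 becomes 42
 *   int     x = _mm_extract_epi32(w, 1);      // x == 42
 */
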
/* SSE4 128-bit Packed Integer Comparisons. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testz_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

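/* Usage sketch (illustrative): all three wrap PTEST. _mm_testz_si128(M, V)
 * is 1 when (M & V) is all zeros and _mm_testc_si128(M, V) is 1 when
 * (~M & V) is all zeros, e.g.
 *
 *   if (_mm_test_all_zeros(mask, vec)) { ... no masked bit of vec is set ... }
 */
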
/* SSE4 64-bit Packed Integer Comparisons. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
}

/* SSE4 Packed Integer Sign-Extension. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
}

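/* Note (informal): these widen the lowest elements of the source. For
 * example, _mm_cvtepi8_epi16 sign-extends the low 8 signed bytes of __V to
 * 8 signed 16-bit words; the upper bytes of the source are ignored. The
 * zero-extension variants below do the same with unsigned widening. */
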
/* SSE4 Packed Integer Zero-Extension. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
}

/* SSE4 Pack with Unsigned Saturation. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi32(__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}

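/* Note (informal): PACKUSDW converts the eight signed 32-bit inputs (four
 * from each operand) into eight unsigned 16-bit results, clamping each value
 * to the range [0, 65535]. */
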
/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
  __m128i __X = (X); \
  __m128i __Y = (Y); \
  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_minpos_epu16(__m128i __V)
{
  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
}

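/* Note (informal): _mm_minpos_epu16 (PHMINPOSUW) returns the smallest of the
 * eight unsigned 16-bit lanes in lane 0 of the result and that lane's index
 * in the low bits of lane 1; the remaining lanes are zeroed. */
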
/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same. */
#ifdef __SSE4_2__

/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation. */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return. */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40

/* SSE4.2 Packed Comparison Intrinsics. */
#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))

#define _mm_cmpestrm(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
#define _mm_cmpestri(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))

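/* Usage sketch (illustrative): the mode byte is built by OR-ing one flag from
 * each _SIDD_* group above. For example, to find the index of the first byte
 * of b that also appears anywhere in the byte set a:
 *
 *   int idx = _mm_cmpistri(a, b,
 *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
 *                          _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT);
 *   // idx == 16 when there is no match
 */
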
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
     __builtin_ia32_pcmpistria128((A), (B), (M))
#define _mm_cmpistrc(A, B, M) \
     __builtin_ia32_pcmpistric128((A), (B), (M))
#define _mm_cmpistro(A, B, M) \
     __builtin_ia32_pcmpistrio128((A), (B), (M))
#define _mm_cmpistrs(A, B, M) \
     __builtin_ia32_pcmpistris128((A), (B), (M))
#define _mm_cmpistrz(A, B, M) \
     __builtin_ia32_pcmpistriz128((A), (B), (M))

#define _mm_cmpestra(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
#define _mm_cmpestro(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))

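/* Note (informal): these run the same comparison but return a single flag
 * instead of the index/mask: ...c reports CF (a match was found), ...z
 * reports ZF (the second operand contained a terminating null or ran short),
 * ...s reports SF (the first operand did), ...o reports OF (bit 0 of the
 * result mask), and ...a reports CF == 0 && ZF == 0. */
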
/* SSE4.2 Compare Packed Data -- Greater Than. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}

/* SSE4.2 Accumulate CRC32. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */

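/* Usage sketch (illustrative): each call folds one value into a running
 * CRC-32C (Castagnoli polynomial) accumulator, so a buffer is processed by
 * chaining the calls, e.g.
 *
 *   unsigned int crc = 0xFFFFFFFFu;   // common, but caller-chosen, seed
 *   for (size_t i = 0; i < len; ++i)
 *     crc = _mm_crc32_u8(crc, buf[i]);
 */
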
#ifdef __POPCNT__
#include <popcntintrin.h>
#endif

#endif /* __SSE4_2__ */
#endif /* __SSE4_1__ */

#endif /* _SMMINTRIN_H */