/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

#ifndef __SSE4_1__
#error "SSE4.1 instruction set not enabled"
#else

#include <tmmintrin.h>

/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

#define _mm_round_ps(X, M) __extension__ ({ \
  __m128 __X = (X); \
  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })

#define _mm_round_ss(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })

#define _mm_round_pd(X, M) __extension__ ({ \
  __m128d __X = (X); \
  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })

#define _mm_round_sd(X, Y, M) __extension__ ({ \
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })

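/* Illustrative usage sketch, not part of the original clang header: the
 * rounding-control and exception-control bits above are OR-ed into a single
 * immediate, so truncation without precision exceptions combines
 * _MM_FROUND_TO_ZERO with _MM_FROUND_NO_EXC (the helper name below is ours). */
static __inline__ __m128
__smmintrin_example_trunc_noexc(__m128 __x)
{
  return _mm_round_ps(__x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
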
/* SSE4 Packed Blending Intrinsics. */
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  __m128d __V1 = (V1); \
  __m128d __V2 = (V2); \
  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
                                   (((M) & 0x01) ? 2 : 0), \
                                   (((M) & 0x02) ? 3 : 1)); })

#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  __m128 __V1 = (V1); \
  __m128 __V2 = (V2); \
  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
                                  (((M) & 0x01) ? 4 : 0), \
                                  (((M) & 0x02) ? 5 : 1), \
                                  (((M) & 0x04) ? 6 : 2), \
                                  (((M) & 0x08) ? 7 : 3)); })

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
                                            (__v2df)__M);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
                                           (__v4sf)__M);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
                                               (__v16qi)__M);
}

#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
  __m128i __V1 = (V1); \
  __m128i __V2 = (V2); \
  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
                                   (((M) & 0x08) ? 11 : 3), \
                                   (((M) & 0x10) ? 12 : 4), \
                                   (((M) & 0x20) ? 13 : 5), \
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })

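/* Illustrative usage sketch, not part of the original clang header: in the
 * blend immediates above a set mask bit selects the corresponding lane of the
 * second operand, so mask 0xF0 keeps 16-bit lanes 0-3 of __a and takes lanes
 * 4-7 from __b (the helper name below is ours). */
static __inline__ __m128i
__smmintrin_example_blend_halves(__m128i __a, __m128i __b)
{
  return _mm_blend_epi16(__a, __b, 0xF0);
}
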
/* SSE4 Dword Multiply Instructions. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
}

/* SSE4 Floating Point Dot Product Instructions. */
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
  __m128 __X = (X); \
  __m128 __Y = (Y); \
  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })

#define _mm_dp_pd(X, Y, M) __extension__ ({\
  __m128d __X = (X); \
  __m128d __Y = (Y); \
  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })

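/* Illustrative usage sketch, not part of the original clang header: in the
 * _mm_dp_ps immediate, the high nibble selects which element products are
 * summed and the low nibble selects which result lanes receive the sum, so
 * 0xFF computes a full 4-element dot product broadcast to every lane (the
 * helper name below is ours; _mm_cvtss_f32 comes from <xmmintrin.h>). */
static __inline__ float
__smmintrin_example_dot4(__m128 __x, __m128 __y)
{
  return _mm_cvtss_f32(_mm_dp_ps(__x, __y, 0xFF));
}
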
/* SSE4 Streaming Load Hint Instruction. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_stream_load_si128 (__m128i *__V)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
}

/* SSE4 Packed Integer Min/Max Instructions. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
}

/* SSE4 Insertion and Extraction from XMM Register Instructions. */
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__                      \
                              ({ union { int __i; float __f; } __t;  \
                                 __v4sf __a = (__v4sf)(X);       \
                                 __t.__f = __a[(N) & 3];         \
                                 __t.__i;}))

/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                    (D) = __a[N]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

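/* Illustrative usage sketch, not part of the original clang header:
 * _MM_MK_INSERTPS_NDX packs the source-element index, destination-element
 * index, and zero mask into the insertps immediate, so copying element 2 of
 * __b into element 0 of __a with no lanes zeroed looks like the helper below
 * (its name is ours, not an intrinsic). */
static __inline__ __m128
__smmintrin_example_insert(__m128 __a, __m128 __b)
{
  return _mm_insert_ps(__a, __b, _MM_MK_INSERTPS_NDX(2, 0, 0x0));
}
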
/* Insert int into packed integer array at index.  */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                   __a[(N) & 15] = (I);        \
                                                   __a;}))
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                    __a[(N) & 3] = (I);       \
                                                    __a;}))
#ifdef __x86_64__
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                    __a[(N) & 1] = (I);       \
                                                    __a;}))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
                                                 (int)(unsigned char)        \
                                                     __a[(N) & 15];}))
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
                                                  __a[(N) & 3];}))
#ifdef __x86_64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
                                                  __a[(N) & 1];}))
#endif /* __x86_64 */

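/* Illustrative usage sketch, not part of the original clang header: because
 * _mm_extract_epi8 zero-extends the selected byte, a lane holding 0xFF comes
 * back as the int 255 rather than -1 (the helper name below is ours). */
static __inline__ int
__smmintrin_example_extract_byte7(__m128i __v)
{
  return _mm_extract_epi8(__v, 7);  /* always in the range 0..255 */
}
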
/* SSE4 128-bit Packed Integer Comparisons. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testz_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_si128(__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

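/* Illustrative usage sketch, not part of the original clang header:
 * _mm_test_all_zeros(M, V) is nonzero exactly when (M & V) has no bits set,
 * so passing the same register for both operands tests whether that register
 * is entirely zero (the helper name below is ours). */
static __inline__ int
__smmintrin_example_is_zero(__m128i __v)
{
  return _mm_test_all_zeros(__v, __v);
}
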
/* SSE4 64-bit Packed Integer Comparisons. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
}

/* SSE4 Packed Integer Sign-Extension. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
}

/* SSE4 Packed Integer Zero-Extension. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi16(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi32(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu32_epi64(__m128i __V)
{
  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
}

/* SSE4 Pack with Unsigned Saturation. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi32(__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}

/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
  __m128i __X = (X); \
  __m128i __Y = (Y); \
  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_minpos_epu16(__m128i __V)
{
  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
}

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same.  */
#ifdef __SSE4_2__

/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation. */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return. */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40

/* SSE4.2 Packed Comparison Intrinsics. */
#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))

#define _mm_cmpestrm(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
#define _mm_cmpestri(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))

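/* Illustrative usage sketch, not part of the original clang header: the
 * _SIDD_* flags above are OR-ed into the immediate of the string-compare
 * intrinsics; the combination below searches 16 unsigned bytes of __hay for
 * the first byte equal to any byte of __needle and returns the lowest
 * matching index (the helper name below is ours). */
static __inline__ int
__smmintrin_example_find_any(__m128i __needle, __m128i __hay)
{
  return _mm_cmpistri(__needle, __hay,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                      _SIDD_LEAST_SIGNIFICANT);
}
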
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
     __builtin_ia32_pcmpistria128((A), (B), (M))
#define _mm_cmpistrc(A, B, M) \
     __builtin_ia32_pcmpistric128((A), (B), (M))
#define _mm_cmpistro(A, B, M) \
     __builtin_ia32_pcmpistrio128((A), (B), (M))
#define _mm_cmpistrs(A, B, M) \
     __builtin_ia32_pcmpistris128((A), (B), (M))
#define _mm_cmpistrz(A, B, M) \
     __builtin_ia32_pcmpistriz128((A), (B), (M))

#define _mm_cmpestra(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
#define _mm_cmpestro(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))

/* SSE4.2 Compare Packed Data -- Greater Than. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{
  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}

/* SSE4.2 Accumulate CRC32. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */

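/* Illustrative usage sketch, not part of the original clang header: the CRC32
 * intrinsics fold one input unit at a time into a running CRC-32C value, so a
 * byte buffer is processed by chaining _mm_crc32_u8 across it (the helper
 * name below is ours). */
static __inline__ unsigned int
__smmintrin_example_crc32c(unsigned int __crc, const unsigned char *__p,
                           unsigned int __n)
{
  unsigned int __i;
  for (__i = 0; __i < __n; ++__i)
    __crc = _mm_crc32_u8(__crc, __p[__i]);
  return __crc;
}
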
#ifdef __POPCNT__
#include <popcntintrin.h>
#endif

#endif /* __SSE4_2__ */
#endif /* __SSE4_1__ */

#endif /* _SMMINTRIN_H */