/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

43 |
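/* Arithmetic.  The _ss (scalar single-precision) forms operate on the low
 * elements only; elements 1-3 of the result are copied from __a.  The _ps
 * (packed single-precision) forms operate on all four elements at once. */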
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 __a, __m128 __b)
{
  __a[0] += __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 __a, __m128 __b)
{
  return __a + __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 __a, __m128 __b)
{
  __a[0] -= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 __a, __m128 __b)
{
  return __a - __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 __a, __m128 __b)
{
  __a[0] *= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 __a, __m128 __b)
{
  return __a * __b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 __a, __m128 __b)
{
  __a[0] /= __b[0];
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 __a, __m128 __b)
{
  return __a / __b;
}

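/* _mm_sqrt_* compute correctly rounded square roots.  _mm_rcp_* and
 * _mm_rsqrt_* are fast hardware approximations of 1/x and 1/sqrt(x) with
 * roughly 12 bits of precision; the _ss forms splice the low result back
 * into elements 1-3 of __a. */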
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_sqrtss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 __a)
{
  return __builtin_ia32_sqrtps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_rcpss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 __a)
{
  return __builtin_ia32_rcpps(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 __a)
{
  __m128 __c = __builtin_ia32_rsqrtss(__a);
  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 __a)
{
  return __builtin_ia32_rsqrtps(__a);
}

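/* Element-wise min/max.  These follow the MINPS/MAXPS rules rather than
 * IEEE 754: min(a, b) behaves like (a < b) ? a : b per element, so when
 * either element is a NaN the element from __b is returned. */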
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_minss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_minps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_maxss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_maxps(__a, __b);
}

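/* Bitwise logic on the raw bit patterns.  The operands are reinterpreted as
 * integer vectors because C defines no bitwise operators on floats.  Note
 * that _mm_andnot_ps(a, b) computes ~a & b. */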
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a & (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  return (__m128)(~(__v4si)__a & (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a | (__v4si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 __a, __m128 __b)
{
  return (__m128)((__v4si)__a ^ (__v4si)__b);
}

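/* Comparisons produce a per-element mask: all 1 bits where the predicate
 * holds, all 0 bits where it does not.  The _ss forms compare only the low
 * elements and copy elements 1-3 of the result from __a. */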
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpeqss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpeqps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpltss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpltps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpless(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpleps(__a, __b);
}

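/* SSE has no native greater-than compare, so the forms below compute the
 * swapped less-than/less-equal compare.  The scalar variants then use
 * __builtin_shufflevector to merge the low element of that result (index 4,
 * i.e. element 0 of the second argument) with elements 1-3 of __a. */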
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpltss(__b, __a),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpltps(__b, __a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpless(__b, __a),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpleps(__b, __a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpneqss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpneqps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnltss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnltps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnless(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnleps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpnltss(__b, __a),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnltps(__b, __a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_shufflevector(__a,
                                         __builtin_ia32_cmpnless(__b, __a),
                                         4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnleps(__b, __a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpordss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpordps(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordss(__a, __b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordps(__a, __b);
}

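/* Scalar compares on the low elements that return 0 or 1 rather than a mask.
 * The comi* forms (COMISS) raise the invalid-operation exception on QNaN as
 * well as SNaN operands; the ucomi* forms (UCOMISS) stay quiet on QNaNs. */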
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comieq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comilt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comile(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comigt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comige(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_comineq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomieq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomilt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomile(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomigt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomige(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomineq(__a, __b);
}

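/* Conversions.  The cvt forms round according to the current MXCSR rounding
 * mode; the cvtt forms truncate toward zero, implemented here with C's
 * float-to-integer conversion, which truncates. */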
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 __a)
{
  return __builtin_ia32_cvtss2si(__a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 __a)
{
  return _mm_cvtss_si32(__a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 __a)
{
  return __builtin_ia32_cvtss2si64(__a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvtps2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 __a)
{
  return _mm_cvtps_pi32(__a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 __a)
{
  return __a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 __a)
{
  return _mm_cvttss_si32(__a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 __a)
{
  return __a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi(__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 __a, int __b)
{
  return _mm_cvtsi32_ss(__a, __b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  return _mm_cvtpi32_ps(__a, __b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 __a)
{
  return __a[0];
}

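/* Loads.  _mm_load_ps requires a 16-byte aligned address; the other forms go
 * through __attribute__((__packed__, __may_alias__)) wrapper structs so the
 * compiler emits safe unaligned accesses without violating strict aliasing. */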
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *__p)
{
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
  return (__m128){ __u, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *__p)
{
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
  return (__m128){ __u, __u, __u, __u };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *__p)
{
  return *(__m128*)__p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *__p)
{
  struct __loadu_ps {
    __m128 __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_ps*)__p)->__v;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *__p)
{
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
}

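/* Initialization.  Note the argument order: _mm_set_ps(z, y, x, w) places
 * __w in element 0, so _mm_set_ps(4, 3, 2, 1) and _mm_setr_ps(1, 2, 3, 4)
 * build the same vector. */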
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float __w)
{
  return (__m128){ __w, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float __w)
{
  return (__m128){ __w, __w, __w, __w };
}

/* Microsoft specific. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float __w)
{
  return _mm_set1_ps(__w);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float __z, float __y, float __x, float __w)
{
  return (__m128){ __w, __x, __y, __z };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
  return (__m128){ __z, __y, __x, __w };
}

static __inline__ __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

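/* Stores.  _mm_store_ps requires a 16-byte aligned address; _mm_storeu_ps
 * and _mm_store_ss accept unaligned ones. */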
static __inline__ void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  __builtin_ia32_storehps((__v2si *)__p, __a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *__p, __m128 __a)
{
  __builtin_ia32_storelps((__v2si *)__p, __a);
}

static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *__p, __m128 __a)
{
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *__p, __m128 __a)
{
  __builtin_ia32_storeups(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *__p, __m128 __a)
{
  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
  _mm_storeu_ps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps1(float *__p, __m128 __a)
{
  return _mm_store1_ps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *__p, __m128 __a)
{
  *(__m128 *)__p = __a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *__p, __m128 __a)
{
  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}

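/* Locality hints for _mm_prefetch: T0 prefetches into all cache levels,
 * T1 and T2 into progressively more distant levels, and NTA fetches
 * non-temporally to minimize cache pollution. */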
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *__p, __m64 __a)
{
  __builtin_ia32_movntq(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *__p, __m128 __a)
{
  __builtin_ia32_movntps(__p, __a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

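/* Integer (__m64) extensions that SSE added on top of MMX: word
 * extract/insert, packed min/max/average, byte movemask, unsigned high
 * multiply, shuffle, masked move, and sum of absolute differences. */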
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 __a, int __n)
{
  __v4hi __b = (__v4hi)__a;
  return (unsigned short)__b[__n & 3];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 __a, int __d, int __n)
{
  __v4hi __b = (__v4hi)__a;
  __b[__n & 3] = __d;
  return (__m64)__b;
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 __a)
{
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}

#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  __m64 __a = (a); \
  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}

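/* Access to the MXCSR control/status register, which holds the exception
 * flags and masks, the rounding mode and the flush-to-zero bit manipulated
 * by the _MM_* macros further down. */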
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int __i)
{
  __builtin_ia32_ldmxcsr(__i);
}

#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                  (((mask) & 0x30) >> 4) + 4, \
                                  (((mask) & 0xc0) >> 6) + 4); })

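/* In _mm_shuffle_ps(a, b, mask), the two low 2-bit fields of the mask pick
 * result elements 0-1 from __a and the two high fields pick elements 2-3
 * from __b; e.g. _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)) yields
 * { a[2], a[3], b[0], b[1] }. */
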
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
}

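/* Widening integer-to-float conversions, built from the primitives above:
 * the signed forms synthesize a sign mask by comparing against zero and
 * unpack against it to sign-extend, then convert two 32-bit halves at a
 * time with _mm_cvtpi32_ps. */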
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi16(__b, __a);
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi8(__b, __a);
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  __m128 __c;

  __c = _mm_setzero_ps();
  __c = _mm_cvtpi32_ps(__c, __b);
  __c = _mm_movelh_ps(__c, __c);

  return _mm_cvtpi32_ps(__c, __a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi32(__a);
  __a = _mm_movehl_ps(__a, __a);
  __c = _mm_cvtps_pi32(__a);

  return _mm_packs_pi32(__b, __c);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi16(__a);
  __c = _mm_setzero_si64();

  return _mm_packs_pi16(__b, __c);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 __a)
{
  return __builtin_ia32_movmskps(__a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

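/* _MM_SHUFFLE packs four 2-bit element selectors, with w selecting result
 * element 0; _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 is the identity shuffle for
 * _mm_shuffle_ps(a, a, mask). */
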
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

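/* Illustrative use (the local variable name here is just an example): run a
 * computation with truncation, then restore the caller's settings.
 *
 *   unsigned int __csr = _mm_getcsr();
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   ...
 *   _mm_setcsr(__csr);
 */
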
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)

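/* _MM_TRANSPOSE4_PS transposes a 4x4 matrix in place: after it runs, row0
 * holds what was column 0 of the matrix whose rows were row0..row3.  The
 * unpack steps interleave pairs of rows; movelh/movehl then recombine the
 * 64-bit halves into the transposed rows. */
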
/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */