Project

General

Profile

Statistics
| Revision:

root / lab4 / .minix-src / include / clang-3.6 / emmintrin.h @ 14

History | View | Annotate | Download (40.9 KB)

1
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2
 *
3
 * Permission is hereby granted, free of charge, to any person obtaining a copy
4
 * of this software and associated documentation files (the "Software"), to deal
5
 * in the Software without restriction, including without limitation the rights
6
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
 * copies of the Software, and to permit persons to whom the Software is
8
 * furnished to do so, subject to the following conditions:
9
 *
10
 * The above copyright notice and this permission notice shall be included in
11
 * all copies or substantial portions of the Software.
12
 *
13
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
 * THE SOFTWARE.
20
 *
21
 *===-----------------------------------------------------------------------===
22
 */
23

    
24
#ifndef __EMMINTRIN_H
25
#define __EMMINTRIN_H
26

    
27
#ifndef __SSE2__
28
#error "SSE2 instruction set not enabled"
29
#else
30

    
31
#include <xmmintrin.h>
32

    
33
/* Public 16-byte vector types: two doubles and two long longs. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte element views used for casts inside this header. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
41

    
42
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43
_mm_add_sd(__m128d __a, __m128d __b)
44
{
45
  __a[0] += __b[0];
46
  return __a;
47
}
48

    
49
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50
_mm_add_pd(__m128d __a, __m128d __b)
51
{
52
  return __a + __b;
53
}
54

    
55
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56
_mm_sub_sd(__m128d __a, __m128d __b)
57
{
58
  __a[0] -= __b[0];
59
  return __a;
60
}
61

    
62
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63
_mm_sub_pd(__m128d __a, __m128d __b)
64
{
65
  return __a - __b;
66
}
67

    
68
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69
_mm_mul_sd(__m128d __a, __m128d __b)
70
{
71
  __a[0] *= __b[0];
72
  return __a;
73
}
74

    
75
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76
_mm_mul_pd(__m128d __a, __m128d __b)
77
{
78
  return __a * __b;
79
}
80

    
81
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82
_mm_div_sd(__m128d __a, __m128d __b)
83
{
84
  __a[0] /= __b[0];
85
  return __a;
86
}
87

    
88
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89
_mm_div_pd(__m128d __a, __m128d __b)
90
{
91
  return __a / __b;
92
}
93

    
94
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95
_mm_sqrt_sd(__m128d __a, __m128d __b)
96
{
97
  __m128d __c = __builtin_ia32_sqrtsd(__b);
98
  return (__m128d) { __c[0], __a[1] };
99
}
100

    
101
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102
_mm_sqrt_pd(__m128d __a)
103
{
104
  return __builtin_ia32_sqrtpd(__a);
105
}
106

    
107
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108
_mm_min_sd(__m128d __a, __m128d __b)
109
{
110
  return __builtin_ia32_minsd(__a, __b);
111
}
112

    
113
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114
_mm_min_pd(__m128d __a, __m128d __b)
115
{
116
  return __builtin_ia32_minpd(__a, __b);
117
}
118

    
119
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120
_mm_max_sd(__m128d __a, __m128d __b)
121
{
122
  return __builtin_ia32_maxsd(__a, __b);
123
}
124

    
125
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126
_mm_max_pd(__m128d __a, __m128d __b)
127
{
128
  return __builtin_ia32_maxpd(__a, __b);
129
}
130

    
131
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132
_mm_and_pd(__m128d __a, __m128d __b)
133
{
134
  return (__m128d)((__v4si)__a & (__v4si)__b);
135
}
136

    
137
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138
_mm_andnot_pd(__m128d __a, __m128d __b)
139
{
140
  return (__m128d)(~(__v4si)__a & (__v4si)__b);
141
}
142

    
143
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144
_mm_or_pd(__m128d __a, __m128d __b)
145
{
146
  return (__m128d)((__v4si)__a | (__v4si)__b);
147
}
148

    
149
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150
_mm_xor_pd(__m128d __a, __m128d __b)
151
{
152
  return (__m128d)((__v4si)__a ^ (__v4si)__b);
153
}
154

    
155
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156
_mm_cmpeq_pd(__m128d __a, __m128d __b)
157
{
158
  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
159
}
160

    
161
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162
_mm_cmplt_pd(__m128d __a, __m128d __b)
163
{
164
  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
165
}
166

    
167
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168
_mm_cmple_pd(__m128d __a, __m128d __b)
169
{
170
  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
171
}
172

    
173
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174
_mm_cmpgt_pd(__m128d __a, __m128d __b)
175
{
176
  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
177
}
178

    
179
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180
_mm_cmpge_pd(__m128d __a, __m128d __b)
181
{
182
  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
183
}
184

    
185
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186
_mm_cmpord_pd(__m128d __a, __m128d __b)
187
{
188
  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
189
}
190

    
191
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192
_mm_cmpunord_pd(__m128d __a, __m128d __b)
193
{
194
  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
195
}
196

    
197
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198
_mm_cmpneq_pd(__m128d __a, __m128d __b)
199
{
200
  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
201
}
202

    
203
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
205
{
206
  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
207
}
208

    
209
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210
_mm_cmpnle_pd(__m128d __a, __m128d __b)
211
{
212
  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
213
}
214

    
215
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216
_mm_cmpngt_pd(__m128d __a, __m128d __b)
217
{
218
  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
219
}
220

    
221
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222
_mm_cmpnge_pd(__m128d __a, __m128d __b)
223
{
224
  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
225
}
226

    
227
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228
_mm_cmpeq_sd(__m128d __a, __m128d __b)
229
{
230
  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
231
}
232

    
233
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234
_mm_cmplt_sd(__m128d __a, __m128d __b)
235
{
236
  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
237
}
238

    
239
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240
_mm_cmple_sd(__m128d __a, __m128d __b)
241
{
242
  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
243
}
244

    
245
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246
_mm_cmpgt_sd(__m128d __a, __m128d __b)
247
{
248
  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
249
  return (__m128d) { __c[0], __a[1] };
250
}
251

    
252
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
253
_mm_cmpge_sd(__m128d __a, __m128d __b)
254
{
255
  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
256
  return (__m128d) { __c[0], __a[1] };
257
}
258

    
259
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
260
_mm_cmpord_sd(__m128d __a, __m128d __b)
261
{
262
  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
263
}
264

    
265
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
266
_mm_cmpunord_sd(__m128d __a, __m128d __b)
267
{
268
  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
269
}
270

    
271
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
272
_mm_cmpneq_sd(__m128d __a, __m128d __b)
273
{
274
  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
275
}
276

    
277
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
278
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
279
{
280
  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
281
}
282

    
283
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
284
_mm_cmpnle_sd(__m128d __a, __m128d __b)
285
{
286
  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
287
}
288

    
289
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
290
_mm_cmpngt_sd(__m128d __a, __m128d __b)
291
{
292
  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
293
  return (__m128d) { __c[0], __a[1] };
294
}
295

    
296
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
297
_mm_cmpnge_sd(__m128d __a, __m128d __b)
298
{
299
  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
300
  return (__m128d) { __c[0], __a[1] };
301
}
302

    
303
static __inline__ int __attribute__((__always_inline__, __nodebug__))
304
_mm_comieq_sd(__m128d __a, __m128d __b)
305
{
306
  return __builtin_ia32_comisdeq(__a, __b);
307
}
308

    
309
static __inline__ int __attribute__((__always_inline__, __nodebug__))
310
_mm_comilt_sd(__m128d __a, __m128d __b)
311
{
312
  return __builtin_ia32_comisdlt(__a, __b);
313
}
314

    
315
static __inline__ int __attribute__((__always_inline__, __nodebug__))
316
_mm_comile_sd(__m128d __a, __m128d __b)
317
{
318
  return __builtin_ia32_comisdle(__a, __b);
319
}
320

    
321
static __inline__ int __attribute__((__always_inline__, __nodebug__))
322
_mm_comigt_sd(__m128d __a, __m128d __b)
323
{
324
  return __builtin_ia32_comisdgt(__a, __b);
325
}
326

    
327
static __inline__ int __attribute__((__always_inline__, __nodebug__))
328
_mm_comige_sd(__m128d __a, __m128d __b)
329
{
330
  return __builtin_ia32_comisdge(__a, __b);
331
}
332

    
333
static __inline__ int __attribute__((__always_inline__, __nodebug__))
334
_mm_comineq_sd(__m128d __a, __m128d __b)
335
{
336
  return __builtin_ia32_comisdneq(__a, __b);
337
}
338

    
339
static __inline__ int __attribute__((__always_inline__, __nodebug__))
340
_mm_ucomieq_sd(__m128d __a, __m128d __b)
341
{
342
  return __builtin_ia32_ucomisdeq(__a, __b);
343
}
344

    
345
static __inline__ int __attribute__((__always_inline__, __nodebug__))
346
_mm_ucomilt_sd(__m128d __a, __m128d __b)
347
{
348
  return __builtin_ia32_ucomisdlt(__a, __b);
349
}
350

    
351
static __inline__ int __attribute__((__always_inline__, __nodebug__))
352
_mm_ucomile_sd(__m128d __a, __m128d __b)
353
{
354
  return __builtin_ia32_ucomisdle(__a, __b);
355
}
356

    
357
static __inline__ int __attribute__((__always_inline__, __nodebug__))
358
_mm_ucomigt_sd(__m128d __a, __m128d __b)
359
{
360
  return __builtin_ia32_ucomisdgt(__a, __b);
361
}
362

    
363
static __inline__ int __attribute__((__always_inline__, __nodebug__))
364
_mm_ucomige_sd(__m128d __a, __m128d __b)
365
{
366
  return __builtin_ia32_ucomisdge(__a, __b);
367
}
368

    
369
static __inline__ int __attribute__((__always_inline__, __nodebug__))
370
_mm_ucomineq_sd(__m128d __a, __m128d __b)
371
{
372
  return __builtin_ia32_ucomisdneq(__a, __b);
373
}
374

    
375
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
376
_mm_cvtpd_ps(__m128d __a)
377
{
378
  return __builtin_ia32_cvtpd2ps(__a);
379
}
380

    
381
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
382
_mm_cvtps_pd(__m128 __a)
383
{
384
  return __builtin_ia32_cvtps2pd(__a);
385
}
386

    
387
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
388
_mm_cvtepi32_pd(__m128i __a)
389
{
390
  return __builtin_ia32_cvtdq2pd((__v4si)__a);
391
}
392

    
393
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
394
_mm_cvtpd_epi32(__m128d __a)
395
{
396
  return __builtin_ia32_cvtpd2dq(__a);
397
}
398

    
399
static __inline__ int __attribute__((__always_inline__, __nodebug__))
400
_mm_cvtsd_si32(__m128d __a)
401
{
402
  return __builtin_ia32_cvtsd2si(__a);
403
}
404

    
405
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
406
_mm_cvtsd_ss(__m128 __a, __m128d __b)
407
{
408
  __a[0] = __b[0];
409
  return __a;
410
}
411

    
412
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
413
_mm_cvtsi32_sd(__m128d __a, int __b)
414
{
415
  __a[0] = __b;
416
  return __a;
417
}
418

    
419
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
420
_mm_cvtss_sd(__m128d __a, __m128 __b)
421
{
422
  __a[0] = __b[0];
423
  return __a;
424
}
425

    
426
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
427
_mm_cvttpd_epi32(__m128d __a)
428
{
429
  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430
}
431

    
432
static __inline__ int __attribute__((__always_inline__, __nodebug__))
433
_mm_cvttsd_si32(__m128d __a)
434
{
435
  return __a[0];
436
}
437

    
438
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
439
_mm_cvtpd_pi32(__m128d __a)
440
{
441
  return (__m64)__builtin_ia32_cvtpd2pi(__a);
442
}
443

    
444
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
445
_mm_cvttpd_pi32(__m128d __a)
446
{
447
  return (__m64)__builtin_ia32_cvttpd2pi(__a);
448
}
449

    
450
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
451
_mm_cvtpi32_pd(__m64 __a)
452
{
453
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
454
}
455

    
456
static __inline__ double __attribute__((__always_inline__, __nodebug__))
457
_mm_cvtsd_f64(__m128d __a)
458
{
459
  return __a[0];
460
}
461

    
462
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
463
_mm_load_pd(double const *__dp)
464
{
465
  return *(__m128d*)__dp;
466
}
467

    
468
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
469
_mm_load1_pd(double const *__dp)
470
{
471
  struct __mm_load1_pd_struct {
472
    double __u;
473
  } __attribute__((__packed__, __may_alias__));
474
  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475
  return (__m128d){ __u, __u };
476
}
477

    
478
/* Alternate spelling: expands to a call of _mm_load1_pd with the same
   argument. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
479

    
480
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
481
_mm_loadr_pd(double const *__dp)
482
{
483
  __m128d __u = *(__m128d*)__dp;
484
  return __builtin_shufflevector(__u, __u, 1, 0);
485
}
486

    
487
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488
_mm_loadu_pd(double const *__dp)
489
{
490
  struct __loadu_pd {
491
    __m128d __v;
492
  } __attribute__((packed, may_alias));
493
  return ((struct __loadu_pd*)__dp)->__v;
494
}
495

    
496
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497
_mm_load_sd(double const *__dp)
498
{
499
  struct __mm_load_sd_struct {
500
    double __u;
501
  } __attribute__((__packed__, __may_alias__));
502
  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503
  return (__m128d){ __u, 0 };
504
}
505

    
506
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
507
_mm_loadh_pd(__m128d __a, double const *__dp)
508
{
509
  struct __mm_loadh_pd_struct {
510
    double __u;
511
  } __attribute__((__packed__, __may_alias__));
512
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513
  return (__m128d){ __a[0], __u };
514
}
515

    
516
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
517
_mm_loadl_pd(__m128d __a, double const *__dp)
518
{
519
  struct __mm_loadl_pd_struct {
520
    double __u;
521
  } __attribute__((__packed__, __may_alias__));
522
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523
  return (__m128d){ __u, __a[1] };
524
}
525

    
526
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
527
_mm_set_sd(double __w)
528
{
529
  return (__m128d){ __w, 0 };
530
}
531

    
532
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
533
_mm_set1_pd(double __w)
534
{
535
  return (__m128d){ __w, __w };
536
}
537

    
538
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
539
_mm_set_pd(double __w, double __x)
540
{
541
  return (__m128d){ __x, __w };
542
}
543

    
544
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
545
_mm_setr_pd(double __w, double __x)
546
{
547
  return (__m128d){ __w, __x };
548
}
549

    
550
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
551
_mm_setzero_pd(void)
552
{
553
  return (__m128d){ 0, 0 };
554
}
555

    
556
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
557
_mm_move_sd(__m128d __a, __m128d __b)
558
{
559
  return (__m128d){ __b[0], __a[1] };
560
}
561

    
562
static __inline__ void __attribute__((__always_inline__, __nodebug__))
563
_mm_store_sd(double *__dp, __m128d __a)
564
{
565
  struct __mm_store_sd_struct {
566
    double __u;
567
  } __attribute__((__packed__, __may_alias__));
568
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569
}
570

    
571
static __inline__ void __attribute__((__always_inline__, __nodebug__))
572
_mm_store1_pd(double *__dp, __m128d __a)
573
{
574
  struct __mm_store1_pd_struct {
575
    double __u[2];
576
  } __attribute__((__packed__, __may_alias__));
577
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579
}
580

    
581
static __inline__ void __attribute__((__always_inline__, __nodebug__))
582
_mm_store_pd(double *__dp, __m128d __a)
583
{
584
  *(__m128d *)__dp = __a;
585
}
586

    
587
static __inline__ void __attribute__((__always_inline__, __nodebug__))
588
_mm_storeu_pd(double *__dp, __m128d __a)
589
{
590
  __builtin_ia32_storeupd(__dp, __a);
591
}
592

    
593
static __inline__ void __attribute__((__always_inline__, __nodebug__))
594
_mm_storer_pd(double *__dp, __m128d __a)
595
{
596
  __a = __builtin_shufflevector(__a, __a, 1, 0);
597
  *(__m128d *)__dp = __a;
598
}
599

    
600
static __inline__ void __attribute__((__always_inline__, __nodebug__))
601
_mm_storeh_pd(double *__dp, __m128d __a)
602
{
603
  struct __mm_storeh_pd_struct {
604
    double __u;
605
  } __attribute__((__packed__, __may_alias__));
606
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607
}
608

    
609
static __inline__ void __attribute__((__always_inline__, __nodebug__))
610
_mm_storel_pd(double *__dp, __m128d __a)
611
{
612
  struct __mm_storeh_pd_struct {
613
    double __u;
614
  } __attribute__((__packed__, __may_alias__));
615
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616
}
617

    
618
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619
_mm_add_epi8(__m128i __a, __m128i __b)
620
{
621
  return (__m128i)((__v16qi)__a + (__v16qi)__b);
622
}
623

    
624
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625
_mm_add_epi16(__m128i __a, __m128i __b)
626
{
627
  return (__m128i)((__v8hi)__a + (__v8hi)__b);
628
}
629

    
630
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631
_mm_add_epi32(__m128i __a, __m128i __b)
632
{
633
  return (__m128i)((__v4si)__a + (__v4si)__b);
634
}
635

    
636
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
637
_mm_add_si64(__m64 __a, __m64 __b)
638
{
639
  return __a + __b;
640
}
641

    
642
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643
_mm_add_epi64(__m128i __a, __m128i __b)
644
{
645
  return __a + __b;
646
}
647

    
648
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649
_mm_adds_epi8(__m128i __a, __m128i __b)
650
{
651
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652
}
653

    
654
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655
_mm_adds_epi16(__m128i __a, __m128i __b)
656
{
657
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658
}
659

    
660
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661
_mm_adds_epu8(__m128i __a, __m128i __b)
662
{
663
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664
}
665

    
666
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667
_mm_adds_epu16(__m128i __a, __m128i __b)
668
{
669
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670
}
671

    
672
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673
_mm_avg_epu8(__m128i __a, __m128i __b)
674
{
675
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676
}
677

    
678
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679
_mm_avg_epu16(__m128i __a, __m128i __b)
680
{
681
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682
}
683

    
684
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
685
_mm_madd_epi16(__m128i __a, __m128i __b)
686
{
687
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688
}
689

    
690
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691
_mm_max_epi16(__m128i __a, __m128i __b)
692
{
693
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694
}
695

    
696
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697
_mm_max_epu8(__m128i __a, __m128i __b)
698
{
699
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700
}
701

    
702
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703
_mm_min_epi16(__m128i __a, __m128i __b)
704
{
705
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706
}
707

    
708
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709
_mm_min_epu8(__m128i __a, __m128i __b)
710
{
711
  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712
}
713

    
714
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715
_mm_mulhi_epi16(__m128i __a, __m128i __b)
716
{
717
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718
}
719

    
720
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
721
_mm_mulhi_epu16(__m128i __a, __m128i __b)
722
{
723
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724
}
725

    
726
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727
_mm_mullo_epi16(__m128i __a, __m128i __b)
728
{
729
  return (__m128i)((__v8hi)__a * (__v8hi)__b);
730
}
731

    
732
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
733
_mm_mul_su32(__m64 __a, __m64 __b)
734
{
735
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736
}
737

    
738
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739
_mm_mul_epu32(__m128i __a, __m128i __b)
740
{
741
  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742
}
743

    
744
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745
_mm_sad_epu8(__m128i __a, __m128i __b)
746
{
747
  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748
}
749

    
750
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751
_mm_sub_epi8(__m128i __a, __m128i __b)
752
{
753
  return (__m128i)((__v16qi)__a - (__v16qi)__b);
754
}
755

    
756
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757
_mm_sub_epi16(__m128i __a, __m128i __b)
758
{
759
  return (__m128i)((__v8hi)__a - (__v8hi)__b);
760
}
761

    
762
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763
_mm_sub_epi32(__m128i __a, __m128i __b)
764
{
765
  return (__m128i)((__v4si)__a - (__v4si)__b);
766
}
767

    
768
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
769
_mm_sub_si64(__m64 __a, __m64 __b)
770
{
771
  return __a - __b;
772
}
773

    
774
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775
_mm_sub_epi64(__m128i __a, __m128i __b)
776
{
777
  return __a - __b;
778
}
779

    
780
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
781
_mm_subs_epi8(__m128i __a, __m128i __b)
782
{
783
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784
}
785

    
786
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787
_mm_subs_epi16(__m128i __a, __m128i __b)
788
{
789
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790
}
791

    
792
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793
_mm_subs_epu8(__m128i __a, __m128i __b)
794
{
795
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796
}
797

    
798
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799
_mm_subs_epu16(__m128i __a, __m128i __b)
800
{
801
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802
}
803

    
804
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805
_mm_and_si128(__m128i __a, __m128i __b)
806
{
807
  return __a & __b;
808
}
809

    
810
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811
_mm_andnot_si128(__m128i __a, __m128i __b)
812
{
813
  return ~__a & __b;
814
}
815

    
816
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817
_mm_or_si128(__m128i __a, __m128i __b)
818
{
819
  return __a | __b;
820
}
821

    
822
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823
_mm_xor_si128(__m128i __a, __m128i __b)
824
{
825
  return __a ^ __b;
826
}
827

    
828
/* Statement-expression macro. Copies `a` into a local __a (with -Wshadow
   silenced around that declaration only) and yields
   __builtin_ia32_pslldqi128(__a, (count)*8). The count is multiplied by 8
   before reaching the builtin -- presumably `count` is in bytes and the
   builtin expects bits; TODO confirm. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
833

    
834
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835
_mm_slli_epi16(__m128i __a, int __count)
836
{
837
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
838
}
839

    
840
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841
_mm_sll_epi16(__m128i __a, __m128i __count)
842
{
843
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
844
}
845

    
846
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847
_mm_slli_epi32(__m128i __a, int __count)
848
{
849
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
850
}
851

    
852
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853
_mm_sll_epi32(__m128i __a, __m128i __count)
854
{
855
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
856
}
857

    
858
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859
_mm_slli_epi64(__m128i __a, int __count)
860
{
861
  return __builtin_ia32_psllqi128(__a, __count);
862
}
863

    
864
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865
_mm_sll_epi64(__m128i __a, __m128i __count)
866
{
867
  return __builtin_ia32_psllq128(__a, __count);
868
}
869

    
870
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871
_mm_srai_epi16(__m128i __a, int __count)
872
{
873
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
874
}
875

    
876
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877
_mm_sra_epi16(__m128i __a, __m128i __count)
878
{
879
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
880
}
881

    
882
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883
_mm_srai_epi32(__m128i __a, int __count)
884
{
885
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
886
}
887

    
888
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
889
_mm_sra_epi32(__m128i __a, __m128i __count)
890
{
891
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
892
}
893

    
894

    
895
/* Statement-expression macro. Copies `a` into a local __a (with -Wshadow
   silenced around that declaration only) and yields
   __builtin_ia32_psrldqi128(__a, (count)*8). The count is multiplied by 8
   before reaching the builtin -- presumably `count` is in bytes and the
   builtin expects bits; TODO confirm. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
900

    
901
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
902
_mm_srli_epi16(__m128i __a, int __count)
903
{
904
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
905
}
906

    
907
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
908
_mm_srl_epi16(__m128i __a, __m128i __count)
909
{
910
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
911
}
912

    
913
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
914
_mm_srli_epi32(__m128i __a, int __count)
915
{
916
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
917
}
918

    
919
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
920
_mm_srl_epi32(__m128i __a, __m128i __count)
921
{
922
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
923
}
924

    
925
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
926
_mm_srli_epi64(__m128i __a, int __count)
927
{
928
  return __builtin_ia32_psrlqi128(__a, __count);
929
}
930

    
931
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
932
_mm_srl_epi64(__m128i __a, __m128i __count)
933
{
934
  return __builtin_ia32_psrlq128(__a, __count);
935
}
936

    
937
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
938
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
939
{
940
  return (__m128i)((__v16qi)__a == (__v16qi)__b);
941
}
942

    
943
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
944
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
945
{
946
  return (__m128i)((__v8hi)__a == (__v8hi)__b);
947
}
948

    
949
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
950
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
951
{
952
  return (__m128i)((__v4si)__a == (__v4si)__b);
953
}
954

    
955
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
956
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
957
{
958
  /* This function always performs a signed comparison, but __v16qi is a char
959
     which may be signed or unsigned. */
960
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
961
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
962
}
963

    
964
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
965
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
966
{
967
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
968
}
969

    
970
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
971
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
972
{
973
  return (__m128i)((__v4si)__a > (__v4si)__b);
974
}
975

    
976
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
977
_mm_cmplt_epi8(__m128i __a, __m128i __b)
978
{
979
  return _mm_cmpgt_epi8(__b, __a);
980
}
981

    
982
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
983
_mm_cmplt_epi16(__m128i __a, __m128i __b)
984
{
985
  return _mm_cmpgt_epi16(__b, __a);
986
}
987

    
988
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
989
_mm_cmplt_epi32(__m128i __a, __m128i __b)
990
{
991
  return _mm_cmpgt_epi32(__b, __a);
992
}
993

    
994
#ifdef __x86_64__
995
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
996
_mm_cvtsi64_sd(__m128d __a, long long __b)
997
{
998
  __a[0] = __b;
999
  return __a;
1000
}
1001

    
1002
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1003
_mm_cvtsd_si64(__m128d __a)
1004
{
1005
  return __builtin_ia32_cvtsd2si64(__a);
1006
}
1007

    
1008
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1009
_mm_cvttsd_si64(__m128d __a)
1010
{
1011
  return __a[0];
1012
}
1013
#endif
1014

    
1015
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1016
_mm_cvtepi32_ps(__m128i __a)
1017
{
1018
  return __builtin_ia32_cvtdq2ps((__v4si)__a);
1019
}
1020

    
1021
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1022
_mm_cvtps_epi32(__m128 __a)
1023
{
1024
  return (__m128i)__builtin_ia32_cvtps2dq(__a);
1025
}
1026

    
1027
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1028
_mm_cvttps_epi32(__m128 __a)
1029
{
1030
  return (__m128i)__builtin_ia32_cvttps2dq(__a);
1031
}
1032

    
1033
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1034
_mm_cvtsi32_si128(int __a)
1035
{
1036
  return (__m128i)(__v4si){ __a, 0, 0, 0 };
1037
}
1038

    
1039
#ifdef __x86_64__
1040
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1041
_mm_cvtsi64_si128(long long __a)
1042
{
1043
  return (__m128i){ __a, 0 };
1044
}
1045
#endif
1046

    
1047
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1048
_mm_cvtsi128_si32(__m128i __a)
1049
{
1050
  __v4si __b = (__v4si)__a;
1051
  return __b[0];
1052
}
1053

    
1054
#ifdef __x86_64__
1055
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1056
_mm_cvtsi128_si64(__m128i __a)
1057
{
1058
  return __a[0];
1059
}
1060
#endif
1061

    
1062
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1063
_mm_load_si128(__m128i const *__p)
1064
{
1065
  return *__p;
1066
}
1067

    
1068
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1069
_mm_loadu_si128(__m128i const *__p)
1070
{
1071
  struct __loadu_si128 {
1072
    __m128i __v;
1073
  } __attribute__((packed, may_alias));
1074
  return ((struct __loadu_si128*)__p)->__v;
1075
}
1076

    
1077
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078
_mm_loadl_epi64(__m128i const *__p)
1079
{
1080
  struct __mm_loadl_epi64_struct {
1081
    long long __u;
1082
  } __attribute__((__packed__, __may_alias__));
1083
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1084
}
1085

    
1086
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1087
_mm_set_epi64x(long long q1, long long q0)
1088
{
1089
  return (__m128i){ q0, q1 };
1090
}
1091

    
1092
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1093
_mm_set_epi64(__m64 q1, __m64 q0)
1094
{
1095
  return (__m128i){ (long long)q0, (long long)q1 };
1096
}
1097

    
1098
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1099
_mm_set_epi32(int i3, int i2, int i1, int i0)
1100
{
1101
  return (__m128i)(__v4si){ i0, i1, i2, i3};
1102
}
1103

    
1104
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1105
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1106
{
1107
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1108
}
1109

    
1110
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1111
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1112
{
1113
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1114
}
1115

    
1116
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1117
_mm_set1_epi64x(long long __q)
1118
{
1119
  return (__m128i){ __q, __q };
1120
}
1121

    
1122
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1123
_mm_set1_epi64(__m64 __q)
1124
{
1125
  return (__m128i){ (long long)__q, (long long)__q };
1126
}
1127

    
1128
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1129
_mm_set1_epi32(int __i)
1130
{
1131
  return (__m128i)(__v4si){ __i, __i, __i, __i };
1132
}
1133

    
1134
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1135
_mm_set1_epi16(short __w)
1136
{
1137
  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1138
}
1139

    
1140
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1141
_mm_set1_epi8(char __b)
1142
{
1143
  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1144
}
1145

    
1146
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1147
_mm_setr_epi64(__m64 q0, __m64 q1)
1148
{
1149
  return (__m128i){ (long long)q0, (long long)q1 };
1150
}
1151

    
1152
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1153
_mm_setr_epi32(int i0, int i1, int i2, int i3)
1154
{
1155
  return (__m128i)(__v4si){ i0, i1, i2, i3};
1156
}
1157

    
1158
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1159
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1160
{
1161
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1162
}
1163

    
1164
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1165
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1166
{
1167
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1168
}
1169

    
1170
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1171
_mm_setzero_si128(void)
1172
{
1173
  return (__m128i){ 0LL, 0LL };
1174
}
1175

    
1176
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1177
_mm_store_si128(__m128i *__p, __m128i __b)
1178
{
1179
  *__p = __b;
1180
}
1181

    
1182
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1183
_mm_storeu_si128(__m128i *__p, __m128i __b)
1184
{
1185
  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1186
}
1187

    
1188
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1189
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1190
{
1191
  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1192
}
1193

    
1194
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1195
_mm_storel_epi64(__m128i *__p, __m128i __a)
1196
{
1197
  struct __mm_storel_epi64_struct {
1198
    long long __u;
1199
  } __attribute__((__packed__, __may_alias__));
1200
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1201
}
1202

    
1203
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1204
_mm_stream_pd(double *__p, __m128d __a)
1205
{
1206
  __builtin_ia32_movntpd(__p, __a);
1207
}
1208

    
1209
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1210
_mm_stream_si128(__m128i *__p, __m128i __a)
1211
{
1212
  __builtin_ia32_movntdq(__p, __a);
1213
}
1214

    
1215
/* Store the int __a to __p through the MOVNTI builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}
1220

    
1221
#ifdef __x86_64__
1222
/* Store the 64-bit integer __a to __p through the 64-bit MOVNTI builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
1227
#endif
1228

    
1229
/* Pass the address __p to the CLFLUSH builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  __builtin_ia32_clflush(__p);
}
1234

    
1235
/* Issue the LFENCE builtin; takes no operands and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
1240

    
1241
/* Issue the MFENCE builtin; takes no operands and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
1246

    
1247
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1248
_mm_packs_epi16(__m128i __a, __m128i __b)
1249
{
1250
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1251
}
1252

    
1253
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1254
_mm_packs_epi32(__m128i __a, __m128i __b)
1255
{
1256
  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1257
}
1258

    
1259
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1260
_mm_packus_epi16(__m128i __a, __m128i __b)
1261
{
1262
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1263
}
1264

    
1265
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1266
_mm_extract_epi16(__m128i __a, int __imm)
1267
{
1268
  __v8hi __b = (__v8hi)__a;
1269
  return (unsigned short)__b[__imm & 7];
1270
}
1271

    
1272
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273
_mm_insert_epi16(__m128i __a, int __b, int __imm)
1274
{
1275
  __v8hi __c = (__v8hi)__a;
1276
  __c[__imm & 7] = __b;
1277
  return (__m128i)__c;
1278
}
1279

    
1280
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1281
_mm_movemask_epi8(__m128i __a)
1282
{
1283
  return __builtin_ia32_pmovmskb128((__v16qi)__a);
1284
}
1285

    
1286
/* Permute the four 32-bit lanes of a: result lane k is the lane of a named by
   bits [2k+1:2k] of imm.  a is evaluated once into a local (with -Wshadow
   silenced around that declaration); imm is expanded four times and feeds
   __builtin_shufflevector indices.  The second shuffle operand is a zero
   vector that the 0..3 indices never select. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, \
                                   ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, \
                                   ((imm) & 0xc0) >> 6); })
1293

    
1294
/* Permute the low four 16-bit lanes of a: result lane k (k = 0..3) is the
   lane of a named by bits [2k+1:2k] of imm; lanes 4..7 are copied through
   unchanged.  a is evaluated once into a local (with -Wshadow silenced around
   that declaration); imm is expanded four times. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, \
                                   ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, \
                                   ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })
1302

    
1303
/* Permute the high four 16-bit lanes of a: lanes 0..3 are copied through
   unchanged, and result lane 4+k (k = 0..3) is lane 4 + (bits [2k+1:2k] of
   imm) of a.  a is evaluated once into a local (with -Wshadow silenced around
   that declaration); imm is expanded four times. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
1313

    
1314
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1315
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
1316
{
1317
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1318
}
1319

    
1320
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1321
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
1322
{
1323
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1324
}
1325

    
1326
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1327
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
1328
{
1329
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1330
}
1331

    
1332
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1333
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
1334
{
1335
  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1336
}
1337

    
1338
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1339
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
1340
{
1341
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1342
}
1343

    
1344
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1345
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
1346
{
1347
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1348
}
1349

    
1350
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1351
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
1352
{
1353
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1354
}
1355

    
1356
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1357
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
1358
{
1359
  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1360
}
1361

    
1362
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1363
_mm_movepi64_pi64(__m128i __a)
1364
{
1365
  return (__m64)__a[0];
1366
}
1367

    
1368
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1369
_mm_movpi64_epi64(__m64 __a)
1370
{
1371
  return (__m128i){ (long long)__a, 0 };
1372
}
1373

    
1374
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1375
_mm_move_epi64(__m128i __a)
1376
{
1377
  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1378
}
1379

    
1380
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1381
_mm_unpackhi_pd(__m128d __a, __m128d __b)
1382
{
1383
  return __builtin_shufflevector(__a, __b, 1, 2+1);
1384
}
1385

    
1386
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1387
_mm_unpacklo_pd(__m128d __a, __m128d __b)
1388
{
1389
  return __builtin_shufflevector(__a, __b, 0, 2+0);
1390
}
1391

    
1392
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1393
_mm_movemask_pd(__m128d __a)
1394
{
1395
  return __builtin_ia32_movmskpd(__a);
1396
}
1397

    
1398
/* Build a two-double vector: element 0 is a[i & 1] and element 1 is
   b[(i >> 1) & 1] (the "+ 2" moves the index into the second shuffle
   operand).  a and b are each evaluated once into locals (with -Wshadow
   silenced around those declarations); i is expanded twice. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  _Pragma("clang diagnostic push") \
  _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128d __a = (a); \
  __m128d __b = (b); \
  _Pragma("clang diagnostic pop"); \
  __builtin_shufflevector(__a, __b, \
                          (i) & 1, \
                          (((i) & 2) >> 1) + 2); })
1404

    
1405
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1406
_mm_castpd_ps(__m128d __a)
1407
{
1408
  return (__m128)__a;
1409
}
1410

    
1411
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1412
_mm_castpd_si128(__m128d __a)
1413
{
1414
  return (__m128i)__a;
1415
}
1416

    
1417
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1418
_mm_castps_pd(__m128 __a)
1419
{
1420
  return (__m128d)__a;
1421
}
1422

    
1423
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1424
_mm_castps_si128(__m128 __a)
1425
{
1426
  return (__m128i)__a;
1427
}
1428

    
1429
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1430
_mm_castsi128_ps(__m128i __a)
1431
{
1432
  return (__m128)__a;
1433
}
1434

    
1435
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1436
_mm_castsi128_pd(__m128i __a)
1437
{
1438
  return (__m128d)__a;
1439
}
1440

    
1441
/* Emit a single "pause" instruction via volatile inline assembly. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1446

    
1447
/* Pack two selectors into one value: x lands in bit 1 and y in bit 0, which
   matches the bit layout _mm_shuffle_pd reads from its third argument. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1448

    
1449
#endif /* __SSE2__ */
1450

    
1451
#endif /* __EMMINTRIN_H */