Project

General

Profile

Statistics
| Revision:

root / lab4 / .minix-src / include / clang-3.6 / emmintrin.h @ 13

History | View | Annotate | Download (40.9 KB)

1 13 up20180614
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2
 *
3
 * Permission is hereby granted, free of charge, to any person obtaining a copy
4
 * of this software and associated documentation files (the "Software"), to deal
5
 * in the Software without restriction, including without limitation the rights
6
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
 * copies of the Software, and to permit persons to whom the Software is
8
 * furnished to do so, subject to the following conditions:
9
 *
10
 * The above copyright notice and this permission notice shall be included in
11
 * all copies or substantial portions of the Software.
12
 *
13
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
 * THE SOFTWARE.
20
 *
21
 *===-----------------------------------------------------------------------===
22
 */
23
24
#ifndef __EMMINTRIN_H
25
#define __EMMINTRIN_H
26
27
#ifndef __SSE2__
28
#error "SSE2 instruction set not enabled"
29
#else
30
31
#include <xmmintrin.h>
32
33
/* Public SSE2 vector types: 16-byte vectors of double and of long long. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal 16-byte vector views used for casts inside this header. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
41
42
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43
_mm_add_sd(__m128d __a, __m128d __b)
44
{
45
  __a[0] += __b[0];
46
  return __a;
47
}
48
49
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50
_mm_add_pd(__m128d __a, __m128d __b)
51
{
52
  return __a + __b;
53
}
54
55
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56
_mm_sub_sd(__m128d __a, __m128d __b)
57
{
58
  __a[0] -= __b[0];
59
  return __a;
60
}
61
62
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63
_mm_sub_pd(__m128d __a, __m128d __b)
64
{
65
  return __a - __b;
66
}
67
68
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69
_mm_mul_sd(__m128d __a, __m128d __b)
70
{
71
  __a[0] *= __b[0];
72
  return __a;
73
}
74
75
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76
_mm_mul_pd(__m128d __a, __m128d __b)
77
{
78
  return __a * __b;
79
}
80
81
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82
_mm_div_sd(__m128d __a, __m128d __b)
83
{
84
  __a[0] /= __b[0];
85
  return __a;
86
}
87
88
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89
_mm_div_pd(__m128d __a, __m128d __b)
90
{
91
  return __a / __b;
92
}
93
94
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95
_mm_sqrt_sd(__m128d __a, __m128d __b)
96
{
97
  __m128d __c = __builtin_ia32_sqrtsd(__b);
98
  return (__m128d) { __c[0], __a[1] };
99
}
100
101
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102
_mm_sqrt_pd(__m128d __a)
103
{
104
  return __builtin_ia32_sqrtpd(__a);
105
}
106
107
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108
_mm_min_sd(__m128d __a, __m128d __b)
109
{
110
  return __builtin_ia32_minsd(__a, __b);
111
}
112
113
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114
_mm_min_pd(__m128d __a, __m128d __b)
115
{
116
  return __builtin_ia32_minpd(__a, __b);
117
}
118
119
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120
_mm_max_sd(__m128d __a, __m128d __b)
121
{
122
  return __builtin_ia32_maxsd(__a, __b);
123
}
124
125
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126
_mm_max_pd(__m128d __a, __m128d __b)
127
{
128
  return __builtin_ia32_maxpd(__a, __b);
129
}
130
131
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132
_mm_and_pd(__m128d __a, __m128d __b)
133
{
134
  return (__m128d)((__v4si)__a & (__v4si)__b);
135
}
136
137
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138
_mm_andnot_pd(__m128d __a, __m128d __b)
139
{
140
  return (__m128d)(~(__v4si)__a & (__v4si)__b);
141
}
142
143
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144
_mm_or_pd(__m128d __a, __m128d __b)
145
{
146
  return (__m128d)((__v4si)__a | (__v4si)__b);
147
}
148
149
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150
_mm_xor_pd(__m128d __a, __m128d __b)
151
{
152
  return (__m128d)((__v4si)__a ^ (__v4si)__b);
153
}
154
155
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156
_mm_cmpeq_pd(__m128d __a, __m128d __b)
157
{
158
  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
159
}
160
161
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162
_mm_cmplt_pd(__m128d __a, __m128d __b)
163
{
164
  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
165
}
166
167
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168
_mm_cmple_pd(__m128d __a, __m128d __b)
169
{
170
  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
171
}
172
173
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174
_mm_cmpgt_pd(__m128d __a, __m128d __b)
175
{
176
  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
177
}
178
179
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180
_mm_cmpge_pd(__m128d __a, __m128d __b)
181
{
182
  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
183
}
184
185
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186
_mm_cmpord_pd(__m128d __a, __m128d __b)
187
{
188
  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
189
}
190
191
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192
_mm_cmpunord_pd(__m128d __a, __m128d __b)
193
{
194
  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
195
}
196
197
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198
_mm_cmpneq_pd(__m128d __a, __m128d __b)
199
{
200
  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
201
}
202
203
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
205
{
206
  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
207
}
208
209
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210
_mm_cmpnle_pd(__m128d __a, __m128d __b)
211
{
212
  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
213
}
214
215
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216
_mm_cmpngt_pd(__m128d __a, __m128d __b)
217
{
218
  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
219
}
220
221
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222
_mm_cmpnge_pd(__m128d __a, __m128d __b)
223
{
224
  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
225
}
226
227
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228
_mm_cmpeq_sd(__m128d __a, __m128d __b)
229
{
230
  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
231
}
232
233
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234
_mm_cmplt_sd(__m128d __a, __m128d __b)
235
{
236
  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
237
}
238
239
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240
_mm_cmple_sd(__m128d __a, __m128d __b)
241
{
242
  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
243
}
244
245
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246
_mm_cmpgt_sd(__m128d __a, __m128d __b)
247
{
248
  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
249
  return (__m128d) { __c[0], __a[1] };
250
}
251
252
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
253
_mm_cmpge_sd(__m128d __a, __m128d __b)
254
{
255
  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
256
  return (__m128d) { __c[0], __a[1] };
257
}
258
259
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
260
_mm_cmpord_sd(__m128d __a, __m128d __b)
261
{
262
  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
263
}
264
265
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
266
_mm_cmpunord_sd(__m128d __a, __m128d __b)
267
{
268
  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
269
}
270
271
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
272
_mm_cmpneq_sd(__m128d __a, __m128d __b)
273
{
274
  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
275
}
276
277
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
278
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
279
{
280
  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
281
}
282
283
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
284
_mm_cmpnle_sd(__m128d __a, __m128d __b)
285
{
286
  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
287
}
288
289
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
290
_mm_cmpngt_sd(__m128d __a, __m128d __b)
291
{
292
  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
293
  return (__m128d) { __c[0], __a[1] };
294
}
295
296
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
297
_mm_cmpnge_sd(__m128d __a, __m128d __b)
298
{
299
  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
300
  return (__m128d) { __c[0], __a[1] };
301
}
302
303
static __inline__ int __attribute__((__always_inline__, __nodebug__))
304
_mm_comieq_sd(__m128d __a, __m128d __b)
305
{
306
  return __builtin_ia32_comisdeq(__a, __b);
307
}
308
309
static __inline__ int __attribute__((__always_inline__, __nodebug__))
310
_mm_comilt_sd(__m128d __a, __m128d __b)
311
{
312
  return __builtin_ia32_comisdlt(__a, __b);
313
}
314
315
static __inline__ int __attribute__((__always_inline__, __nodebug__))
316
_mm_comile_sd(__m128d __a, __m128d __b)
317
{
318
  return __builtin_ia32_comisdle(__a, __b);
319
}
320
321
static __inline__ int __attribute__((__always_inline__, __nodebug__))
322
_mm_comigt_sd(__m128d __a, __m128d __b)
323
{
324
  return __builtin_ia32_comisdgt(__a, __b);
325
}
326
327
static __inline__ int __attribute__((__always_inline__, __nodebug__))
328
_mm_comige_sd(__m128d __a, __m128d __b)
329
{
330
  return __builtin_ia32_comisdge(__a, __b);
331
}
332
333
static __inline__ int __attribute__((__always_inline__, __nodebug__))
334
_mm_comineq_sd(__m128d __a, __m128d __b)
335
{
336
  return __builtin_ia32_comisdneq(__a, __b);
337
}
338
339
static __inline__ int __attribute__((__always_inline__, __nodebug__))
340
_mm_ucomieq_sd(__m128d __a, __m128d __b)
341
{
342
  return __builtin_ia32_ucomisdeq(__a, __b);
343
}
344
345
static __inline__ int __attribute__((__always_inline__, __nodebug__))
346
_mm_ucomilt_sd(__m128d __a, __m128d __b)
347
{
348
  return __builtin_ia32_ucomisdlt(__a, __b);
349
}
350
351
static __inline__ int __attribute__((__always_inline__, __nodebug__))
352
_mm_ucomile_sd(__m128d __a, __m128d __b)
353
{
354
  return __builtin_ia32_ucomisdle(__a, __b);
355
}
356
357
static __inline__ int __attribute__((__always_inline__, __nodebug__))
358
_mm_ucomigt_sd(__m128d __a, __m128d __b)
359
{
360
  return __builtin_ia32_ucomisdgt(__a, __b);
361
}
362
363
static __inline__ int __attribute__((__always_inline__, __nodebug__))
364
_mm_ucomige_sd(__m128d __a, __m128d __b)
365
{
366
  return __builtin_ia32_ucomisdge(__a, __b);
367
}
368
369
static __inline__ int __attribute__((__always_inline__, __nodebug__))
370
_mm_ucomineq_sd(__m128d __a, __m128d __b)
371
{
372
  return __builtin_ia32_ucomisdneq(__a, __b);
373
}
374
375
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
376
_mm_cvtpd_ps(__m128d __a)
377
{
378
  return __builtin_ia32_cvtpd2ps(__a);
379
}
380
381
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
382
_mm_cvtps_pd(__m128 __a)
383
{
384
  return __builtin_ia32_cvtps2pd(__a);
385
}
386
387
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
388
_mm_cvtepi32_pd(__m128i __a)
389
{
390
  return __builtin_ia32_cvtdq2pd((__v4si)__a);
391
}
392
393
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
394
_mm_cvtpd_epi32(__m128d __a)
395
{
396
  return __builtin_ia32_cvtpd2dq(__a);
397
}
398
399
static __inline__ int __attribute__((__always_inline__, __nodebug__))
400
_mm_cvtsd_si32(__m128d __a)
401
{
402
  return __builtin_ia32_cvtsd2si(__a);
403
}
404
405
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
406
_mm_cvtsd_ss(__m128 __a, __m128d __b)
407
{
408
  __a[0] = __b[0];
409
  return __a;
410
}
411
412
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
413
_mm_cvtsi32_sd(__m128d __a, int __b)
414
{
415
  __a[0] = __b;
416
  return __a;
417
}
418
419
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
420
_mm_cvtss_sd(__m128d __a, __m128 __b)
421
{
422
  __a[0] = __b[0];
423
  return __a;
424
}
425
426
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
427
_mm_cvttpd_epi32(__m128d __a)
428
{
429
  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430
}
431
432
static __inline__ int __attribute__((__always_inline__, __nodebug__))
433
_mm_cvttsd_si32(__m128d __a)
434
{
435
  return __a[0];
436
}
437
438
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
439
_mm_cvtpd_pi32(__m128d __a)
440
{
441
  return (__m64)__builtin_ia32_cvtpd2pi(__a);
442
}
443
444
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
445
_mm_cvttpd_pi32(__m128d __a)
446
{
447
  return (__m64)__builtin_ia32_cvttpd2pi(__a);
448
}
449
450
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
451
_mm_cvtpi32_pd(__m64 __a)
452
{
453
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
454
}
455
456
static __inline__ double __attribute__((__always_inline__, __nodebug__))
457
_mm_cvtsd_f64(__m128d __a)
458
{
459
  return __a[0];
460
}
461
462
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
463
_mm_load_pd(double const *__dp)
464
{
465
  return *(__m128d*)__dp;
466
}
467
468
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
469
_mm_load1_pd(double const *__dp)
470
{
471
  struct __mm_load1_pd_struct {
472
    double __u;
473
  } __attribute__((__packed__, __may_alias__));
474
  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475
  return (__m128d){ __u, __u };
476
}
477
478
/* Alternate spelling that expands to _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
479
480
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
481
_mm_loadr_pd(double const *__dp)
482
{
483
  __m128d __u = *(__m128d*)__dp;
484
  return __builtin_shufflevector(__u, __u, 1, 0);
485
}
486
487
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488
_mm_loadu_pd(double const *__dp)
489
{
490
  struct __loadu_pd {
491
    __m128d __v;
492
  } __attribute__((packed, may_alias));
493
  return ((struct __loadu_pd*)__dp)->__v;
494
}
495
496
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
497
_mm_load_sd(double const *__dp)
498
{
499
  struct __mm_load_sd_struct {
500
    double __u;
501
  } __attribute__((__packed__, __may_alias__));
502
  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503
  return (__m128d){ __u, 0 };
504
}
505
506
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
507
_mm_loadh_pd(__m128d __a, double const *__dp)
508
{
509
  struct __mm_loadh_pd_struct {
510
    double __u;
511
  } __attribute__((__packed__, __may_alias__));
512
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513
  return (__m128d){ __a[0], __u };
514
}
515
516
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
517
_mm_loadl_pd(__m128d __a, double const *__dp)
518
{
519
  struct __mm_loadl_pd_struct {
520
    double __u;
521
  } __attribute__((__packed__, __may_alias__));
522
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523
  return (__m128d){ __u, __a[1] };
524
}
525
526
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
527
_mm_set_sd(double __w)
528
{
529
  return (__m128d){ __w, 0 };
530
}
531
532
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
533
_mm_set1_pd(double __w)
534
{
535
  return (__m128d){ __w, __w };
536
}
537
538
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
539
_mm_set_pd(double __w, double __x)
540
{
541
  return (__m128d){ __x, __w };
542
}
543
544
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
545
_mm_setr_pd(double __w, double __x)
546
{
547
  return (__m128d){ __w, __x };
548
}
549
550
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
551
_mm_setzero_pd(void)
552
{
553
  return (__m128d){ 0, 0 };
554
}
555
556
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
557
_mm_move_sd(__m128d __a, __m128d __b)
558
{
559
  return (__m128d){ __b[0], __a[1] };
560
}
561
562
static __inline__ void __attribute__((__always_inline__, __nodebug__))
563
_mm_store_sd(double *__dp, __m128d __a)
564
{
565
  struct __mm_store_sd_struct {
566
    double __u;
567
  } __attribute__((__packed__, __may_alias__));
568
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569
}
570
571
static __inline__ void __attribute__((__always_inline__, __nodebug__))
572
_mm_store1_pd(double *__dp, __m128d __a)
573
{
574
  struct __mm_store1_pd_struct {
575
    double __u[2];
576
  } __attribute__((__packed__, __may_alias__));
577
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579
}
580
581
static __inline__ void __attribute__((__always_inline__, __nodebug__))
582
_mm_store_pd(double *__dp, __m128d __a)
583
{
584
  *(__m128d *)__dp = __a;
585
}
586
587
static __inline__ void __attribute__((__always_inline__, __nodebug__))
588
_mm_storeu_pd(double *__dp, __m128d __a)
589
{
590
  __builtin_ia32_storeupd(__dp, __a);
591
}
592
593
static __inline__ void __attribute__((__always_inline__, __nodebug__))
594
_mm_storer_pd(double *__dp, __m128d __a)
595
{
596
  __a = __builtin_shufflevector(__a, __a, 1, 0);
597
  *(__m128d *)__dp = __a;
598
}
599
600
static __inline__ void __attribute__((__always_inline__, __nodebug__))
601
_mm_storeh_pd(double *__dp, __m128d __a)
602
{
603
  struct __mm_storeh_pd_struct {
604
    double __u;
605
  } __attribute__((__packed__, __may_alias__));
606
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607
}
608
609
static __inline__ void __attribute__((__always_inline__, __nodebug__))
610
_mm_storel_pd(double *__dp, __m128d __a)
611
{
612
  struct __mm_storeh_pd_struct {
613
    double __u;
614
  } __attribute__((__packed__, __may_alias__));
615
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616
}
617
618
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
619
_mm_add_epi8(__m128i __a, __m128i __b)
620
{
621
  return (__m128i)((__v16qi)__a + (__v16qi)__b);
622
}
623
624
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
625
_mm_add_epi16(__m128i __a, __m128i __b)
626
{
627
  return (__m128i)((__v8hi)__a + (__v8hi)__b);
628
}
629
630
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
631
_mm_add_epi32(__m128i __a, __m128i __b)
632
{
633
  return (__m128i)((__v4si)__a + (__v4si)__b);
634
}
635
636
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
637
_mm_add_si64(__m64 __a, __m64 __b)
638
{
639
  return __a + __b;
640
}
641
642
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
643
_mm_add_epi64(__m128i __a, __m128i __b)
644
{
645
  return __a + __b;
646
}
647
648
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
649
_mm_adds_epi8(__m128i __a, __m128i __b)
650
{
651
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652
}
653
654
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
655
_mm_adds_epi16(__m128i __a, __m128i __b)
656
{
657
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658
}
659
660
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
661
_mm_adds_epu8(__m128i __a, __m128i __b)
662
{
663
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664
}
665
666
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
667
_mm_adds_epu16(__m128i __a, __m128i __b)
668
{
669
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670
}
671
672
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
673
_mm_avg_epu8(__m128i __a, __m128i __b)
674
{
675
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676
}
677
678
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
679
_mm_avg_epu16(__m128i __a, __m128i __b)
680
{
681
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682
}
683
684
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
685
_mm_madd_epi16(__m128i __a, __m128i __b)
686
{
687
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688
}
689
690
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
691
_mm_max_epi16(__m128i __a, __m128i __b)
692
{
693
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694
}
695
696
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
697
_mm_max_epu8(__m128i __a, __m128i __b)
698
{
699
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700
}
701
702
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
703
_mm_min_epi16(__m128i __a, __m128i __b)
704
{
705
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706
}
707
708
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
709
_mm_min_epu8(__m128i __a, __m128i __b)
710
{
711
  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712
}
713
714
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
715
_mm_mulhi_epi16(__m128i __a, __m128i __b)
716
{
717
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718
}
719
720
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
721
_mm_mulhi_epu16(__m128i __a, __m128i __b)
722
{
723
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724
}
725
726
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
727
_mm_mullo_epi16(__m128i __a, __m128i __b)
728
{
729
  return (__m128i)((__v8hi)__a * (__v8hi)__b);
730
}
731
732
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
733
_mm_mul_su32(__m64 __a, __m64 __b)
734
{
735
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736
}
737
738
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
739
_mm_mul_epu32(__m128i __a, __m128i __b)
740
{
741
  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742
}
743
744
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
745
_mm_sad_epu8(__m128i __a, __m128i __b)
746
{
747
  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748
}
749
750
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
751
_mm_sub_epi8(__m128i __a, __m128i __b)
752
{
753
  return (__m128i)((__v16qi)__a - (__v16qi)__b);
754
}
755
756
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
757
_mm_sub_epi16(__m128i __a, __m128i __b)
758
{
759
  return (__m128i)((__v8hi)__a - (__v8hi)__b);
760
}
761
762
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
763
_mm_sub_epi32(__m128i __a, __m128i __b)
764
{
765
  return (__m128i)((__v4si)__a - (__v4si)__b);
766
}
767
768
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
769
_mm_sub_si64(__m64 __a, __m64 __b)
770
{
771
  return __a - __b;
772
}
773
774
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
775
_mm_sub_epi64(__m128i __a, __m128i __b)
776
{
777
  return __a - __b;
778
}
779
780
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
781
_mm_subs_epi8(__m128i __a, __m128i __b)
782
{
783
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784
}
785
786
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787
_mm_subs_epi16(__m128i __a, __m128i __b)
788
{
789
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790
}
791
792
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793
_mm_subs_epu8(__m128i __a, __m128i __b)
794
{
795
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796
}
797
798
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799
_mm_subs_epu16(__m128i __a, __m128i __b)
800
{
801
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802
}
803
804
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805
_mm_and_si128(__m128i __a, __m128i __b)
806
{
807
  return __a & __b;
808
}
809
810
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811
_mm_andnot_si128(__m128i __a, __m128i __b)
812
{
813
  return ~__a & __b;
814
}
815
816
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817
_mm_or_si128(__m128i __a, __m128i __b)
818
{
819
  return __a | __b;
820
}
821
822
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823
_mm_xor_si128(__m128i __a, __m128i __b)
824
{
825
  return __a ^ __b;
826
}
827
828
/* Passes `a` (cast to __m128i) and `count` multiplied by 8 to
   __builtin_ia32_pslldqi128 and yields the result as an __m128i.
   NOTE(review): the *8 suggests `count` is a byte count and the builtin
   takes bits -- confirm against the compiler's builtin definition.
   `a` is used directly rather than copied into a local named __a: with the
   local, an argument expression that itself mentioned __a expanded to
   `__m128i __a = (__a);` and read the uninitialized temporary.  `a` and
   `count` are each still evaluated exactly once. */
#define _mm_slli_si128(a, count) \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8))
833
834
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835
_mm_slli_epi16(__m128i __a, int __count)
836
{
837
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
838
}
839
840
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841
_mm_sll_epi16(__m128i __a, __m128i __count)
842
{
843
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
844
}
845
846
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
847
_mm_slli_epi32(__m128i __a, int __count)
848
{
849
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
850
}
851
852
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
853
_mm_sll_epi32(__m128i __a, __m128i __count)
854
{
855
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
856
}
857
858
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
859
_mm_slli_epi64(__m128i __a, int __count)
860
{
861
  return __builtin_ia32_psllqi128(__a, __count);
862
}
863
864
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
865
_mm_sll_epi64(__m128i __a, __m128i __count)
866
{
867
  return __builtin_ia32_psllq128(__a, __count);
868
}
869
870
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
871
_mm_srai_epi16(__m128i __a, int __count)
872
{
873
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
874
}
875
876
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
877
_mm_sra_epi16(__m128i __a, __m128i __count)
878
{
879
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
880
}
881
882
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
883
_mm_srai_epi32(__m128i __a, int __count)
884
{
885
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
886
}
887
888
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
889
_mm_sra_epi32(__m128i __a, __m128i __count)
890
{
891
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
892
}
893
894
895
/* Passes `a` (cast to __m128i) and `count` multiplied by 8 to
   __builtin_ia32_psrldqi128 and yields the result as an __m128i.
   NOTE(review): the *8 suggests `count` is a byte count and the builtin
   takes bits -- confirm against the compiler's builtin definition.
   `a` is used directly rather than copied into a local named __a: with the
   local, an argument expression that itself mentioned __a expanded to
   `__m128i __a = (__a);` and read the uninitialized temporary.  `a` and
   `count` are each still evaluated exactly once. */
#define _mm_srli_si128(a, count) \
  ((__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8))
900
901
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
902
_mm_srli_epi16(__m128i __a, int __count)
903
{
904
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
905
}
906
907
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
908
_mm_srl_epi16(__m128i __a, __m128i __count)
909
{
910
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
911
}
912
913
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
914
_mm_srli_epi32(__m128i __a, int __count)
915
{
916
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
917
}
918
919
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
920
_mm_srl_epi32(__m128i __a, __m128i __count)
921
{
922
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
923
}
924
925
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
926
_mm_srli_epi64(__m128i __a, int __count)
927
{
928
  return __builtin_ia32_psrlqi128(__a, __count);
929
}
930
931
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
932
_mm_srl_epi64(__m128i __a, __m128i __count)
933
{
934
  return __builtin_ia32_psrlq128(__a, __count);
935
}
936
937
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
938
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
939
{
940
  return (__m128i)((__v16qi)__a == (__v16qi)__b);
941
}
942
943
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
944
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
945
{
946
  return (__m128i)((__v8hi)__a == (__v8hi)__b);
947
}
948
949
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
950
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
951
{
952
  return (__m128i)((__v4si)__a == (__v4si)__b);
953
}
954
955
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
956
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
957
{
958
  /* This function always performs a signed comparison, but __v16qi is a char
959
     which may be signed or unsigned. */
960
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
961
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
962
}
963
964
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
965
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
966
{
967
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
968
}
969
970
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
971
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
972
{
973
  return (__m128i)((__v4si)__a > (__v4si)__b);
974
}
975
976
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
977
_mm_cmplt_epi8(__m128i __a, __m128i __b)
978
{
979
  return _mm_cmpgt_epi8(__b, __a);
980
}
981
982
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
983
_mm_cmplt_epi16(__m128i __a, __m128i __b)
984
{
985
  return _mm_cmpgt_epi16(__b, __a);
986
}
987
988
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
989
_mm_cmplt_epi32(__m128i __a, __m128i __b)
990
{
991
  return _mm_cmpgt_epi32(__b, __a);
992
}
993
994
#ifdef __x86_64__
995
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
996
_mm_cvtsi64_sd(__m128d __a, long long __b)
997
{
998
  __a[0] = __b;
999
  return __a;
1000
}
1001
1002
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1003
_mm_cvtsd_si64(__m128d __a)
1004
{
1005
  return __builtin_ia32_cvtsd2si64(__a);
1006
}
1007
1008
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1009
_mm_cvttsd_si64(__m128d __a)
1010
{
1011
  return __a[0];
1012
}
1013
#endif
1014
1015
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1016
_mm_cvtepi32_ps(__m128i __a)
1017
{
1018
  return __builtin_ia32_cvtdq2ps((__v4si)__a);
1019
}
1020
1021
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1022
_mm_cvtps_epi32(__m128 __a)
1023
{
1024
  return (__m128i)__builtin_ia32_cvtps2dq(__a);
1025
}
1026
1027
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1028
_mm_cvttps_epi32(__m128 __a)
1029
{
1030
  return (__m128i)__builtin_ia32_cvttps2dq(__a);
1031
}
1032
1033
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1034
_mm_cvtsi32_si128(int __a)
1035
{
1036
  return (__m128i)(__v4si){ __a, 0, 0, 0 };
1037
}
1038
1039
#ifdef __x86_64__
1040
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1041
_mm_cvtsi64_si128(long long __a)
1042
{
1043
  return (__m128i){ __a, 0 };
1044
}
1045
#endif
1046
1047
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1048
_mm_cvtsi128_si32(__m128i __a)
1049
{
1050
  __v4si __b = (__v4si)__a;
1051
  return __b[0];
1052
}
1053
1054
#ifdef __x86_64__
1055
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1056
_mm_cvtsi128_si64(__m128i __a)
1057
{
1058
  return __a[0];
1059
}
1060
#endif
1061
1062
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1063
_mm_load_si128(__m128i const *__p)
1064
{
1065
  return *__p;
1066
}
1067
1068
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1069
_mm_loadu_si128(__m128i const *__p)
1070
{
1071
  struct __loadu_si128 {
1072
    __m128i __v;
1073
  } __attribute__((packed, may_alias));
1074
  return ((struct __loadu_si128*)__p)->__v;
1075
}
1076
1077
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078
_mm_loadl_epi64(__m128i const *__p)
1079
{
1080
  struct __mm_loadl_epi64_struct {
1081
    long long __u;
1082
  } __attribute__((__packed__, __may_alias__));
1083
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1084
}
1085
1086
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1087
_mm_set_epi64x(long long q1, long long q0)
1088
{
1089
  return (__m128i){ q0, q1 };
1090
}
1091
1092
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1093
_mm_set_epi64(__m64 q1, __m64 q0)
1094
{
1095
  return (__m128i){ (long long)q0, (long long)q1 };
1096
}
1097
1098
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1099
_mm_set_epi32(int i3, int i2, int i1, int i0)
1100
{
1101
  return (__m128i)(__v4si){ i0, i1, i2, i3};
1102
}
1103
1104
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1105
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1106
{
1107
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1108
}
1109
1110
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1111
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1112
{
1113
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1114
}
1115
1116
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1117
_mm_set1_epi64x(long long __q)
1118
{
1119
  return (__m128i){ __q, __q };
1120
}
1121
1122
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1123
_mm_set1_epi64(__m64 __q)
1124
{
1125
  return (__m128i){ (long long)__q, (long long)__q };
1126
}
1127
1128
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1129
_mm_set1_epi32(int __i)
1130
{
1131
  return (__m128i)(__v4si){ __i, __i, __i, __i };
1132
}
1133
1134
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1135
_mm_set1_epi16(short __w)
1136
{
1137
  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1138
}
1139
1140
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1141
_mm_set1_epi8(char __b)
1142
{
1143
  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1144
}
1145
1146
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1147
_mm_setr_epi64(__m64 q0, __m64 q1)
1148
{
1149
  return (__m128i){ (long long)q0, (long long)q1 };
1150
}
1151
1152
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1153
_mm_setr_epi32(int i0, int i1, int i2, int i3)
1154
{
1155
  return (__m128i)(__v4si){ i0, i1, i2, i3};
1156
}
1157
1158
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1159
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1160
{
1161
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1162
}
1163
1164
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1165
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1166
{
1167
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1168
}
1169
1170
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1171
_mm_setzero_si128(void)
1172
{
1173
  return (__m128i){ 0LL, 0LL };
1174
}
1175
1176
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1177
_mm_store_si128(__m128i *__p, __m128i __b)
1178
{
1179
  *__p = __b;
1180
}
1181
1182
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1183
_mm_storeu_si128(__m128i *__p, __m128i __b)
1184
{
1185
  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1186
}
1187
1188
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1189
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1190
{
1191
  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1192
}
1193
1194
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1195
_mm_storel_epi64(__m128i *__p, __m128i __a)
1196
{
1197
  struct __mm_storel_epi64_struct {
1198
    long long __u;
1199
  } __attribute__((__packed__, __may_alias__));
1200
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1201
}
1202
1203
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1204
_mm_stream_pd(double *__p, __m128d __a)
1205
{
1206
  __builtin_ia32_movntpd(__p, __a);
1207
}
1208
1209
static __inline__ void __attribute__((__always_inline__, __nodebug__))
1210
_mm_stream_si128(__m128i *__p, __m128i __a)
1211
{
1212
  __builtin_ia32_movntdq(__p, __a);
1213
}
1214
1215
/* Passes the destination pointer __p and the int value __a straight to the
   movnti builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1220
1221
#ifdef __x86_64__
1222
/* Passes the destination pointer __p and the long long value __a straight
   to the movnti64 builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
1227
#endif
1228
1229
/* Passes the address __p to the clflush builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__line = __p;
  __builtin_ia32_clflush(__line);
}
1234
1235
/* Invokes the lfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
1240
1241
/* Invokes the mfence builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
1246
1247
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1248
_mm_packs_epi16(__m128i __a, __m128i __b)
1249
{
1250
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1251
}
1252
1253
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1254
_mm_packs_epi32(__m128i __a, __m128i __b)
1255
{
1256
  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1257
}
1258
1259
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1260
_mm_packus_epi16(__m128i __a, __m128i __b)
1261
{
1262
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1263
}
1264
1265
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1266
_mm_extract_epi16(__m128i __a, int __imm)
1267
{
1268
  __v8hi __b = (__v8hi)__a;
1269
  return (unsigned short)__b[__imm & 7];
1270
}
1271
1272
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273
_mm_insert_epi16(__m128i __a, int __b, int __imm)
1274
{
1275
  __v8hi __c = (__v8hi)__a;
1276
  __c[__imm & 7] = __b;
1277
  return (__m128i)__c;
1278
}
1279
1280
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1281
_mm_movemask_epi8(__m128i __a)
1282
{
1283
  return __builtin_ia32_pmovmskb128((__v16qi)__a);
1284
}
1285
1286
/* Expands to a statement expression that copies `a` into a local __m128i
   and shuffles it as a __v4si vector.  Each 2-bit field of `imm`, lowest
   field first, gives the source index for the next result element.
   -Wshadow is suppressed around the declaration of the local copy. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3); })
1293
1294
/* Expands to a statement expression that copies `a` into a local __m128i
   and shuffles it as a __v8hi vector.  Result elements 0..3 are picked
   from source elements 0..3 by the four 2-bit fields of `imm` (lowest
   field first); result elements 4..7 are source elements 4..7 unchanged.
   -Wshadow is suppressed around the declaration of the local copy. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   ((imm) >> 0) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })
1302
1303
/* Expands to a statement expression that copies `a` into a local __m128i
   and shuffles it as a __v8hi vector.  Result elements 0..3 are source
   elements 0..3 unchanged; result elements 4..7 are picked from source
   elements 4..7 by the four 2-bit fields of `imm` (lowest field first).
   -Wshadow is suppressed around the declaration of the local copy. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128i __a = (a); \
  _Pragma("clang diagnostic pop"); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
1313
1314
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1315
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
1316
{
1317
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1318
}
1319
1320
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1321
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
1322
{
1323
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1324
}
1325
1326
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1327
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
1328
{
1329
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1330
}
1331
1332
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1333
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
1334
{
1335
  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1336
}
1337
1338
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1339
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
1340
{
1341
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1342
}
1343
1344
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1345
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
1346
{
1347
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1348
}
1349
1350
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1351
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
1352
{
1353
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1354
}
1355
1356
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1357
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
1358
{
1359
  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1360
}
1361
1362
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1363
_mm_movepi64_pi64(__m128i __a)
1364
{
1365
  return (__m64)__a[0];
1366
}
1367
1368
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1369
_mm_movpi64_epi64(__m64 __a)
1370
{
1371
  return (__m128i){ (long long)__a, 0 };
1372
}
1373
1374
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1375
_mm_move_epi64(__m128i __a)
1376
{
1377
  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1378
}
1379
1380
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1381
_mm_unpackhi_pd(__m128d __a, __m128d __b)
1382
{
1383
  return __builtin_shufflevector(__a, __b, 1, 2+1);
1384
}
1385
1386
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1387
_mm_unpacklo_pd(__m128d __a, __m128d __b)
1388
{
1389
  return __builtin_shufflevector(__a, __b, 0, 2+0);
1390
}
1391
1392
static __inline__ int __attribute__((__always_inline__, __nodebug__))
1393
_mm_movemask_pd(__m128d __a)
1394
{
1395
  return __builtin_ia32_movmskpd(__a);
1396
}
1397
1398
/* Expands to a statement expression that copies `a` and `b` into local
   __m128d values and builds a two-element result: bit 0 of `i` picks
   element 0 or 1 of `a` for result element 0, and bit 1 of `i` picks
   element 0 or 1 of `b` for result element 1.  -Wshadow is suppressed
   around the declarations of the local copies. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
  __m128d __a = (a); \
  __m128d __b = (b); \
  _Pragma("clang diagnostic pop"); \
  __builtin_shufflevector(__a, __b, \
                          ((i) >> 0) & 1, \
                          2 + (((i) >> 1) & 1)); })
1404
1405
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1406
_mm_castpd_ps(__m128d __a)
1407
{
1408
  return (__m128)__a;
1409
}
1410
1411
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1412
_mm_castpd_si128(__m128d __a)
1413
{
1414
  return (__m128i)__a;
1415
}
1416
1417
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1418
_mm_castps_pd(__m128 __a)
1419
{
1420
  return (__m128d)__a;
1421
}
1422
1423
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1424
_mm_castps_si128(__m128 __a)
1425
{
1426
  return (__m128i)__a;
1427
}
1428
1429
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1430
_mm_castsi128_ps(__m128i __a)
1431
{
1432
  return (__m128)__a;
1433
}
1434
1435
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1436
_mm_castsi128_pd(__m128i __a)
1437
{
1438
  return (__m128d)__a;
1439
}
1440
1441
/* Emits a single `pause` instruction through volatile inline assembly. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1446
1447
/* Combines two selector values into one immediate: x is shifted left by
   one bit and OR-ed with y. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1448
1449
#endif /* __SSE2__ */
1450
1451
#endif /* __EMMINTRIN_H */