Project

General

Profile

Statistics
| Revision:

root / lab4 / .minix-src / include / clang-3.6 / avxintrin.h @ 13

History | View | Annotate | Download (39.5 KB)

1
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2
 *
3
 * Permission is hereby granted, free of charge, to any person obtaining a copy
4
 * of this software and associated documentation files (the "Software"), to deal
5
 * in the Software without restriction, including without limitation the rights
6
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
 * copies of the Software, and to permit persons to whom the Software is
8
 * furnished to do so, subject to the following conditions:
9
 *
10
 * The above copyright notice and this permission notice shall be included in
11
 * all copies or substantial portions of the Software.
12
 *
13
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
 * THE SOFTWARE.
20
 *
21
 *===-----------------------------------------------------------------------===
22
 */
23

    
24
#ifndef __IMMINTRIN_H
25
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26
#endif
27

    
28
#ifndef __AVXINTRIN_H
29
#define __AVXINTRIN_H
30

    
31
typedef double __v4df __attribute__ ((__vector_size__ (32)));
32
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
33
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
34
typedef int __v8si __attribute__ ((__vector_size__ (32)));
35
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
36
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
37

    
38
typedef float __m256 __attribute__ ((__vector_size__ (32)));
39
typedef double __m256d __attribute__((__vector_size__(32)));
40
typedef long long __m256i __attribute__((__vector_size__(32)));
41

    
42
/* Arithmetic */
43
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
44
_mm256_add_pd(__m256d __a, __m256d __b)
45
{
46
  return __a+__b;
47
}
48

    
49
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
50
_mm256_add_ps(__m256 __a, __m256 __b)
51
{
52
  return __a+__b;
53
}
54

    
55
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
56
_mm256_sub_pd(__m256d __a, __m256d __b)
57
{
58
  return __a-__b;
59
}
60

    
61
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
62
_mm256_sub_ps(__m256 __a, __m256 __b)
63
{
64
  return __a-__b;
65
}
66

    
67
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
68
_mm256_addsub_pd(__m256d __a, __m256d __b)
69
{
70
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
71
}
72

    
73
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
74
_mm256_addsub_ps(__m256 __a, __m256 __b)
75
{
76
  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
77
}
78

    
79
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
80
_mm256_div_pd(__m256d __a, __m256d __b)
81
{
82
  return __a / __b;
83
}
84

    
85
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
86
_mm256_div_ps(__m256 __a, __m256 __b)
87
{
88
  return __a / __b;
89
}
90

    
91
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
92
_mm256_max_pd(__m256d __a, __m256d __b)
93
{
94
  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
95
}
96

    
97
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
98
_mm256_max_ps(__m256 __a, __m256 __b)
99
{
100
  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
101
}
102

    
103
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
104
_mm256_min_pd(__m256d __a, __m256d __b)
105
{
106
  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
107
}
108

    
109
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
110
_mm256_min_ps(__m256 __a, __m256 __b)
111
{
112
  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
113
}
114

    
115
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
116
_mm256_mul_pd(__m256d __a, __m256d __b)
117
{
118
  return __a * __b;
119
}
120

    
121
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
122
_mm256_mul_ps(__m256 __a, __m256 __b)
123
{
124
  return __a * __b;
125
}
126

    
127
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
128
_mm256_sqrt_pd(__m256d __a)
129
{
130
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
131
}
132

    
133
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
134
_mm256_sqrt_ps(__m256 __a)
135
{
136
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
137
}
138

    
139
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
140
_mm256_rsqrt_ps(__m256 __a)
141
{
142
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
143
}
144

    
145
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
146
_mm256_rcp_ps(__m256 __a)
147
{
148
  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
149
}
150

    
151
#define _mm256_round_pd(V, M) __extension__ ({ \
152
    __m256d __V = (V); \
153
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
154

    
155
#define _mm256_round_ps(V, M) __extension__ ({ \
156
  __m256 __V = (V); \
157
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
158

    
159
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
160
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
161
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
162
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
163

    
164
/* Logical */
165
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
166
_mm256_and_pd(__m256d __a, __m256d __b)
167
{
168
  return (__m256d)((__v4di)__a & (__v4di)__b);
169
}
170

    
171
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
172
_mm256_and_ps(__m256 __a, __m256 __b)
173
{
174
  return (__m256)((__v8si)__a & (__v8si)__b);
175
}
176

    
177
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
178
_mm256_andnot_pd(__m256d __a, __m256d __b)
179
{
180
  return (__m256d)(~(__v4di)__a & (__v4di)__b);
181
}
182

    
183
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
184
_mm256_andnot_ps(__m256 __a, __m256 __b)
185
{
186
  return (__m256)(~(__v8si)__a & (__v8si)__b);
187
}
188

    
189
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
190
_mm256_or_pd(__m256d __a, __m256d __b)
191
{
192
  return (__m256d)((__v4di)__a | (__v4di)__b);
193
}
194

    
195
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
196
_mm256_or_ps(__m256 __a, __m256 __b)
197
{
198
  return (__m256)((__v8si)__a | (__v8si)__b);
199
}
200

    
201
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
202
_mm256_xor_pd(__m256d __a, __m256d __b)
203
{
204
  return (__m256d)((__v4di)__a ^ (__v4di)__b);
205
}
206

    
207
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
208
_mm256_xor_ps(__m256 __a, __m256 __b)
209
{
210
  return (__m256)((__v8si)__a ^ (__v8si)__b);
211
}
212

    
213
/* Horizontal arithmetic */
214
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
215
_mm256_hadd_pd(__m256d __a, __m256d __b)
216
{
217
  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
218
}
219

    
220
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
221
_mm256_hadd_ps(__m256 __a, __m256 __b)
222
{
223
  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
224
}
225

    
226
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
227
_mm256_hsub_pd(__m256d __a, __m256d __b)
228
{
229
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
230
}
231

    
232
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
233
_mm256_hsub_ps(__m256 __a, __m256 __b)
234
{
235
  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
236
}
237

    
238
/* Vector permutations */
239
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
240
_mm_permutevar_pd(__m128d __a, __m128i __c)
241
{
242
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
243
}
244

    
245
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
246
_mm256_permutevar_pd(__m256d __a, __m256i __c)
247
{
248
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
249
}
250

    
251
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
252
_mm_permutevar_ps(__m128 __a, __m128i __c)
253
{
254
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
255
}
256

    
257
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
258
_mm256_permutevar_ps(__m256 __a, __m256i __c)
259
{
260
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a,
261
                                                  (__v8si)__c);
262
}
263

    
264
#define _mm_permute_pd(A, C) __extension__ ({ \
265
  __m128d __A = (A); \
266
  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
267
                                   (C) & 0x1, ((C) & 0x2) >> 1); })
268

    
269
#define _mm256_permute_pd(A, C) __extension__ ({ \
270
  __m256d __A = (A); \
271
  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
272
                                   (C) & 0x1, ((C) & 0x2) >> 1, \
273
                                   2 + (((C) & 0x4) >> 2), \
274
                                   2 + (((C) & 0x8) >> 3)); })
275

    
276
#define _mm_permute_ps(A, C) __extension__ ({ \
277
  __m128 __A = (A); \
278
  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
279
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
280
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
281

    
282
#define _mm256_permute_ps(A, C) __extension__ ({ \
283
  __m256 __A = (A); \
284
  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
285
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
286
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
287
                                  4 + (((C) & 0x03) >> 0), \
288
                                  4 + (((C) & 0x0c) >> 2), \
289
                                  4 + (((C) & 0x30) >> 4), \
290
                                  4 + (((C) & 0xc0) >> 6)); })
291

    
292
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
293
  __m256d __V1 = (V1); \
294
  __m256d __V2 = (V2); \
295
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
296

    
297
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
298
  __m256 __V1 = (V1); \
299
  __m256 __V2 = (V2); \
300
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
301

    
302
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
303
  __m256i __V1 = (V1); \
304
  __m256i __V2 = (V2); \
305
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
306

    
307
/* Vector Blend */
308
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
309
  __m256d __V1 = (V1); \
310
  __m256d __V2 = (V2); \
311
  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
312
                                   (((M) & 0x01) ? 4 : 0), \
313
                                   (((M) & 0x02) ? 5 : 1), \
314
                                   (((M) & 0x04) ? 6 : 2), \
315
                                   (((M) & 0x08) ? 7 : 3)); })
316

    
317
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
318
  __m256 __V1 = (V1); \
319
  __m256 __V2 = (V2); \
320
  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
321
                                  (((M) & 0x01) ?  8 : 0), \
322
                                  (((M) & 0x02) ?  9 : 1), \
323
                                  (((M) & 0x04) ? 10 : 2), \
324
                                  (((M) & 0x08) ? 11 : 3), \
325
                                  (((M) & 0x10) ? 12 : 4), \
326
                                  (((M) & 0x20) ? 13 : 5), \
327
                                  (((M) & 0x40) ? 14 : 6), \
328
                                  (((M) & 0x80) ? 15 : 7)); })
329

    
330
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
331
_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
332
{
333
  return (__m256d)__builtin_ia32_blendvpd256(
334
    (__v4df)__a, (__v4df)__b, (__v4df)__c);
335
}
336

    
337
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
338
_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
339
{
340
  return (__m256)__builtin_ia32_blendvps256(
341
    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
342
}
343

    
344
/* Vector Dot Product */
345
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
346
  __m256 __V1 = (V1); \
347
  __m256 __V2 = (V2); \
348
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
349

    
350
/* Vector shuffle */
351
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
352
        __m256 __a = (a); \
353
        __m256 __b = (b); \
354
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
355
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
356
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
357
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
358
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
359

    
360
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
361
        __m256d __a = (a); \
362
        __m256d __b = (b); \
363
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
364
        (mask) & 0x1, \
365
        (((mask) & 0x2) >> 1) + 4, \
366
        (((mask) & 0x4) >> 2) + 2, \
367
        (((mask) & 0x8) >> 3) + 6); })
368

    
369
/* Compare */
370
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
371
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
372
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
373
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
374
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
375
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
376
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
377
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
378
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
379
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
380
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
381
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
382
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
383
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
384
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
385
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
386
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
387
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
388
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
389
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
390
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
391
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
392
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
393
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
394
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
395
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
396
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
397
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
398
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
399
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
400
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
401
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
402

    
403
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
404
  __m128d __a = (a); \
405
  __m128d __b = (b); \
406
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
407

    
408
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
409
  __m128 __a = (a); \
410
  __m128 __b = (b); \
411
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
412

    
413
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
414
  __m256d __a = (a); \
415
  __m256d __b = (b); \
416
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
417

    
418
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
419
  __m256 __a = (a); \
420
  __m256 __b = (b); \
421
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
422

    
423
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
424
  __m128d __a = (a); \
425
  __m128d __b = (b); \
426
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
427

    
428
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
429
  __m128 __a = (a); \
430
  __m128 __b = (b); \
431
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
432

    
433
/* Vector extract */
434
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
435
  __m256d __A = (A); \
436
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
437

    
438
#define _mm256_extractf128_ps(A, O) __extension__ ({ \
439
  __m256 __A = (A); \
440
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
441

    
442
#define _mm256_extractf128_si256(A, O) __extension__ ({ \
443
  __m256i __A = (A); \
444
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
445

    
446
static __inline int __attribute__((__always_inline__, __nodebug__))
447
_mm256_extract_epi32(__m256i __a, int const __imm)
448
{
449
  __v8si __b = (__v8si)__a;
450
  return __b[__imm & 7];
451
}
452

    
453
static __inline int __attribute__((__always_inline__, __nodebug__))
454
_mm256_extract_epi16(__m256i __a, int const __imm)
455
{
456
  __v16hi __b = (__v16hi)__a;
457
  return __b[__imm & 15];
458
}
459

    
460
static __inline int __attribute__((__always_inline__, __nodebug__))
461
_mm256_extract_epi8(__m256i __a, int const __imm)
462
{
463
  __v32qi __b = (__v32qi)__a;
464
  return __b[__imm & 31];
465
}
466

    
467
#ifdef __x86_64__
468
static __inline long long  __attribute__((__always_inline__, __nodebug__))
469
_mm256_extract_epi64(__m256i __a, const int __imm)
470
{
471
  __v4di __b = (__v4di)__a;
472
  return __b[__imm & 3];
473
}
474
#endif
475

    
476
/* Vector insert */
477
#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
478
  __m256d __V1 = (V1); \
479
  __m128d __V2 = (V2); \
480
  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
481

    
482
#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
483
  __m256 __V1 = (V1); \
484
  __m128 __V2 = (V2); \
485
  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
486

    
487
#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
488
  __m256i __V1 = (V1); \
489
  __m128i __V2 = (V2); \
490
  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
491

    
492
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
493
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
494
{
495
  __v8si __c = (__v8si)__a;
496
  __c[__imm & 7] = __b;
497
  return (__m256i)__c;
498
}
499

    
500
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
501
_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
502
{
503
  __v16hi __c = (__v16hi)__a;
504
  __c[__imm & 15] = __b;
505
  return (__m256i)__c;
506
}
507

    
508
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
509
_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
510
{
511
  __v32qi __c = (__v32qi)__a;
512
  __c[__imm & 31] = __b;
513
  return (__m256i)__c;
514
}
515

    
516
#ifdef __x86_64__
517
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
518
_mm256_insert_epi64(__m256i __a, int __b, int const __imm)
519
{
520
  __v4di __c = (__v4di)__a;
521
  __c[__imm & 3] = __b;
522
  return (__m256i)__c;
523
}
524
#endif
525

    
526
/* Conversion */
527
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
528
_mm256_cvtepi32_pd(__m128i __a)
529
{
530
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
531
}
532

    
533
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
534
_mm256_cvtepi32_ps(__m256i __a)
535
{
536
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
537
}
538

    
539
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
540
_mm256_cvtpd_ps(__m256d __a)
541
{
542
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
543
}
544

    
545
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
546
_mm256_cvtps_epi32(__m256 __a)
547
{
548
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
549
}
550

    
551
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
552
_mm256_cvtps_pd(__m128 __a)
553
{
554
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
555
}
556

    
557
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
558
_mm256_cvttpd_epi32(__m256d __a)
559
{
560
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
561
}
562

    
563
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
564
_mm256_cvtpd_epi32(__m256d __a)
565
{
566
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
567
}
568

    
569
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
570
_mm256_cvttps_epi32(__m256 __a)
571
{
572
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
573
}
574

    
575
/* Vector replicate */
576
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
577
_mm256_movehdup_ps(__m256 __a)
578
{
579
  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
580
}
581

    
582
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
583
_mm256_moveldup_ps(__m256 __a)
584
{
585
  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
586
}
587

    
588
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
589
_mm256_movedup_pd(__m256d __a)
590
{
591
  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
592
}
593

    
594
/* Unpack and Interleave */
595
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
596
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
597
{
598
  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
599
}
600

    
601
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
602
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
603
{
604
  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
605
}
606

    
607
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
608
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
609
{
610
  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
611
}
612

    
613
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
614
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
615
{
616
  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
617
}
618

    
619
/* Bit Test */
620
static __inline int __attribute__((__always_inline__, __nodebug__))
621
_mm_testz_pd(__m128d __a, __m128d __b)
622
{
623
  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
624
}
625

    
626
static __inline int __attribute__((__always_inline__, __nodebug__))
627
_mm_testc_pd(__m128d __a, __m128d __b)
628
{
629
  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
630
}
631

    
632
static __inline int __attribute__((__always_inline__, __nodebug__))
633
_mm_testnzc_pd(__m128d __a, __m128d __b)
634
{
635
  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
636
}
637

    
638
static __inline int __attribute__((__always_inline__, __nodebug__))
639
_mm_testz_ps(__m128 __a, __m128 __b)
640
{
641
  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
642
}
643

    
644
static __inline int __attribute__((__always_inline__, __nodebug__))
645
_mm_testc_ps(__m128 __a, __m128 __b)
646
{
647
  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
648
}
649

    
650
static __inline int __attribute__((__always_inline__, __nodebug__))
651
_mm_testnzc_ps(__m128 __a, __m128 __b)
652
{
653
  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
654
}
655

    
656
static __inline int __attribute__((__always_inline__, __nodebug__))
657
_mm256_testz_pd(__m256d __a, __m256d __b)
658
{
659
  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
660
}
661

    
662
static __inline int __attribute__((__always_inline__, __nodebug__))
663
_mm256_testc_pd(__m256d __a, __m256d __b)
664
{
665
  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
666
}
667

    
668
static __inline int __attribute__((__always_inline__, __nodebug__))
669
_mm256_testnzc_pd(__m256d __a, __m256d __b)
670
{
671
  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
672
}
673

    
674
static __inline int __attribute__((__always_inline__, __nodebug__))
675
_mm256_testz_ps(__m256 __a, __m256 __b)
676
{
677
  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
678
}
679

    
680
static __inline int __attribute__((__always_inline__, __nodebug__))
681
_mm256_testc_ps(__m256 __a, __m256 __b)
682
{
683
  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
684
}
685

    
686
static __inline int __attribute__((__always_inline__, __nodebug__))
687
_mm256_testnzc_ps(__m256 __a, __m256 __b)
688
{
689
  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
690
}
691

    
692
static __inline int __attribute__((__always_inline__, __nodebug__))
693
_mm256_testz_si256(__m256i __a, __m256i __b)
694
{
695
  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
696
}
697

    
698
static __inline int __attribute__((__always_inline__, __nodebug__))
699
_mm256_testc_si256(__m256i __a, __m256i __b)
700
{
701
  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
702
}
703

    
704
static __inline int __attribute__((__always_inline__, __nodebug__))
705
_mm256_testnzc_si256(__m256i __a, __m256i __b)
706
{
707
  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
708
}
709

    
710
/* Vector extract sign mask */
711
static __inline int __attribute__((__always_inline__, __nodebug__))
712
_mm256_movemask_pd(__m256d __a)
713
{
714
  return __builtin_ia32_movmskpd256((__v4df)__a);
715
}
716

    
717
static __inline int __attribute__((__always_inline__, __nodebug__))
718
_mm256_movemask_ps(__m256 __a)
719
{
720
  return __builtin_ia32_movmskps256((__v8sf)__a);
721
}
722

    
723
/* Vector __zero */
724
static __inline void __attribute__((__always_inline__, __nodebug__))
725
_mm256_zeroall(void)
726
{
727
  __builtin_ia32_vzeroall();
728
}
729

    
730
static __inline void __attribute__((__always_inline__, __nodebug__))
731
_mm256_zeroupper(void)
732
{
733
  __builtin_ia32_vzeroupper();
734
}
735

    
736
/* Vector load with broadcast */
737
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
738
_mm_broadcast_ss(float const *__a)
739
{
740
  float __f = *__a;
741
  return (__m128)(__v4sf){ __f, __f, __f, __f };
742
}
743

    
744
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
745
_mm256_broadcast_sd(double const *__a)
746
{
747
  double __d = *__a;
748
  return (__m256d)(__v4df){ __d, __d, __d, __d };
749
}
750

    
751
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
752
_mm256_broadcast_ss(float const *__a)
753
{
754
  float __f = *__a;
755
  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
756
}
757

    
758
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
759
_mm256_broadcast_pd(__m128d const *__a)
760
{
761
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
762
}
763

    
764
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
765
_mm256_broadcast_ps(__m128 const *__a)
766
{
767
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
768
}
769

    
770
/* SIMD load ops */
771
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
772
_mm256_load_pd(double const *__p)
773
{
774
  return *(__m256d *)__p;
775
}
776

    
777
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
778
_mm256_load_ps(float const *__p)
779
{
780
  return *(__m256 *)__p;
781
}
782

    
783
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
784
_mm256_loadu_pd(double const *__p)
785
{
786
  struct __loadu_pd {
787
    __m256d __v;
788
  } __attribute__((packed, may_alias));
789
  return ((struct __loadu_pd*)__p)->__v;
790
}
791

    
792
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
793
_mm256_loadu_ps(float const *__p)
794
{
795
  struct __loadu_ps {
796
    __m256 __v;
797
  } __attribute__((packed, may_alias));
798
  return ((struct __loadu_ps*)__p)->__v;
799
}
800

    
801
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
802
_mm256_load_si256(__m256i const *__p)
803
{
804
  return *__p;
805
}
806

    
807
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
808
_mm256_loadu_si256(__m256i const *__p)
809
{
810
  struct __loadu_si256 {
811
    __m256i __v;
812
  } __attribute__((packed, may_alias));
813
  return ((struct __loadu_si256*)__p)->__v;
814
}
815

    
816
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
817
_mm256_lddqu_si256(__m256i const *__p)
818
{
819
  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
820
}
821

    
822
/* SIMD store ops */
823
static __inline void __attribute__((__always_inline__, __nodebug__))
824
_mm256_store_pd(double *__p, __m256d __a)
825
{
826
  *(__m256d *)__p = __a;
827
}
828

    
829
static __inline void __attribute__((__always_inline__, __nodebug__))
830
_mm256_store_ps(float *__p, __m256 __a)
831
{
832
  *(__m256 *)__p = __a;
833
}
834

    
835
static __inline void __attribute__((__always_inline__, __nodebug__))
836
_mm256_storeu_pd(double *__p, __m256d __a)
837
{
838
  __builtin_ia32_storeupd256(__p, (__v4df)__a);
839
}
840

    
841
static __inline void __attribute__((__always_inline__, __nodebug__))
842
_mm256_storeu_ps(float *__p, __m256 __a)
843
{
844
  __builtin_ia32_storeups256(__p, (__v8sf)__a);
845
}
846

    
847
static __inline void __attribute__((__always_inline__, __nodebug__))
848
_mm256_store_si256(__m256i *__p, __m256i __a)
849
{
850
  *__p = __a;
851
}
852

    
853
static __inline void __attribute__((__always_inline__, __nodebug__))
854
_mm256_storeu_si256(__m256i *__p, __m256i __a)
855
{
856
  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
857
}
858

    
859
/* Conditional load ops */
860
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
861
_mm_maskload_pd(double const *__p, __m128d __m)
862
{
863
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
864
}
865

    
866
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
867
_mm256_maskload_pd(double const *__p, __m256d __m)
868
{
869
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
870
                                               (__v4df)__m);
871
}
872

    
873
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
874
_mm_maskload_ps(float const *__p, __m128 __m)
875
{
876
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
877
}
878

    
879
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
880
_mm256_maskload_ps(float const *__p, __m256 __m)
881
{
882
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
883
}
884

    
885
/* Conditional store ops */
886
static __inline void __attribute__((__always_inline__, __nodebug__))
887
_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
888
{
889
  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
890
}
891

    
892
static __inline void __attribute__((__always_inline__, __nodebug__))
893
_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
894
{
895
  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
896
}
897

    
898
static __inline void __attribute__((__always_inline__, __nodebug__))
899
_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
900
{
901
  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
902
}
903

    
904
static __inline void __attribute__((__always_inline__, __nodebug__))
905
_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
906
{
907
  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
908
}
909

    
910
/* Cacheability support ops */
911
static __inline void __attribute__((__always_inline__, __nodebug__))
912
_mm256_stream_si256(__m256i *__a, __m256i __b)
913
{
914
  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
915
}
916

    
917
static __inline void __attribute__((__always_inline__, __nodebug__))
918
_mm256_stream_pd(double *__a, __m256d __b)
919
{
920
  __builtin_ia32_movntpd256(__a, (__v4df)__b);
921
}
922

    
923
static __inline void __attribute__((__always_inline__, __nodebug__))
924
_mm256_stream_ps(float *__p, __m256 __a)
925
{
926
  __builtin_ia32_movntps256(__p, (__v8sf)__a);
927
}
928

    
929
/* Create vectors */
930
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
931
_mm256_set_pd(double __a, double __b, double __c, double __d)
932
{
933
  return (__m256d){ __d, __c, __b, __a };
934
}
935

    
936
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
937
_mm256_set_ps(float __a, float __b, float __c, float __d,
938
                    float __e, float __f, float __g, float __h)
939
{
940
  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
941
}
942

    
943
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
944
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
945
                             int __i4, int __i5, int __i6, int __i7)
946
{
947
  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
948
}
949

    
950
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
951
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
952
                             short __w11, short __w10, short __w09, short __w08,
953
                             short __w07, short __w06, short __w05, short __w04,
954
                             short __w03, short __w02, short __w01, short __w00)
955
{
956
  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
957
    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
958
}
959

    
960
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
961
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
962
                            char __b27, char __b26, char __b25, char __b24,
963
                            char __b23, char __b22, char __b21, char __b20,
964
                            char __b19, char __b18, char __b17, char __b16,
965
                            char __b15, char __b14, char __b13, char __b12,
966
                            char __b11, char __b10, char __b09, char __b08,
967
                            char __b07, char __b06, char __b05, char __b04,
968
                            char __b03, char __b02, char __b01, char __b00)
969
{
970
  return (__m256i)(__v32qi){
971
    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
972
    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
973
    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
974
    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
975
  };
976
}
977

    
978
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
979
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
980
{
981
  return (__m256i)(__v4di){ __d, __c, __b, __a };
982
}
983

    
984
/* Create vectors with elements in reverse order */
985
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
986
_mm256_setr_pd(double __a, double __b, double __c, double __d)
987
{
988
  return (__m256d){ __a, __b, __c, __d };
989
}
990

    
991
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
992
_mm256_setr_ps(float __a, float __b, float __c, float __d,
993
                           float __e, float __f, float __g, float __h)
994
{
995
  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
996
}
997

    
998
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
999
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
1000
                              int __i4, int __i5, int __i6, int __i7)
1001
{
1002
  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
1003
}
1004

    
1005
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1006
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
1007
                   short __w11, short __w10, short __w09, short __w08,
1008
                   short __w07, short __w06, short __w05, short __w04,
1009
                   short __w03, short __w02, short __w01, short __w00)
1010
{
1011
  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
1012
    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
1013
}
1014

    
1015
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1016
_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
1017
                             char __b27, char __b26, char __b25, char __b24,
1018
                             char __b23, char __b22, char __b21, char __b20,
1019
                             char __b19, char __b18, char __b17, char __b16,
1020
                             char __b15, char __b14, char __b13, char __b12,
1021
                             char __b11, char __b10, char __b09, char __b08,
1022
                             char __b07, char __b06, char __b05, char __b04,
1023
                             char __b03, char __b02, char __b01, char __b00)
1024
{
1025
  return (__m256i)(__v32qi){
1026
    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
1027
                __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
1028
                __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
1029
                __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
1030
}
1031

    
1032
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1033
_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
1034
{
1035
  return (__m256i)(__v4di){ __a, __b, __c, __d };
1036
}
1037

    
1038
/* Create vectors with repeated elements */
1039
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1040
_mm256_set1_pd(double __w)
1041
{
1042
  return (__m256d){ __w, __w, __w, __w };
1043
}
1044

    
1045
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1046
_mm256_set1_ps(float __w)
1047
{
1048
  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
1049
}
1050

    
1051
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1052
_mm256_set1_epi32(int __i)
1053
{
1054
  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
1055
}
1056

    
1057
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1058
_mm256_set1_epi16(short __w)
1059
{
1060
  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
1061
    __w, __w, __w, __w, __w, __w };
1062
}
1063

    
1064
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1065
_mm256_set1_epi8(char __b)
1066
{
1067
  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1068
    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1069
    __b, __b, __b, __b, __b, __b, __b };
1070
}
1071

    
1072
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1073
_mm256_set1_epi64x(long long __q)
1074
{
1075
  return (__m256i)(__v4di){ __q, __q, __q, __q };
1076
}
1077

    
1078
/* Create __zeroed vectors */
1079
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1080
_mm256_setzero_pd(void)
1081
{
1082
  return (__m256d){ 0, 0, 0, 0 };
1083
}
1084

    
1085
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1086
_mm256_setzero_ps(void)
1087
{
1088
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1089
}
1090

    
1091
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1092
_mm256_setzero_si256(void)
1093
{
1094
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1095
}
1096

    
1097
/* Cast between vector types */
1098
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1099
_mm256_castpd_ps(__m256d __a)
1100
{
1101
  return (__m256)__a;
1102
}
1103

    
1104
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1105
_mm256_castpd_si256(__m256d __a)
1106
{
1107
  return (__m256i)__a;
1108
}
1109

    
1110
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1111
_mm256_castps_pd(__m256 __a)
1112
{
1113
  return (__m256d)__a;
1114
}
1115

    
1116
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1117
_mm256_castps_si256(__m256 __a)
1118
{
1119
  return (__m256i)__a;
1120
}
1121

    
1122
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1123
_mm256_castsi256_ps(__m256i __a)
1124
{
1125
  return (__m256)__a;
1126
}
1127

    
1128
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1129
_mm256_castsi256_pd(__m256i __a)
1130
{
1131
  return (__m256d)__a;
1132
}
1133

    
1134
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
1135
_mm256_castpd256_pd128(__m256d __a)
1136
{
1137
  return __builtin_shufflevector(__a, __a, 0, 1);
1138
}
1139

    
1140
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
1141
_mm256_castps256_ps128(__m256 __a)
1142
{
1143
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
1144
}
1145

    
1146
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
1147
_mm256_castsi256_si128(__m256i __a)
1148
{
1149
  return __builtin_shufflevector(__a, __a, 0, 1);
1150
}
1151

    
1152
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1153
_mm256_castpd128_pd256(__m128d __a)
1154
{
1155
  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
1156
}
1157

    
1158
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1159
_mm256_castps128_ps256(__m128 __a)
1160
{
1161
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
1162
}
1163

    
1164
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1165
_mm256_castsi128_si256(__m128i __a)
1166
{
1167
  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
1168
}
1169

    
1170
/* SIMD load ops (unaligned) */
1171
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1172
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
1173
{
1174
  struct __loadu_ps {
1175
    __m128 __v;
1176
  } __attribute__((__packed__, __may_alias__));
1177

    
1178
  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
1179
  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
1180
}
1181

    
1182
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1183
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
1184
{
1185
  struct __loadu_pd {
1186
    __m128d __v;
1187
  } __attribute__((__packed__, __may_alias__));
1188
  
1189
  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
1190
  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
1191
}
1192

    
1193
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1194
_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
1195
{
1196
  struct __loadu_si128 {
1197
    __m128i __v;
1198
  } __attribute__((packed, may_alias));
1199
  __m256i __v256 = _mm256_castsi128_si256(
1200
    ((struct __loadu_si128*)__addr_lo)->__v);
1201
  return _mm256_insertf128_si256(__v256,
1202
                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
1203
}
1204

    
1205
/* SIMD store ops (unaligned) */
1206
static __inline void __attribute__((__always_inline__, __nodebug__))
1207
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
1208
{
1209
  __m128 __v128;
1210

    
1211
  __v128 = _mm256_castps256_ps128(__a);
1212
  __builtin_ia32_storeups(__addr_lo, __v128);
1213
  __v128 = _mm256_extractf128_ps(__a, 1);
1214
  __builtin_ia32_storeups(__addr_hi, __v128);
1215
}
1216

    
1217
static __inline void __attribute__((__always_inline__, __nodebug__))
1218
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
1219
{
1220
  __m128d __v128;
1221

    
1222
  __v128 = _mm256_castpd256_pd128(__a);
1223
  __builtin_ia32_storeupd(__addr_lo, __v128);
1224
  __v128 = _mm256_extractf128_pd(__a, 1);
1225
  __builtin_ia32_storeupd(__addr_hi, __v128);
1226
}
1227

    
1228
static __inline void __attribute__((__always_inline__, __nodebug__))
1229
_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
1230
{
1231
  __m128i __v128;
1232

    
1233
  __v128 = _mm256_castsi256_si128(__a);
1234
  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
1235
  __v128 = _mm256_extractf128_si256(__a, 1);
1236
  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
1237
}
1238

    
1239
#endif /* __AVXINTRIN_H */