/storage/packages/gcc/5.3.0/lib/gcc/x86_64-unknown-linux-gnu/5.3.0/include/avxintrin.h

/* Copyright (C) 2008-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

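/* Usage sketch (editorial addition, not part of the GCC header): the
   __may_alias__ attribute is what lets code reinterpret ordinary arrays as
   these vector types without a strict-aliasing violation, e.g. the aligned
   load idiom used further below:

     double buf[4] __attribute__ ((aligned (32))) = { 1.0, 2.0, 3.0, 4.0 };
     __m256d v = *(const __m256d *) buf;    // ok: __m256d may alias double
*/
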
/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f

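/* Usage sketch (editorial addition, not part of the GCC header): the
   predicate macros are the immediate argument of the vector compare
   intrinsics declared later in this file, e.g.

     __m256d a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
     __m256d b = _mm256_set1_pd (2.5);
     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OQ);   // all-ones where a < b
     int bits = _mm256_movemask_pd (lt);              // 0x3: elements 0 and 1
*/
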
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

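/* Usage sketch (editorial addition, not part of the GCC header): the addsub
   forms subtract in the even-numbered elements and add in the odd-numbered
   ones, which is convenient for interleaved complex arithmetic:

     __m256d a = _mm256_set1_pd (10.0);
     __m256d b = _mm256_set1_pd (1.0);
     __m256d r = _mm256_addsub_pd (a, b);   // { 9.0, 11.0, 9.0, 11.0 }
*/
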
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

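/* Usage sketch (editorial addition, not part of the GCC header): andnot
   computes (~A) & B bitwise, so passing a sign-bit mask as the first operand
   gives a branch-free absolute value:

     __m256d x    = _mm256_set_pd (-4.0, 3.0, -2.0, 1.0);
     __m256d mask = _mm256_set1_pd (-0.0);           // only the sign bit set
     __m256d ax   = _mm256_andnot_pd (mask, x);      // { 1.0, 2.0, 3.0, 4.0 }
*/
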
/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}

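/* Usage sketch (editorial addition, not part of the GCC header): for the
   immediate form, bit i of the mask selects element i of Y (0 keeps X); the
   variable form uses the sign bit of each mask element instead:

     __m256d x = _mm256_set1_pd (1.0);
     __m256d y = _mm256_set1_pd (2.0);
     __m256d r = _mm256_blend_pd (x, y, 0x5);   // { 2.0, 1.0, 2.0, 1.0 }
*/
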
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

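/* Usage sketch (editorial addition, not part of the GCC header): the dot
   product is computed within each 128-bit lane; the high four mask bits
   select which products are summed, the low four select which result slots
   receive the sum (the remaining slots are zeroed):

     __m256 a = _mm256_set1_ps (1.0f);
     __m256 b = _mm256_set1_ps (2.0f);
     __m256 d = _mm256_dp_ps (a, b, 0xF1);  // { 8, 0, 0, 0, 8, 0, 0, 0 }
*/
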
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

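/* Usage sketch (editorial addition, not part of the GCC header): the 256-bit
   shuffles apply the selector to each 128-bit lane separately; for
   _mm256_shuffle_pd, bits 0-1 control the low lane and bits 2-3 the high lane
   (even result elements come from A, odd ones from B):

     __m256d a = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);   // elements 0..3
     __m256d b = _mm256_set_pd (7.0, 6.0, 5.0, 4.0);
     __m256d r = _mm256_shuffle_pd (a, b, 0x0);        // { 0.0, 4.0, 2.0, 6.0 }
*/
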
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

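/* Usage sketch (editorial addition, not part of the GCC header): the "tt"
   conversions truncate toward zero, while the plain forms use the current
   rounding mode (round-to-nearest-even by default):

     __m256d v  = _mm256_set1_pd (2.7);
     __m128i r1 = _mm256_cvtpd_epi32 (v);    // { 3, 3, 3, 3 }
     __m128i r2 = _mm256_cvttpd_epi32 (v);   // { 2, 2, 2, 2 }
*/
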
#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif

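/* Usage sketch (editorial addition, not part of the GCC header): AVX has no
   256-bit element extract, so these helpers first pull out the 128-bit lane
   that holds element N and then run the SSE4.1 extract on it:

     __m256i v = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
     int e5 = _mm256_extract_epi32 (v, 5);           // 5 (from the high lane)
     __m128i hi = _mm256_extractf128_si256 (v, 1);   // elements 4..7
*/
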
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif

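/* Usage sketch (editorial addition, not part of the GCC header): for
   permute2f128, selector bits 1:0 choose the low result lane and bits 5:4 the
   high one (0/1 = lanes of X, 2/3 = lanes of Y; bit 3 or 7 zeroes a lane), so
   0x01 swaps the two halves of a single vector:

     __m256d v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);
     __m256d s = _mm256_permute2f128_pd (v, v, 0x01);  // { 2.0, 3.0, 0.0, 1.0 }
*/
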
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif

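/* Usage sketch (editorial addition, not part of the GCC header): a common
   way to build a 256-bit value from two 128-bit halves is a cast for the low
   lane followed by an insert into the high lane:

     __m128d lo = _mm_set1_pd (1.0);
     __m128d hi = _mm_set1_pd (2.0);
     __m256d v  = _mm256_insertf128_pd (_mm256_castpd128_pd256 (lo), hi, 1);
     // v = { 1.0, 1.0, 2.0, 2.0 }
*/
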
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

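/* Usage sketch (editorial addition, not part of the GCC header): the plain
   load/store forms require 32-byte alignment, the *u forms accept any
   address:

     float data[16] = { 0 };                 // arbitrary alignment
     __m256 a = _mm256_loadu_ps (data);      // safe anywhere
     float buf[8] __attribute__ ((aligned (32)));
     _mm256_store_ps (buf, a);               // ok: buf is 32-byte aligned
*/
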
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

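/* Usage sketch (editorial addition, not part of the GCC header): the sign
   bit of each mask element enables that lane; disabled lanes load as zero and
   are left untouched by the store, which is handy for loop remainders:

     double src[4] = { 1.0, 2.0, 3.0, 4.0 };
     __m256i m = _mm256_set_epi64x (0, 0, -1, -1);    // enable lanes 0 and 1
     __m256d v = _mm256_maskload_pd (src, m);         // { 1.0, 2.0, 0.0, 0.0 }
*/
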
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

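/* Usage sketch (editorial addition, not part of the GCC header): the rounding
   mode comes from the _MM_FROUND_* constants defined in <smmintrin.h>; the
   ceil/floor macros above are just fixed-mode wrappers:

     __m256d v    = _mm256_set1_pd (1.5);
     __m256d up   = _mm256_ceil_pd (v);                           // 2.0
     __m256d even = _mm256_round_pd (v, _MM_FROUND_TO_NEAREST_INT
                                        | _MM_FROUND_NO_EXC);     // 2.0
*/
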
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

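/* Usage sketch (editorial addition, not part of the GCC header): testz
   returns 1 when (M & V) is all zeros and testc when (~M & V) is; the pd/ps
   forms examine only the sign bits.  A common idiom is checking whether any
   compare lane matched:

     __m256d hit = _mm256_cmp_pd (a, b, _CMP_EQ_OQ);  // a, b defined elsewhere
     if (!_mm256_testz_pd (hit, hit))
       ;  // at least one element of a equals the corresponding element of b
*/
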
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

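/* Usage sketch (editorial addition, not part of the GCC header): the set
   functions take their arguments most-significant element first, so the first
   argument ends up highest in memory:

     double out[4];
     _mm256_storeu_pd (out, _mm256_set_pd (4.0, 3.0, 2.0, 1.0));
     // out[] == { 1.0, 2.0, 3.0, 4.0 }; use _mm256_setr_pd for memory order
*/
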
/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

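/* Usage sketch (editorial addition, not part of the GCC header): the 128->256
   casts leave the upper lane undefined, so zero it explicitly (or use an
   insert, as earlier) when the whole register must be well defined:

     __m128d lo   = _mm_set1_pd (5.0);
     __m256d wide = _mm256_castpd128_pd256 (lo);    // upper lane undefined
     __m256d safe = _mm256_insertf128_pd (_mm256_setzero_pd (), lo, 0);
*/
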
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */
