/* Copyright (C) 2008-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using a constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128 to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */