/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86INTRIN_H
#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __FMA4INTRIN_H
#define __FMA4INTRIN_H

#ifndef __FMA4__
# error "FMA4 instruction set is not enabled"
#else

#include <pmmintrin.h>

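/* Each intrinsic below maps to a single four-operand AMD FMA4 instruction:
 * the multiply and the add/subtract are fused, with no intermediate rounding.
 *
 * Illustrative usage sketch (assumes an FMA4-capable target and that FMA4
 * code generation is enabled, e.g. with -mfma4):
 *
 *   __m128 a = _mm_set1_ps(2.0f);
 *   __m128 b = _mm_set1_ps(3.0f);
 *   __m128 c = _mm_set1_ps(1.0f);
 *   __m128 r = _mm_macc_ps(a, b, c);  // each element: (2.0f * 3.0f) + 1.0f == 7.0f
 */

/* Multiply-add: each element of the result is (__A * __B) + __C.
 * The _ss/_sd forms (here and below) operate on the low element only; the
 * FMA4 scalar instructions clear the upper elements of the result. */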
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
}

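/* Multiply-subtract: each element of the result is (__A * __B) - __C. */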
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
}

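/* Negated multiply-add: each element of the result is -(__A * __B) + __C. */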
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
}

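/* Negated multiply-subtract: each element of the result is -(__A * __B) - __C. */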
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
}

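/* Multiply with alternating add/subtract: even-indexed elements are computed
 * as (__A * __B) - __C, odd-indexed elements as (__A * __B) + __C. */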
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
}

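/* Multiply with alternating subtract/add: even-indexed elements are computed
 * as (__A * __B) + __C, odd-indexed elements as (__A * __B) - __C. */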
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
}

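/* 256-bit forms of the operations above, with the same element-wise
 * semantics applied across the wider AVX vectors. */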
static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
}

static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
}

static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
}

static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
}

static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
}

static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
}

static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
}

static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
}

static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
}

static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
}

static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
}

static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
}

#endif /* __FMA4__ */

#endif /* __FMA4INTRIN_H */