/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H

#include <pmmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))

/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
///    integer vector operand at the index idx and of the length len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
///    The value from which bits are extracted.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than
///    64, the result is undefined. If the length and index are both zero,
///    bits [63:0] of parameter x are extracted. If the length is zero
///    but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
///    extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                  (char)(len), (char)(idx)))

/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
///    integer vector operand at the index and of the length specified by __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
///    The value from which bits are extracted.
/// \param __y
///    Specifies the index of the least significant bit at [13:8]
///    and the length at [5:0]; all other bits are ignored.
///    If bits [5:0] are zero, the length is interpreted as 64.
///    If the sum of the index and length is greater than 64, the result is
///    undefined. If the length and index are both zero, bits [63:0] of
///    parameter __x are extracted. If the length is zero but the index is
///    non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
///    extracted from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}

/// \brief Inserts bits of a specified length from the source integer vector
///    y into the lower 64 bits of the destination integer vector x at the
///    index idx and of the length len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
///                          const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length len and by the index idx specifying the least
///    significant bit.
/// \param y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand y of length len.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than
///    64, the result is undefined. If the length and index are both zero,
///    bits [63:0] of parameter y are inserted into parameter x. If the
///    length is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits
///    of destination operand x with the specified bitfields replaced by the
///    lower bits of source operand y. The upper 64 bits of the return value
///    are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
                                    (__v2di)(__m128i)(y), \
                                    (char)(len), (char)(idx)))

/// \brief Inserts bits of a specified length from the source integer vector
///    __y into the lower 64 bits of the destination integer vector __x at
///    the index and of the length specified by __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length and by the index of the least significant bit
///    specified by operand __y.
/// \param __y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand __y with length specified
///    by bits [69:64]. These are inserted into the destination at the index
///    specified by bits [77:72]; all other bits are ignored.
///    If bits [69:64] are zero, the length is interpreted as 64.
///    If the sum of the index and length is greater than 64, the result is
///    undefined. If the length and index are both zero, bits [63:0] of
///    parameter __y are inserted into parameter __x. If the length
///    is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits
///    of destination operand __x with the specified bitfields replaced by the
///    lower bits of source operand __y. The upper 64 bits of the return value
///    are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}

/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
///    The 64-bit memory location used to store the register value.
/// \param __a
///    The 64-bit double-precision floating-point register value to
///    be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(double *__p, __m128d __a)
{
  __builtin_ia32_movntsd(__p, (__v2df)__a);
}

/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
///    memory location. To minimize caching, the data is flagged as
///    non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
///    The 32-bit memory location used to store the register value.
/// \param __a
///    The 32-bit single-precision floating-point register value to
///    be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(float *__p, __m128 __a)
{
  __builtin_ia32_movntss(__p, (__v4sf)__a);
}

/* The attribute helper is private to this header; do not leak it to includers. */
#undef __DEFAULT_FN_ATTRS

#endif /* __AMMINTRIN_H */