      1 /* Copyright (C) 2002-2014 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify it
      6    under the terms of the GNU General Public License as published
      7    by the Free Software Foundation; either version 3, or (at your
      8    option) any later version.
      9 
     10    GCC is distributed in the hope that it will be useful, but WITHOUT
     11    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     12    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
     13    License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 #ifndef _MMINTRIN_H_INCLUDED
     25 #define _MMINTRIN_H_INCLUDED
     26 
     27 #ifndef __IWMMXT__
     28 #error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2)
     29 #endif
     30 
     31 
     32 #if defined __cplusplus
     33 extern "C" {
     34 /* Intrinsics use C name-mangling.  */
     35 #endif /* __cplusplus */
     36 
     37 /* The data type intended for user use.  */
     38 typedef unsigned long long __m64, __int64;
     39 
     40 /* Internal data types for implementing the intrinsics.  */
     41 typedef int __v2si __attribute__ ((vector_size (8)));
     42 typedef short __v4hi __attribute__ ((vector_size (8)));
     43 typedef signed char __v8qi __attribute__ ((vector_size (8)));
     44 
     45 /* Provided for source compatibility with MMX.  */
     46 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     47 _mm_empty (void)
     48 {
     49 }
     50 
     51 /* "Convert" __m64 and __int64 into each other.  */
     52 static __inline __m64
     53 _mm_cvtsi64_m64 (__int64 __i)
     54 {
     55   return __i;
     56 }
     57 
     58 static __inline __int64
     59 _mm_cvtm64_si64 (__m64 __i)
     60 {
     61   return __i;
     62 }
     63 
     64 static __inline int
     65 _mm_cvtsi64_si32 (__int64 __i)
     66 {
     67   return __i;
     68 }
     69 
     70 static __inline __int64
     71 _mm_cvtsi32_si64 (int __i)
     72 {
     73   return (__i & 0xffffffff);
     74 }
     75 
     76 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
     77    the result, and the four 16-bit values from M2 into the upper four 8-bit
     78    values of the result, all with signed saturation.  */
     79 static __inline __m64
     80 _mm_packs_pi16 (__m64 __m1, __m64 __m2)
     81 {
     82   return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2);
     83 }
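
        /* A usage sketch (illustrative values only, assuming an iWMMXt-enabled
           target; _mm_set_pi16 is defined further down in this header):

             __m64 lo = _mm_set_pi16 (300, -200, 5, -5);
             __m64 hi = _mm_set_pi16 (1, 2, 3, 4);
             __m64 r  = _mm_packs_pi16 (lo, hi);

           The bytes of r, least significant first, are
           -5, 5, -128 (saturated), 127 (saturated), 4, 3, 2, 1.  */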
     84 
     85 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
     86    the result, and the two 32-bit values from M2 into the upper two 16-bit
     87    values of the result, all with signed saturation.  */
     88 static __inline __m64
     89 _mm_packs_pi32 (__m64 __m1, __m64 __m2)
     90 {
     91   return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2);
     92 }
     93 
     94 /* Copy the 64-bit value from M1 into the lower 32 bits of the result, and
     95    the 64-bit value from M2 into the upper 32 bits of the result, all with
     96    signed saturation for values that do not fit exactly into 32 bits.  */
     97 static __inline __m64
     98 _mm_packs_pi64 (__m64 __m1, __m64 __m2)
     99 {
    100   return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2);
    101 }
    102 
    103 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
    104    the result, and the four 16-bit values from M2 into the upper four 8-bit
    105    values of the result, all with unsigned saturation.  */
    106 static __inline __m64
    107 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
    108 {
    109   return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2);
    110 }
    111 
    112 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of
    113    the result, and the two 32-bit values from M2 into the upper two 16-bit
    114    values of the result, all with unsigned saturation.  */
    115 static __inline __m64
    116 _mm_packs_pu32 (__m64 __m1, __m64 __m2)
    117 {
    118   return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2);
    119 }
    120 
    121 /* Copy the 64-bit value from M1 into the lower 32 bits of the result, and
    122    the 64-bit value from M2 into the upper 32 bits of the result, all with
    123    unsigned saturation for values that do not fit exactly into 32 bits.  */
    124 static __inline __m64
    125 _mm_packs_pu64 (__m64 __m1, __m64 __m2)
    126 {
    127   return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2);
    128 }
    129 
    130 /* Interleave the four 8-bit values from the high half of M1 with the four
    131    8-bit values from the high half of M2.  */
    132 static __inline __m64
    133 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
    134 {
    135   return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2);
    136 }
    137 
    138 /* Interleave the two 16-bit values from the high half of M1 with the two
    139    16-bit values from the high half of M2.  */
    140 static __inline __m64
    141 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
    142 {
    143   return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2);
    144 }
    145 
    146 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
    147    value from the high half of M2.  */
    148 static __inline __m64
    149 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
    150 {
    151   return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2);
    152 }
    153 
    154 /* Interleave the four 8-bit values from the low half of M1 with the four
    155    8-bit values from the low half of M2.  */
    156 static __inline __m64
    157 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
    158 {
    159   return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2);
    160 }
    161 
    162 /* Interleave the two 16-bit values from the low half of M1 with the two
    163    16-bit values from the low half of M2.  */
    164 static __inline __m64
    165 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
    166 {
    167   return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2);
    168 }
    169 
    170 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
    171    value from the low half of M2.  */
    172 static __inline __m64
    173 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
    174 {
    175   return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2);
    176 }
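
        /* A sketch of the interleaving, assuming the MMX element order (the
           value from M1 comes first); _mm_set_pi8 is defined below:

             __m64 a  = _mm_set_pi8 (7, 6, 5, 4, 3, 2, 1, 0);
             __m64 b  = _mm_set_pi8 (17, 16, 15, 14, 13, 12, 11, 10);
             __m64 lo = _mm_unpacklo_pi8 (a, b);
             __m64 hi = _mm_unpackhi_pi8 (a, b);

           lo then holds the bytes 0, 10, 1, 11, 2, 12, 3, 13 and hi the bytes
           4, 14, 5, 15, 6, 16, 7, 17, least significant first.  */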
    177 
    178 /* Take the four 8-bit values from the low half of M1, sign extend them,
    179    and return the result as a vector of four 16-bit quantities.  */
    180 static __inline __m64
    181 _mm_unpackel_pi8 (__m64 __m1)
    182 {
    183   return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1);
    184 }
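
        /* For instance (a sketch with assumed values):

             __m64 v = _mm_set_pi8 (0, 0, 0, 0, 4, -3, 2, -1);
             __m64 r = _mm_unpackel_pi8 (v);

           The four 16-bit elements of r are -1, 2, -3, 4, least significant
           first, each sign extended from the corresponding byte of v.  */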
    185 
    186 /* Take the two 16-bit values from the low half of M1, sign extend them,
    187    and return the result as a vector of two 32-bit quantities.  */
    188 static __inline __m64
    189 _mm_unpackel_pi16 (__m64 __m1)
    190 {
    191   return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1);
    192 }
    193 
    194 /* Take the 32-bit value from the low half of M1, and return it sign extended
    195   to 64 bits.  */
    196 static __inline __m64
    197 _mm_unpackel_pi32 (__m64 __m1)
    198 {
    199   return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1);
    200 }
    201 
    202 /* Take the four 8-bit values from the high half of M1, sign extend them,
    203    and return the result as a vector of four 16-bit quantities.  */
    204 static __inline __m64
    205 _mm_unpackeh_pi8 (__m64 __m1)
    206 {
    207   return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1);
    208 }
    209 
    210 /* Take the two 16-bit values from the high half of M1, sign extend them,
    211    and return the result as a vector of two 32-bit quantities.  */
    212 static __inline __m64
    213 _mm_unpackeh_pi16 (__m64 __m1)
    214 {
    215   return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1);
    216 }
    217 
    218 /* Take the 32-bit value from the high half of M1, and return it sign extended
    219   to 64 bits.  */
    220 static __inline __m64
    221 _mm_unpackeh_pi32 (__m64 __m1)
    222 {
    223   return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1);
    224 }
    225 
    226 /* Take the four 8-bit values from the low half of M1, zero extend them,
    227    and return the result as a vector of four 16-bit quantities.  */
    228 static __inline __m64
    229 _mm_unpackel_pu8 (__m64 __m1)
    230 {
    231   return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1);
    232 }
    233 
    234 /* Take the two 16-bit values from the low half of M1, zero extend them,
    235    and return the result as a vector of two 32-bit quantities.  */
    236 static __inline __m64
    237 _mm_unpackel_pu16 (__m64 __m1)
    238 {
    239   return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1);
    240 }
    241 
    242 /* Take the 32-bit value from the low half of M1, and return it zero extended
    243   to 64 bits.  */
    244 static __inline __m64
    245 _mm_unpackel_pu32 (__m64 __m1)
    246 {
    247   return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1);
    248 }
    249 
    250 /* Take the four 8-bit values from the high half of M1, zero extend them,
    251    and return the result as a vector of four 16-bit quantities.  */
    252 static __inline __m64
    253 _mm_unpackeh_pu8 (__m64 __m1)
    254 {
    255   return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1);
    256 }
    257 
    258 /* Take the two 16-bit values from the high half of M1, zero extend them,
    259    and return the result as a vector of two 32-bit quantities.  */
    260 static __inline __m64
    261 _mm_unpackeh_pu16 (__m64 __m1)
    262 {
    263   return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1);
    264 }
    265 
    266 /* Take the 32-bit value from the high half of M1, and return it zero extended
    267   to 64 bits.  */
    268 static __inline __m64
    269 _mm_unpackeh_pu32 (__m64 __m1)
    270 {
    271   return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1);
    272 }
    273 
    274 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
    275 static __inline __m64
    276 _mm_add_pi8 (__m64 __m1, __m64 __m2)
    277 {
    278   return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2);
    279 }
    280 
    281 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
    282 static __inline __m64
    283 _mm_add_pi16 (__m64 __m1, __m64 __m2)
    284 {
    285   return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2);
    286 }
    287 
    288 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
    289 static __inline __m64
    290 _mm_add_pi32 (__m64 __m1, __m64 __m2)
    291 {
    292   return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2);
    293 }
    294 
    295 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
    296    saturated arithmetic.  */
    297 static __inline __m64
    298 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
    299 {
    300   return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2);
    301 }
    302 
    303 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
    304    saturated arithmetic.  */
    305 static __inline __m64
    306 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
    307 {
    308   return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2);
    309 }
    310 
    311 /* Add the 32-bit values in M1 to the 32-bit values in M2 using signed
    312    saturated arithmetic.  */
    313 static __inline __m64
    314 _mm_adds_pi32 (__m64 __m1, __m64 __m2)
    315 {
    316   return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2);
    317 }
    318 
    319 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
    320    saturated arithmetic.  */
    321 static __inline __m64
    322 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
    323 {
    324   return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2);
    325 }
    326 
    327 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
    328    saturated arithmetic.  */
    329 static __inline __m64
    330 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
    331 {
    332   return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2);
    333 }
    334 
    335 /* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned
    336    saturated arithmetic.  */
    337 static __inline __m64
    338 _mm_adds_pu32 (__m64 __m1, __m64 __m2)
    339 {
    340   return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2);
    341 }
    342 
    343 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
    344 static __inline __m64
    345 _mm_sub_pi8 (__m64 __m1, __m64 __m2)
    346 {
    347   return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2);
    348 }
    349 
    350 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
    351 static __inline __m64
    352 _mm_sub_pi16 (__m64 __m1, __m64 __m2)
    353 {
    354   return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2);
    355 }
    356 
    357 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
    358 static __inline __m64
    359 _mm_sub_pi32 (__m64 __m1, __m64 __m2)
    360 {
    361   return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2);
    362 }
    363 
    364 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
    365    saturating arithmetic.  */
    366 static __inline __m64
    367 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
    368 {
    369   return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2);
    370 }
    371 
    372 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
    373    signed saturating arithmetic.  */
    374 static __inline __m64
    375 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
    376 {
    377   return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2);
    378 }
    379 
    380 /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
    381    signed saturating arithmetic.  */
    382 static __inline __m64
    383 _mm_subs_pi32 (__m64 __m1, __m64 __m2)
    384 {
    385   return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2);
    386 }
    387 
    388 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
    389    unsigned saturating arithmetic.  */
    390 static __inline __m64
    391 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
    392 {
    393   return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2);
    394 }
    395 
    396 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
    397    unsigned saturating arithmetic.  */
    398 static __inline __m64
    399 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
    400 {
    401   return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2);
    402 }
    403 
    404 /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
    405    unsigned saturating arithmetic.  */
    406 static __inline __m64
    407 _mm_subs_pu32 (__m64 __m1, __m64 __m2)
    408 {
    409   return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2);
    410 }
    411 
    412 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
    413    four 32-bit intermediate results, which are then summed by pairs to
    414    produce two 32-bit results.  */
    415 static __inline __m64
    416 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
    417 {
    418   return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2);
    419 }
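
        /* For example (a sketch with assumed values):

             __m64 a = _mm_set_pi16 (4, 3, 2, 1);
             __m64 b = _mm_set_pi16 (8, 7, 6, 5);
             __m64 r = _mm_madd_pi16 (a, b);

           The two 32-bit elements of r are 1*5 + 2*6 == 17 and
           3*7 + 4*8 == 53, least significant first.  */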
    420 
    421 /* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
    422    in M2, producing four 32-bit intermediate results, which are then summed
    423    by pairs to produce two 32-bit results.  */
    424 static __inline __m64
    425 _mm_madd_pu16 (__m64 __m1, __m64 __m2)
    426 {
    427   return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2);
    428 }
    429 
    430 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
    431    M2 and produce the high 16 bits of the 32-bit results.  */
    432 static __inline __m64
    433 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
    434 {
    435   return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2);
    436 }
    437 
    438 /* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
    439    in M2 and produce the high 16 bits of the 32-bit results.  */
    440 static __inline __m64
    441 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
    442 {
    443   return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2);
    444 }
    445 
    446 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
    447    the low 16 bits of the results.  */
    448 static __inline __m64
    449 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
    450 {
    451   return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2);
    452 }
    453 
    454 /* Shift four 16-bit values in M left by COUNT.  */
    455 static __inline __m64
    456 _mm_sll_pi16 (__m64 __m, __m64 __count)
    457 {
    458   return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count);
    459 }
    460 
    461 static __inline __m64
    462 _mm_slli_pi16 (__m64 __m, int __count)
    463 {
    464   return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count);
    465 }
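
        /* Both forms shift every element by the same amount: the _mm_sll_*
           variants take the count in the bottom of an __m64, the _mm_slli_*
           variants take an ordinary int.  A sketch with assumed values:

             __m64 v  = _mm_set1_pi16 (1);
             __m64 r1 = _mm_slli_pi16 (v, 3);                    each element == 8
             __m64 r2 = _mm_sll_pi16 (v, _mm_cvtsi32_si64 (3));  same result  */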
    466 
    467 /* Shift two 32-bit values in M left by COUNT.  */
    468 static __inline __m64
    469 _mm_sll_pi32 (__m64 __m, __m64 __count)
    470 {
    471   return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count);
    472 }
    473 
    474 static __inline __m64
    475 _mm_slli_pi32 (__m64 __m, int __count)
    476 {
    477   return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count);
    478 }
    479 
    480 /* Shift the 64-bit value in M left by COUNT.  */
    481 static __inline __m64
    482 _mm_sll_si64 (__m64 __m, __m64 __count)
    483 {
    484   return (__m64) __builtin_arm_wslld (__m, __count);
    485 }
    486 
    487 static __inline __m64
    488 _mm_slli_si64 (__m64 __m, int __count)
    489 {
    490   return (__m64) __builtin_arm_wslldi (__m, __count);
    491 }
    492 
    493 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
    494 static __inline __m64
    495 _mm_sra_pi16 (__m64 __m, __m64 __count)
    496 {
    497   return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count);
    498 }
    499 
    500 static __inline __m64
    501 _mm_srai_pi16 (__m64 __m, int __count)
    502 {
    503   return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count);
    504 }
    505 
    506 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
    507 static __inline __m64
    508 _mm_sra_pi32 (__m64 __m, __m64 __count)
    509 {
    510   return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count);
    511 }
    512 
    513 static __inline __m64
    514 _mm_srai_pi32 (__m64 __m, int __count)
    515 {
    516   return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count);
    517 }
    518 
    519 /* Shift the 64-bit value in M right by COUNT; shift in the sign bit.  */
    520 static __inline __m64
    521 _mm_sra_si64 (__m64 __m, __m64 __count)
    522 {
    523   return (__m64) __builtin_arm_wsrad (__m, __count);
    524 }
    525 
    526 static __inline __m64
    527 _mm_srai_si64 (__m64 __m, int __count)
    528 {
    529   return (__m64) __builtin_arm_wsradi (__m, __count);
    530 }
    531 
    532 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
    533 static __inline __m64
    534 _mm_srl_pi16 (__m64 __m, __m64 __count)
    535 {
    536   return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count);
    537 }
    538 
    539 static __inline __m64
    540 _mm_srli_pi16 (__m64 __m, int __count)
    541 {
    542   return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count);
    543 }
    544 
    545 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
    546 static __inline __m64
    547 _mm_srl_pi32 (__m64 __m, __m64 __count)
    548 {
    549   return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count);
    550 }
    551 
    552 static __inline __m64
    553 _mm_srli_pi32 (__m64 __m, int __count)
    554 {
    555   return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count);
    556 }
    557 
    558 /* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
    559 static __inline __m64
    560 _mm_srl_si64 (__m64 __m, __m64 __count)
    561 {
    562   return (__m64) __builtin_arm_wsrld (__m, __count);
    563 }
    564 
    565 static __inline __m64
    566 _mm_srli_si64 (__m64 __m, int __count)
    567 {
    568   return (__m64) __builtin_arm_wsrldi (__m, __count);
    569 }
    570 
    571 /* Rotate four 16-bit values in M right by COUNT.  */
    572 static __inline __m64
    573 _mm_ror_pi16 (__m64 __m, __m64 __count)
    574 {
    575   return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count);
    576 }
    577 
    578 static __inline __m64
    579 _mm_rori_pi16 (__m64 __m, int __count)
    580 {
    581   return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count);
    582 }
    583 
    584 /* Rotate two 32-bit values in M right by COUNT.  */
    585 static __inline __m64
    586 _mm_ror_pi32 (__m64 __m, __m64 __count)
    587 {
    588   return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count);
    589 }
    590 
    591 static __inline __m64
    592 _mm_rori_pi32 (__m64 __m, int __count)
    593 {
    594   return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count);
    595 }
    596 
    597 /* Rotate the 64-bit value in M right by COUNT.  */
    598 static __inline __m64
    599 _mm_ror_si64 (__m64 __m, __m64 __count)
    600 {
    601   return (__m64) __builtin_arm_wrord (__m, __count);
    602 }
    603 
    604 static __inline __m64
    605 _mm_rori_si64 (__m64 __m, int __count)
    606 {
    607   return (__m64) __builtin_arm_wrordi (__m, __count);
    608 }
    609 
    610 /* Bit-wise AND the 64-bit values in M1 and M2.  */
    611 static __inline __m64
    612 _mm_and_si64 (__m64 __m1, __m64 __m2)
    613 {
    614   return __builtin_arm_wand (__m1, __m2);
    615 }
    616 
    617 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
    618    64-bit value in M2.  */
    619 static __inline __m64
    620 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
    621 {
    622   return __builtin_arm_wandn (__m2, __m1);
    623 }
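
        /* Note the operand order: the first argument is the one that is
           complemented, as in the MMX version of this intrinsic, while the
           underlying wandn builtin complements its second operand, hence the
           swapped arguments above.  A sketch with assumed values:

             __m64 mask = _mm_set1_pi8 (0x0f);
             __m64 data = _mm_set1_pi8 (0x5a);
             __m64 r    = _mm_andnot_si64 (mask, data);   each byte == 0x50  */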
    624 
    625 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
    626 static __inline __m64
    627 _mm_or_si64 (__m64 __m1, __m64 __m2)
    628 {
    629   return __builtin_arm_wor (__m1, __m2);
    630 }
    631 
    632 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
    633 static __inline __m64
    634 _mm_xor_si64 (__m64 __m1, __m64 __m2)
    635 {
    636   return __builtin_arm_wxor (__m1, __m2);
    637 }
    638 
    639 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
    640    test is true and zero if false.  */
    641 static __inline __m64
    642 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
    643 {
    644   return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
    645 }
    646 
    647 static __inline __m64
    648 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
    649 {
    650   return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2);
    651 }
    652 
    653 static __inline __m64
    654 _mm_cmpgt_pu8 (__m64 __m1, __m64 __m2)
    655 {
    656   return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2);
    657 }
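
        /* The all-ones element masks combine naturally with the bit-wise
           operations above, e.g. a branch-free per-byte signed maximum
           (a sketch with assumed operands a and b):

             __m64 m   = _mm_cmpgt_pi8 (a, b);
             __m64 max = _mm_or_si64 (_mm_and_si64 (m, a),
                                      _mm_andnot_si64 (m, b));  */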
    658 
    659 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
    660    the test is true and zero if false.  */
    661 static __inline __m64
    662 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
    663 {
    664   return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2);
    665 }
    666 
    667 static __inline __m64
    668 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
    669 {
    670   return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2);
    671 }
    672 
    673 static __inline __m64
    674 _mm_cmpgt_pu16 (__m64 __m1, __m64 __m2)
    675 {
    676   return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2);
    677 }
    678 
    679 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
    680    the test is true and zero if false.  */
    681 static __inline __m64
    682 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
    683 {
    684   return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2);
    685 }
    686 
    687 static __inline __m64
    688 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
    689 {
    690   return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2);
    691 }
    692 
    693 static __inline __m64
    694 _mm_cmpgt_pu32 (__m64 __m1, __m64 __m2)
    695 {
    696   return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2);
    697 }
    698 
    699 /* Multiply the unsigned 16-bit values in __B and __C element-wise, sum the
    700    four products, and add that sum to the accumulator __A.  */
    701 static __inline __m64
    702 _mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C)
    703 {
    704   return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C);
    705 }
    706 
    707 /* Multiply the signed 16-bit values in __B and __C element-wise, sum the
    708    four products, and add that sum to the accumulator __A.  */
    709 static __inline __m64
    710 _mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C)
    711 {
    712   return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C);
    713 }
    714 
    715 /* Multiply the unsigned 16-bit values in __A and __B element-wise and
    716    return the sum of the four products (the accumulator starts at zero).  */
    717 static __inline __m64
    718 _mm_macz_pu16 (__m64 __A, __m64 __B)
    719 {
    720   return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B);
    721 }
    722 
    723 /* Multiply the signed 16-bit values in __A and __B element-wise and
    724    return the sum of the four products (the accumulator starts at zero).  */
    725 static __inline __m64
    726 _mm_macz_pi16 (__m64 __A, __m64 __B)
    727 {
    728   return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B);
    729 }
    730 
    731 /* Accumulate across all unsigned 8-bit values in __A.  */
    732 static __inline __m64
    733 _mm_acc_pu8 (__m64 __A)
    734 {
    735   return __builtin_arm_waccb ((__v8qi)__A);
    736 }
    737 
    738 /* Accumulate across all unsigned 16-bit values in __A.  */
    739 static __inline __m64
    740 _mm_acc_pu16 (__m64 __A)
    741 {
    742   return __builtin_arm_wacch ((__v4hi)__A);
    743 }
    744 
    745 /* Accumulate across all unsigned 32-bit values in __A.  */
    746 static __inline __m64
    747 _mm_acc_pu32 (__m64 __A)
    748 {
    749   return __builtin_arm_waccw ((__v2si)__A);
    750 }
    751 
    752 static __inline __m64
    753 _mm_mia_si64 (__m64 __A, int __B, int __C)
    754 {
    755   return __builtin_arm_tmia (__A, __B, __C);
    756 }
    757 
    758 static __inline __m64
    759 _mm_miaph_si64 (__m64 __A, int __B, int __C)
    760 {
    761   return __builtin_arm_tmiaph (__A, __B, __C);
    762 }
    763 
    764 static __inline __m64
    765 _mm_miabb_si64 (__m64 __A, int __B, int __C)
    766 {
    767   return __builtin_arm_tmiabb (__A, __B, __C);
    768 }
    769 
    770 static __inline __m64
    771 _mm_miabt_si64 (__m64 __A, int __B, int __C)
    772 {
    773   return __builtin_arm_tmiabt (__A, __B, __C);
    774 }
    775 
    776 static __inline __m64
    777 _mm_miatb_si64 (__m64 __A, int __B, int __C)
    778 {
    779   return __builtin_arm_tmiatb (__A, __B, __C);
    780 }
    781 
    782 static __inline __m64
    783 _mm_miatt_si64 (__m64 __A, int __B, int __C)
    784 {
    785   return __builtin_arm_tmiatt (__A, __B, __C);
    786 }
    787 
    788 /* Extract one of the elements of A and sign extend.  The selector N must
    789    be immediate.  */
    790 #define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N))
    791 #define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N))
    792 #define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N))
    793 
    794 /* Extract one of the elements of A and zero extend.  The selector N must
    795    be immediate.  */
    796 #define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N))
    797 #define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N))
    798 #define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N))
    799 
    800 /* Inserts word D into one of the elements of A.  The selector N must be
    801    immediate.  */
    802 #define _mm_insert_pi8(A, D, N) \
    803   ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N)))
    804 #define _mm_insert_pi16(A, D, N) \
    805   ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N)))
    806 #define _mm_insert_pi32(A, D, N) \
    807   ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N)))
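
        /* A sketch, assuming the MMX element numbering (N counts from the
           least-significant end and must be a compile-time constant):

             __m64 v = _mm_set_pi16 (40, 30, 20, 10);
             int  e2 = _mm_extract_pi16 (v, 2);        e2 == 30
             __m64 w = _mm_insert_pi16 (v, 99, 0);     element 0 becomes 99  */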
    808 
    809 /* Compute the element-wise maximum of signed 8-bit values.  */
    810 static __inline __m64
    811 _mm_max_pi8 (__m64 __A, __m64 __B)
    812 {
    813   return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B);
    814 }
    815 
    816 /* Compute the element-wise maximum of signed 16-bit values.  */
    817 static __inline __m64
    818 _mm_max_pi16 (__m64 __A, __m64 __B)
    819 {
    820   return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B);
    821 }
    822 
    823 /* Compute the element-wise maximum of signed 32-bit values.  */
    824 static __inline __m64
    825 _mm_max_pi32 (__m64 __A, __m64 __B)
    826 {
    827   return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B);
    828 }
    829 
    830 /* Compute the element-wise maximum of unsigned 8-bit values.  */
    831 static __inline __m64
    832 _mm_max_pu8 (__m64 __A, __m64 __B)
    833 {
    834   return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B);
    835 }
    836 
    837 /* Compute the element-wise maximum of unsigned 16-bit values.  */
    838 static __inline __m64
    839 _mm_max_pu16 (__m64 __A, __m64 __B)
    840 {
    841   return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B);
    842 }
    843 
    844 /* Compute the element-wise maximum of unsigned 32-bit values.  */
    845 static __inline __m64
    846 _mm_max_pu32 (__m64 __A, __m64 __B)
    847 {
    848   return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B);
    849 }
    850 
    851 /* Compute the element-wise minimum of signed 8-bit values.  */
    852 static __inline __m64
    853 _mm_min_pi8 (__m64 __A, __m64 __B)
    854 {
    855   return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B);
    856 }
    857 
    858 /* Compute the element-wise minimum of signed 16-bit values.  */
    859 static __inline __m64
    860 _mm_min_pi16 (__m64 __A, __m64 __B)
    861 {
    862   return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B);
    863 }
    864 
    865 /* Compute the element-wise minimum of signed 32-bit values.  */
    866 static __inline __m64
    867 _mm_min_pi32 (__m64 __A, __m64 __B)
    868 {
    869   return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B);
    870 }
    871 
    872 /* Compute the element-wise minimum of unsigned 8-bit values.  */
    873 static __inline __m64
    874 _mm_min_pu8 (__m64 __A, __m64 __B)
    875 {
    876   return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B);
    877 }
    878 
    879 /* Compute the element-wise minimum of unsigned 16-bit values.  */
    880 static __inline __m64
    881 _mm_min_pu16 (__m64 __A, __m64 __B)
    882 {
    883   return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B);
    884 }
    885 
    886 /* Compute the element-wise minimum of unsigned 32-bit values.  */
    887 static __inline __m64
    888 _mm_min_pu32 (__m64 __A, __m64 __B)
    889 {
    890   return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B);
    891 }
    892 
    893 /* Create an 8-bit mask of the signs of 8-bit values.  */
    894 static __inline int
    895 _mm_movemask_pi8 (__m64 __A)
    896 {
    897   return __builtin_arm_tmovmskb ((__v8qi)__A);
    898 }
    899 
    900 /* Create an 8-bit mask of the signs of 16-bit values.  */
    901 static __inline int
    902 _mm_movemask_pi16 (__m64 __A)
    903 {
    904   return __builtin_arm_tmovmskh ((__v4hi)__A);
    905 }
    906 
    907 /* Create an 8-bit mask of the signs of 32-bit values.  */
    908 static __inline int
    909 _mm_movemask_pi32 (__m64 __A)
    910 {
    911   return __builtin_arm_tmovmskw ((__v2si)__A);
    912 }
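
        /* A usage sketch: test whether any pair of bytes compared equal.

             if (_mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) != 0)
               ...  */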
    913 
    914 /* Return a combination of the four 16-bit values in A.  The selector
    915    must be an immediate.  */
    916 #define _mm_shuffle_pi16(A, N) \
    917   ((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N)))
    918 
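
        /* The selector packs two bits per result element, with result element 0
           encoded in the low two bits (the same layout as the x86 _MM_SHUFFLE
           macro).  A sketch, assuming that encoding:

             __m64 v = _mm_set_pi16 (3, 2, 1, 0);
             __m64 r = _mm_shuffle_pi16 (v, 0x1b);   0x1b == 00 01 10 11 binary

           r then holds the elements of v in reversed order.  */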
    919 
    920 /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
    921 static __inline __m64
    922 _mm_avg_pu8 (__m64 __A, __m64 __B)
    923 {
    924   return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B);
    925 }
    926 
    927 /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
    928 static __inline __m64
    929 _mm_avg_pu16 (__m64 __A, __m64 __B)
    930 {
    931   return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B);
    932 }
    933 
    934 /* Compute the averages of the unsigned 8-bit values in A and B.  */
    935 static __inline __m64
    936 _mm_avg2_pu8 (__m64 __A, __m64 __B)
    937 {
    938   return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B);
    939 }
    940 
    941 /* Compute the averages of the unsigned 16-bit values in A and B.  */
    942 static __inline __m64
    943 _mm_avg2_pu16 (__m64 __A, __m64 __B)
    944 {
    945   return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B);
    946 }
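
        /* The _mm_avg_* forms round up on ties, the _mm_avg2_* forms truncate.
           A sketch with assumed values:

             __m64 x  = _mm_set1_pi8 (3);
             __m64 y  = _mm_set1_pi8 (4);
             __m64 r  = _mm_avg_pu8 (x, y);    each byte == (3 + 4 + 1) >> 1 == 4
             __m64 r2 = _mm_avg2_pu8 (x, y);   each byte == (3 + 4) >> 1     == 3  */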
    947 
    948 /* Compute the sum of the absolute differences of the unsigned 8-bit
    949    values in A and B.  Return the value in the lower 16-bit word; the
    950    upper words are cleared.  */
    951 static __inline __m64
    952 _mm_sad_pu8 (__m64 __A, __m64 __B)
    953 {
    954   return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
    955 }
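
        /* A sketch with assumed values:

             __m64 a = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 10, 1);
             __m64 b = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 4, 3);
             __m64 d = _mm_sad_pu8 (a, b);
             int sum = _mm_cvtsi64_si32 (_mm_cvtm64_si64 (d));

           sum == |1 - 3| + |10 - 4| == 8.  */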
    956 
    957 static __inline __m64
    958 _mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
    959 {
    960   return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
    961 }
    962 
    963 /* Compute the sum of the absolute differences of the unsigned 16-bit
    964    values in A and B.  Return the value in the lower 32-bit word; the
    965    upper words are cleared.  */
    966 static __inline __m64
    967 _mm_sad_pu16 (__m64 __A, __m64 __B)
    968 {
    969   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
    970 }
    971 
    972 static __inline __m64
    973 _mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
    974 {
    975   return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
    976 }
    977 
    978 
    979 /* Compute the sum of the absolute differences of the unsigned 8-bit
    980    values in A and B.  Return the value in the lower 16-bit word; the
    981    upper words are cleared.  */
    982 static __inline __m64
    983 _mm_sadz_pu8 (__m64 __A, __m64 __B)
    984 {
    985   return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
    986 }
    987 
    988 /* Compute the sum of the absolute differences of the unsigned 16-bit
    989    values in A and B.  Return the value in the lower 32-bit word; the
    990    upper words are cleared.  */
    991 static __inline __m64
    992 _mm_sadz_pu16 (__m64 __A, __m64 __B)
    993 {
    994   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
    995 }
    996 
    997 #define _mm_align_si64(__A,__B, N) \
    998   (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
    999 
   1000 /* Creates a 64-bit zero.  */
   1001 static __inline __m64
   1002 _mm_setzero_si64 (void)
   1003 {
   1004   return __builtin_arm_wzero ();
   1005 }
   1006 
   1007 /* Set and Get arbitrary iWMMXt Control registers.
   1008    Note only registers 0-3 and 8-11 are currently defined,
   1009    the rest are reserved.  */
   1010 
   1011 static __inline void
   1012 _mm_setwcx (const int __value, const int __regno)
   1013 {
   1014   switch (__regno)
   1015     {
   1016     case 0:
   1017       __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
   1018       break;
   1019     case 1:
   1020       __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
   1021       break;
   1022     case 2:
   1023       __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
   1024       break;
   1025     case 3:
   1026       __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
   1027       break;
   1028     case 8:
   1029       __builtin_arm_setwcgr0 (__value);
   1030       break;
   1031     case 9:
   1032       __builtin_arm_setwcgr1 (__value);
   1033       break;
   1034     case 10:
   1035       __builtin_arm_setwcgr2 (__value);
   1036       break;
   1037     case 11:
   1038       __builtin_arm_setwcgr3 (__value);
   1039       break;
   1040     default:
   1041       break;
   1042     }
   1043 }
   1044 
   1045 static __inline int
   1046 _mm_getwcx (const int __regno)
   1047 {
   1048   int __value;
   1049   switch (__regno)
   1050     {
   1051     case 0:
   1052       __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
   1053       break;
   1054     case 1:
   1055       __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
   1056       break;
   1057     case 2:
   1058       __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
   1059       break;
   1060     case 3:
   1061       __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
   1062       break;
   1063     case 8:
   1064       return __builtin_arm_getwcgr0 ();
   1065     case 9:
   1066       return __builtin_arm_getwcgr1 ();
   1067     case 10:
   1068       return __builtin_arm_getwcgr2 ();
   1069     case 11:
   1070       return __builtin_arm_getwcgr3 ();
   1071     default:
   1072       break;
   1073     }
   1074   return __value;
   1075 }
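
        /* For example, reading and then clearing the SIMD saturation flags
           (register numbers as in the switches above; a usage sketch only):

             int ssf = _mm_getwcx (2);   wCSSF
             _mm_setwcx (0, 2);  */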
   1076 
   1077 /* Creates a vector of two 32-bit values; I0 is least significant.  */
   1078 static __inline __m64
   1079 _mm_set_pi32 (int __i1, int __i0)
   1080 {
   1081   union
   1082   {
   1083     __m64 __q;
   1084     struct
   1085     {
   1086       unsigned int __i0;
   1087       unsigned int __i1;
   1088     } __s;
   1089   } __u;
   1090 
   1091   __u.__s.__i0 = __i0;
   1092   __u.__s.__i1 = __i1;
   1093 
   1094   return __u.__q;
   1095 }
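
        /* A quick sketch of the element ordering (values assumed):

             __m64 v = _mm_set_pi32 (0x11111111, 0x22222222);
             int  lo = _mm_cvtsi64_si32 (_mm_cvtm64_si64 (v));

           lo == 0x22222222, since I0 occupies the least-significant word.  */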
   1096 
   1097 /* Creates a vector of four 16-bit values; W0 is least significant.  */
   1098 static __inline __m64
   1099 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
   1100 {
   1101   unsigned int __i1 = (unsigned short) __w3 << 16 | (unsigned short) __w2;
   1102   unsigned int __i0 = (unsigned short) __w1 << 16 | (unsigned short) __w0;
   1103 
   1104   return _mm_set_pi32 (__i1, __i0);
   1105 }
   1106 
   1107 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
   1108 static __inline __m64
   1109 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
   1110 	     char __b3, char __b2, char __b1, char __b0)
   1111 {
   1112   unsigned int __i1, __i0;
   1113 
   1114   __i1 = (unsigned char)__b7;
   1115   __i1 = __i1 << 8 | (unsigned char)__b6;
   1116   __i1 = __i1 << 8 | (unsigned char)__b5;
   1117   __i1 = __i1 << 8 | (unsigned char)__b4;
   1118 
   1119   __i0 = (unsigned char)__b3;
   1120   __i0 = __i0 << 8 | (unsigned char)__b2;
   1121   __i0 = __i0 << 8 | (unsigned char)__b1;
   1122   __i0 = __i0 << 8 | (unsigned char)__b0;
   1123 
   1124   return _mm_set_pi32 (__i1, __i0);
   1125 }
   1126 
   1127 /* Similar, but with the arguments in reverse order.  */
   1128 static __inline __m64
   1129 _mm_setr_pi32 (int __i0, int __i1)
   1130 {
   1131   return _mm_set_pi32 (__i1, __i0);
   1132 }
   1133 
   1134 static __inline __m64
   1135 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
   1136 {
   1137   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
   1138 }
   1139 
   1140 static __inline __m64
   1141 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
   1142 	      char __b4, char __b5, char __b6, char __b7)
   1143 {
   1144   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
   1145 }
   1146 
   1147 /* Creates a vector of two 32-bit values, both elements containing I.  */
   1148 static __inline __m64
   1149 _mm_set1_pi32 (int __i)
   1150 {
   1151   return _mm_set_pi32 (__i, __i);
   1152 }
   1153 
   1154 /* Creates a vector of four 16-bit values, all elements containing W.  */
   1155 static __inline __m64
   1156 _mm_set1_pi16 (short __w)
   1157 {
   1158   unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
   1159   return _mm_set1_pi32 (__i);
   1160 }
   1161 
   1162 /* Creates a vector of eight 8-bit values, all elements containing B.  */
   1163 static __inline __m64
   1164 _mm_set1_pi8 (char __b)
   1165 {
   1166   unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
   1167   unsigned int __i = __w << 16 | __w;
   1168   return _mm_set1_pi32 (__i);
   1169 }
   1170 
   1171 #ifdef __IWMMXT2__
   1172 static __inline __m64
   1173 _mm_abs_pi8 (__m64 m1)
   1174 {
   1175   return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
   1176 }
   1177 
   1178 static __inline __m64
   1179 _mm_abs_pi16 (__m64 m1)
   1180 {
   1181   return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
   1182 
   1183 }
   1184 
   1185 static __inline __m64
   1186 _mm_abs_pi32 (__m64 m1)
   1187 {
   1188   return (__m64) __builtin_arm_wabsw ((__v2si)m1);
   1189 
   1190 }
   1191 
   1192 static __inline __m64
   1193 _mm_addsubhx_pi16 (__m64 a, __m64 b)
   1194 {
   1195   return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
   1196 }
   1197 
   1198 static __inline __m64
   1199 _mm_absdiff_pu8 (__m64 a, __m64 b)
   1200 {
   1201   return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
   1202 }
   1203 
   1204 static __inline __m64
   1205 _mm_absdiff_pu16 (__m64 a, __m64 b)
   1206 {
   1207   return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
   1208 }
   1209 
   1210 static __inline __m64
   1211 _mm_absdiff_pu32 (__m64 a, __m64 b)
   1212 {
   1213   return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
   1214 }
   1215 
   1216 static __inline __m64
   1217 _mm_addc_pu16 (__m64 a, __m64 b)
   1218 {
   1219   __m64 result;
   1220   __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
   1221   return result;
   1222 }
   1223 
   1224 static __inline __m64
   1225 _mm_addc_pu32 (__m64 a, __m64 b)
   1226 {
   1227   __m64 result;
   1228   __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
   1229   return result;
   1230 }
   1231 
   1232 static __inline __m64
   1233 _mm_avg4_pu8 (__m64 a, __m64 b)
   1234 {
   1235   return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
   1236 }
   1237 
   1238 static __inline __m64
   1239 _mm_avg4r_pu8 (__m64 a, __m64 b)
   1240 {
   1241   return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
   1242 }
   1243 
   1244 static __inline __m64
   1245 _mm_maddx_pi16 (__m64 a, __m64 b)
   1246 {
   1247   return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
   1248 }
   1249 
   1250 static __inline __m64
   1251 _mm_maddx_pu16 (__m64 a, __m64 b)
   1252 {
   1253   return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
   1254 }
   1255 
   1256 static __inline __m64
   1257 _mm_msub_pi16 (__m64 a, __m64 b)
   1258 {
   1259   return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
   1260 }
   1261 
   1262 static __inline __m64
   1263 _mm_msub_pu16 (__m64 a, __m64 b)
   1264 {
   1265   return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
   1266 }
   1267 
   1268 static __inline __m64
   1269 _mm_mulhi_pi32 (__m64 a, __m64 b)
   1270 {
   1271   return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
   1272 }
   1273 
   1274 static __inline __m64
   1275 _mm_mulhi_pu32 (__m64 a, __m64 b)
   1276 {
   1277   return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
   1278 }
   1279 
   1280 static __inline __m64
   1281 _mm_mulhir_pi16 (__m64 a, __m64 b)
   1282 {
   1283   return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
   1284 }
   1285 
   1286 static __inline __m64
   1287 _mm_mulhir_pi32 (__m64 a, __m64 b)
   1288 {
   1289   return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
   1290 }
   1291 
   1292 static __inline __m64
   1293 _mm_mulhir_pu16 (__m64 a, __m64 b)
   1294 {
   1295   return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
   1296 }
   1297 
   1298 static __inline __m64
   1299 _mm_mulhir_pu32 (__m64 a, __m64 b)
   1300 {
   1301   return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
   1302 }
   1303 
   1304 static __inline __m64
   1305 _mm_mullo_pi32 (__m64 a, __m64 b)
   1306 {
   1307   return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
   1308 }
   1309 
   1310 static __inline __m64
   1311 _mm_qmulm_pi16 (__m64 a, __m64 b)
   1312 {
   1313   return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
   1314 }
   1315 
   1316 static __inline __m64
   1317 _mm_qmulm_pi32 (__m64 a, __m64 b)
   1318 {
   1319   return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
   1320 }
   1321 
   1322 static __inline __m64
   1323 _mm_qmulmr_pi16 (__m64 a, __m64 b)
   1324 {
   1325   return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
   1326 }
   1327 
   1328 static __inline __m64
   1329 _mm_qmulmr_pi32 (__m64 a, __m64 b)
   1330 {
   1331   return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
   1332 }
   1333 
   1334 static __inline __m64
   1335 _mm_subaddhx_pi16 (__m64 a, __m64 b)
   1336 {
   1337   return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
   1338 }
   1339 
   1340 static __inline __m64
   1341 _mm_addbhusl_pu8 (__m64 a, __m64 b)
   1342 {
   1343   return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
   1344 }
   1345 
   1346 static __inline __m64
   1347 _mm_addbhusm_pu8 (__m64 a, __m64 b)
   1348 {
   1349   return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
   1350 }
   1351 
   1352 #define _mm_qmiabb_pi32(acc, m1, m2) \
   1353   ({\
   1354    __m64 _acc = acc;\
   1355    __m64 _m1 = m1;\
   1356    __m64 _m2 = m2;\
   1357    _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1358    _acc;\
   1359    })
   1360 
   1361 #define _mm_qmiabbn_pi32(acc, m1, m2) \
   1362   ({\
   1363    __m64 _acc = acc;\
   1364    __m64 _m1 = m1;\
   1365    __m64 _m2 = m2;\
   1366    _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1367    _acc;\
   1368    })
   1369 
   1370 #define _mm_qmiabt_pi32(acc, m1, m2) \
   1371   ({\
   1372    __m64 _acc = acc;\
   1373    __m64 _m1 = m1;\
   1374    __m64 _m2 = m2;\
   1375    _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1376    _acc;\
   1377    })
   1378 
   1379 #define _mm_qmiabtn_pi32(acc, m1, m2) \
   1380   ({\
   1381    __m64 _acc=acc;\
   1382    __m64 _m1=m1;\
   1383    __m64 _m2=m2;\
   1384    _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1385    _acc;\
   1386    })
   1387 
   1388 #define _mm_qmiatb_pi32(acc, m1, m2) \
   1389   ({\
   1390    __m64 _acc = acc;\
   1391    __m64 _m1 = m1;\
   1392    __m64 _m2 = m2;\
   1393    _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1394    _acc;\
   1395    })
   1396 
   1397 #define _mm_qmiatbn_pi32(acc, m1, m2) \
   1398   ({\
   1399    __m64 _acc = acc;\
   1400    __m64 _m1 = m1;\
   1401    __m64 _m2 = m2;\
   1402    _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1403    _acc;\
   1404    })
   1405 
   1406 #define _mm_qmiatt_pi32(acc, m1, m2) \
   1407   ({\
   1408    __m64 _acc = acc;\
   1409    __m64 _m1 = m1;\
   1410    __m64 _m2 = m2;\
   1411    _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1412    _acc;\
   1413    })
   1414 
   1415 #define _mm_qmiattn_pi32(acc, m1, m2) \
   1416   ({\
   1417    __m64 _acc = acc;\
   1418    __m64 _m1 = m1;\
   1419    __m64 _m2 = m2;\
   1420    _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1421    _acc;\
   1422    })
   1423 
   1424 #define _mm_wmiabb_si64(acc, m1, m2) \
   1425   ({\
   1426    __m64 _acc = acc;\
   1427    __m64 _m1 = m1;\
   1428    __m64 _m2 = m2;\
   1429    _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1430    _acc;\
   1431    })
   1432 
   1433 #define _mm_wmiabbn_si64(acc, m1, m2) \
   1434   ({\
   1435    __m64 _acc = acc;\
   1436    __m64 _m1 = m1;\
   1437    __m64 _m2 = m2;\
   1438    _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1439    _acc;\
   1440    })
   1441 
   1442 #define _mm_wmiabt_si64(acc, m1, m2) \
   1443   ({\
   1444    __m64 _acc = acc;\
   1445    __m64 _m1 = m1;\
   1446    __m64 _m2 = m2;\
   1447    _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1448    _acc;\
   1449    })
   1450 
   1451 #define _mm_wmiabtn_si64(acc, m1, m2) \
   1452   ({\
   1453    __m64 _acc = acc;\
   1454    __m64 _m1 = m1;\
   1455    __m64 _m2 = m2;\
   1456    _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1457    _acc;\
   1458    })
   1459 
   1460 #define _mm_wmiatb_si64(acc, m1, m2) \
   1461   ({\
   1462    __m64 _acc = acc;\
   1463    __m64 _m1 = m1;\
   1464    __m64 _m2 = m2;\
   1465    _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1466    _acc;\
   1467    })
   1468 
   1469 #define _mm_wmiatbn_si64(acc, m1, m2) \
   1470   ({\
   1471    __m64 _acc = acc;\
   1472    __m64 _m1 = m1;\
   1473    __m64 _m2 = m2;\
   1474    _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1475    _acc;\
   1476    })
   1477 
   1478 #define _mm_wmiatt_si64(acc, m1, m2) \
   1479   ({\
   1480    __m64 _acc = acc;\
   1481    __m64 _m1 = m1;\
   1482    __m64 _m2 = m2;\
   1483    _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1484    _acc;\
   1485    })
   1486 
   1487 #define _mm_wmiattn_si64(acc, m1, m2) \
   1488   ({\
   1489    __m64 _acc = acc;\
   1490    __m64 _m1 = m1;\
   1491    __m64 _m2 = m2;\
   1492    _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   1493    _acc;\
   1494    })
   1495 
   1496 #define _mm_wmiawbb_si64(acc, m1, m2) \
   1497   ({\
   1498    __m64 _acc = acc;\
   1499    __m64 _m1 = m1;\
   1500    __m64 _m2 = m2;\
   1501    _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
   1502    _acc;\
   1503    })
   1504 
   1505 #define _mm_wmiawbbn_si64(acc, m1, m2) \
   1506   ({\
   1507    __m64 _acc = acc;\
   1508    __m64 _m1 = m1;\
   1509    __m64 _m2 = m2;\
   1510    _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
   1511    _acc;\
   1512    })
   1513 
   1514 #define _mm_wmiawbt_si64(acc, m1, m2) \
   1515   ({\
   1516    __m64 _acc = acc;\
   1517    __m64 _m1 = m1;\
   1518    __m64 _m2 = m2;\
   1519    _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
   1520    _acc;\
   1521    })
   1522 
   1523 #define _mm_wmiawbtn_si64(acc, m1, m2) \
   1524   ({\
   1525    __m64 _acc = acc;\
   1526    __m64 _m1 = m1;\
   1527    __m64 _m2 = m2;\
   1528    _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
   1529    _acc;\
   1530    })
   1531 
   1532 #define _mm_wmiawtb_si64(acc, m1, m2) \
   1533   ({\
   1534    __m64 _acc = acc;\
   1535    __m64 _m1 = m1;\
   1536    __m64 _m2 = m2;\
   1537    _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
   1538    _acc;\
   1539    })
   1540 
   1541 #define _mm_wmiawtbn_si64(acc, m1, m2) \
   1542   ({\
   1543    __m64 _acc = acc;\
   1544    __m64 _m1 = m1;\
   1545    __m64 _m2 = m2;\
   1546    _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
   1547    _acc;\
   1548    })
   1549 
   1550 #define _mm_wmiawtt_si64(acc, m1, m2) \
   1551   ({\
   1552    __m64 _acc = acc;\
   1553    __m64 _m1 = m1;\
   1554    __m64 _m2 = m2;\
   1555    _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
   1556    _acc;\
   1557    })
   1558 
   1559 #define _mm_wmiawttn_si64(acc, m1, m2) \
   1560   ({\
   1561    __m64 _acc = acc;\
   1562    __m64 _m1 = m1;\
   1563    __m64 _m2 = m2;\
   1564    _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
   1565    _acc;\
   1566    })
   1567 
   1568 /* The third argument must be an immediate.  */
   1569 #define _mm_merge_si64(a, b, n) \
   1570   ({\
   1571    __m64 result;\
   1572    result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
   1573    result;\
   1574    })
   1575 #endif  /* __IWMMXT2__ */
   1576 
   1577 static __inline __m64
   1578 _mm_alignr0_si64 (__m64 a, __m64 b)
   1579 {
   1580   return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
   1581 }
   1582 
   1583 static __inline __m64
   1584 _mm_alignr1_si64 (__m64 a, __m64 b)
   1585 {
   1586   return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
   1587 }
   1588 
   1589 static __inline __m64
   1590 _mm_alignr2_si64 (__m64 a, __m64 b)
   1591 {
   1592   return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
   1593 }
   1594 
   1595 static __inline __m64
   1596 _mm_alignr3_si64 (__m64 a, __m64 b)
   1597 {
   1598   return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
   1599 }
   1600 
   1601 static __inline void
   1602 _mm_tandcb ()
   1603 {
   1604   __asm __volatile ("tandcb r15");
   1605 }
   1606 
   1607 static __inline void
   1608 _mm_tandch ()
   1609 {
   1610   __asm __volatile ("tandch r15");
   1611 }
   1612 
   1613 static __inline void
   1614 _mm_tandcw ()
   1615 {
   1616   __asm __volatile ("tandcw r15");
   1617 }
   1618 
   1619 #define _mm_textrcb(n) \
   1620   ({\
   1621    __asm__ __volatile__ (\
   1622      "textrcb r15, %0" : : "i" (n));\
   1623    })
   1624 
   1625 #define _mm_textrch(n) \
   1626   ({\
   1627    __asm__ __volatile__ (\
   1628      "textrch r15, %0" : : "i" (n));\
   1629    })
   1630 
   1631 #define _mm_textrcw(n) \
   1632   ({\
   1633    __asm__ __volatile__ (\
   1634      "textrcw r15, %0" : : "i" (n));\
   1635    })
   1636 
   1637 static __inline void
   1638 _mm_torcb ()
   1639 {
   1640   __asm __volatile ("torcb r15");
   1641 }
   1642 
   1643 static __inline void
   1644 _mm_torch ()
   1645 {
   1646   __asm __volatile ("torch r15");
   1647 }
   1648 
   1649 static __inline void
   1650 _mm_torcw ()
   1651 {
   1652   __asm __volatile ("torcw r15");
   1653 }
   1654 
   1655 #ifdef __IWMMXT2__
   1656 static __inline void
   1657 _mm_torvscb ()
   1658 {
   1659   __asm __volatile ("torvscb r15");
   1660 }
   1661 
   1662 static __inline void
   1663 _mm_torvsch ()
   1664 {
   1665   __asm __volatile ("torvsch r15");
   1666 }
   1667 
   1668 static __inline void
   1669 _mm_torvscw ()
   1670 {
   1671   __asm __volatile ("torvscw r15");
   1672 }
   1673 #endif /* __IWMMXT2__ */
   1674 
   1675 static __inline __m64
   1676 _mm_tbcst_pi8 (int value)
   1677 {
   1678   return (__m64) __builtin_arm_tbcstb ((signed char) value);
   1679 }
   1680 
   1681 static __inline __m64
   1682 _mm_tbcst_pi16 (int value)
   1683 {
   1684   return (__m64) __builtin_arm_tbcsth ((short) value);
   1685 }
   1686 
   1687 static __inline __m64
   1688 _mm_tbcst_pi32 (int value)
   1689 {
   1690   return (__m64) __builtin_arm_tbcstw (value);
   1691 }
   1692 
   1693 #define _m_empty _mm_empty
   1694 #define _m_packsswb _mm_packs_pi16
   1695 #define _m_packssdw _mm_packs_pi32
   1696 #define _m_packuswb _mm_packs_pu16
   1697 #define _m_packusdw _mm_packs_pu32
   1698 #define _m_packssqd _mm_packs_pi64
   1699 #define _m_packusqd _mm_packs_pu64
   1700 #define _mm_packs_si64 _mm_packs_pi64
   1701 #define _mm_packs_su64 _mm_packs_pu64
   1702 #define _m_punpckhbw _mm_unpackhi_pi8
   1703 #define _m_punpckhwd _mm_unpackhi_pi16
   1704 #define _m_punpckhdq _mm_unpackhi_pi32
   1705 #define _m_punpcklbw _mm_unpacklo_pi8
   1706 #define _m_punpcklwd _mm_unpacklo_pi16
   1707 #define _m_punpckldq _mm_unpacklo_pi32
   1708 #define _m_punpckehsbw _mm_unpackeh_pi8
   1709 #define _m_punpckehswd _mm_unpackeh_pi16
   1710 #define _m_punpckehsdq _mm_unpackeh_pi32
   1711 #define _m_punpckehubw _mm_unpackeh_pu8
   1712 #define _m_punpckehuwd _mm_unpackeh_pu16
   1713 #define _m_punpckehudq _mm_unpackeh_pu32
   1714 #define _m_punpckelsbw _mm_unpackel_pi8
   1715 #define _m_punpckelswd _mm_unpackel_pi16
   1716 #define _m_punpckelsdq _mm_unpackel_pi32
   1717 #define _m_punpckelubw _mm_unpackel_pu8
   1718 #define _m_punpckeluwd _mm_unpackel_pu16
   1719 #define _m_punpckeludq _mm_unpackel_pu32
   1720 #define _m_paddb _mm_add_pi8
   1721 #define _m_paddw _mm_add_pi16
   1722 #define _m_paddd _mm_add_pi32
   1723 #define _m_paddsb _mm_adds_pi8
   1724 #define _m_paddsw _mm_adds_pi16
   1725 #define _m_paddsd _mm_adds_pi32
   1726 #define _m_paddusb _mm_adds_pu8
   1727 #define _m_paddusw _mm_adds_pu16
   1728 #define _m_paddusd _mm_adds_pu32
   1729 #define _m_psubb _mm_sub_pi8
   1730 #define _m_psubw _mm_sub_pi16
   1731 #define _m_psubd _mm_sub_pi32
   1732 #define _m_psubsb _mm_subs_pi8
   1733 #define _m_psubsw _mm_subs_pi16
   1734 #define _m_psubuw _mm_subs_pi32
   1735 #define _m_psubusb _mm_subs_pu8
   1736 #define _m_psubusw _mm_subs_pu16
   1737 #define _m_psubusd _mm_subs_pu32
   1738 #define _m_pmaddwd _mm_madd_pi16
   1739 #define _m_pmadduwd _mm_madd_pu16
   1740 #define _m_pmulhw _mm_mulhi_pi16
   1741 #define _m_pmulhuw _mm_mulhi_pu16
   1742 #define _m_pmullw _mm_mullo_pi16
   1743 #define _m_pmacsw _mm_mac_pi16
   1744 #define _m_pmacuw _mm_mac_pu16
   1745 #define _m_pmacszw _mm_macz_pi16
   1746 #define _m_pmacuzw _mm_macz_pu16
   1747 #define _m_paccb _mm_acc_pu8
   1748 #define _m_paccw _mm_acc_pu16
   1749 #define _m_paccd _mm_acc_pu32
   1750 #define _m_pmia _mm_mia_si64
   1751 #define _m_pmiaph _mm_miaph_si64
   1752 #define _m_pmiabb _mm_miabb_si64
   1753 #define _m_pmiabt _mm_miabt_si64
   1754 #define _m_pmiatb _mm_miatb_si64
   1755 #define _m_pmiatt _mm_miatt_si64
   1756 #define _m_psllw _mm_sll_pi16
   1757 #define _m_psllwi _mm_slli_pi16
   1758 #define _m_pslld _mm_sll_pi32
   1759 #define _m_pslldi _mm_slli_pi32
   1760 #define _m_psllq _mm_sll_si64
   1761 #define _m_psllqi _mm_slli_si64
   1762 #define _m_psraw _mm_sra_pi16
   1763 #define _m_psrawi _mm_srai_pi16
   1764 #define _m_psrad _mm_sra_pi32
   1765 #define _m_psradi _mm_srai_pi32
   1766 #define _m_psraq _mm_sra_si64
   1767 #define _m_psraqi _mm_srai_si64
   1768 #define _m_psrlw _mm_srl_pi16
   1769 #define _m_psrlwi _mm_srli_pi16
   1770 #define _m_psrld _mm_srl_pi32
   1771 #define _m_psrldi _mm_srli_pi32
   1772 #define _m_psrlq _mm_srl_si64
   1773 #define _m_psrlqi _mm_srli_si64
   1774 #define _m_prorw _mm_ror_pi16
   1775 #define _m_prorwi _mm_rori_pi16
   1776 #define _m_prord _mm_ror_pi32
   1777 #define _m_prordi _mm_rori_pi32
   1778 #define _m_prorq _mm_ror_si64
   1779 #define _m_prorqi _mm_rori_si64
   1780 #define _m_pand _mm_and_si64
   1781 #define _m_pandn _mm_andnot_si64
   1782 #define _m_por _mm_or_si64
   1783 #define _m_pxor _mm_xor_si64
   1784 #define _m_pcmpeqb _mm_cmpeq_pi8
   1785 #define _m_pcmpeqw _mm_cmpeq_pi16
   1786 #define _m_pcmpeqd _mm_cmpeq_pi32
   1787 #define _m_pcmpgtb _mm_cmpgt_pi8
   1788 #define _m_pcmpgtub _mm_cmpgt_pu8
   1789 #define _m_pcmpgtw _mm_cmpgt_pi16
   1790 #define _m_pcmpgtuw _mm_cmpgt_pu16
   1791 #define _m_pcmpgtd _mm_cmpgt_pi32
   1792 #define _m_pcmpgtud _mm_cmpgt_pu32
   1793 #define _m_pextrb _mm_extract_pi8
   1794 #define _m_pextrw _mm_extract_pi16
   1795 #define _m_pextrd _mm_extract_pi32
   1796 #define _m_pextrub _mm_extract_pu8
   1797 #define _m_pextruw _mm_extract_pu16
   1798 #define _m_pextrud _mm_extract_pu32
   1799 #define _m_pinsrb _mm_insert_pi8
   1800 #define _m_pinsrw _mm_insert_pi16
   1801 #define _m_pinsrd _mm_insert_pi32
   1802 #define _m_pmaxsb _mm_max_pi8
   1803 #define _m_pmaxsw _mm_max_pi16
   1804 #define _m_pmaxsd _mm_max_pi32
   1805 #define _m_pmaxub _mm_max_pu8
   1806 #define _m_pmaxuw _mm_max_pu16
   1807 #define _m_pmaxud _mm_max_pu32
   1808 #define _m_pminsb _mm_min_pi8
   1809 #define _m_pminsw _mm_min_pi16
   1810 #define _m_pminsd _mm_min_pi32
   1811 #define _m_pminub _mm_min_pu8
   1812 #define _m_pminuw _mm_min_pu16
   1813 #define _m_pminud _mm_min_pu32
   1814 #define _m_pmovmskb _mm_movemask_pi8
   1815 #define _m_pmovmskw _mm_movemask_pi16
   1816 #define _m_pmovmskd _mm_movemask_pi32
   1817 #define _m_pshufw _mm_shuffle_pi16
   1818 #define _m_pavgb _mm_avg_pu8
   1819 #define _m_pavgw _mm_avg_pu16
   1820 #define _m_pavg2b _mm_avg2_pu8
   1821 #define _m_pavg2w _mm_avg2_pu16
   1822 #define _m_psadbw _mm_sad_pu8
   1823 #define _m_psadwd _mm_sad_pu16
   1824 #define _m_psadzbw _mm_sadz_pu8
   1825 #define _m_psadzwd _mm_sadz_pu16
   1826 #define _m_paligniq _mm_align_si64
   1827 #define _m_cvt_si2pi _mm_cvtsi64_m64
   1828 #define _m_cvt_pi2si _mm_cvtm64_si64
   1829 #define _m_from_int _mm_cvtsi32_si64
   1830 #define _m_to_int _mm_cvtsi64_si32
   1831 
   1832 #if defined __cplusplus
   1833 }; /* End "C" */
   1834 #endif /* __cplusplus */
   1835 
   1836 #endif /* _MMINTRIN_H_INCLUDED */
   1837