Home | History | Annotate | Download | only in CodeGen
      1 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Werror | FileCheck %s
      2 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
      3 
      4 // Don't include mm_malloc.h, it's system specific.
      5 #define __MM_MALLOC_H
      6 
      7 #include <x86intrin.h>
      8 
      9 __m128i test_mm_add_epi8(__m128i A, __m128i B) {
          // Expects the IR for _mm_add_epi8 to contain a vector add on <16 x i8>.
     10   // CHECK-LABEL: test_mm_add_epi8
     11   // CHECK: add <16 x i8>
     12   return _mm_add_epi8(A, B);
     13 }
     14 
     15 __m128i test_mm_add_epi16(__m128i A, __m128i B) {
          // Expects the IR for _mm_add_epi16 to contain a vector add on <8 x i16>.
     16   // CHECK-LABEL: test_mm_add_epi16
     17   // CHECK: add <8 x i16>
     18   return _mm_add_epi16(A, B);
     19 }
     20 
     21 __m128i test_mm_add_epi32(__m128i A, __m128i B) {
          // Expects the IR for _mm_add_epi32 to contain a vector add on <4 x i32>.
     22   // CHECK-LABEL: test_mm_add_epi32
     23   // CHECK: add <4 x i32>
     24   return _mm_add_epi32(A, B);
     25 }
     26 
     27 __m128i test_mm_add_epi64(__m128i A, __m128i B) {
          // Expects the IR for _mm_add_epi64 to contain a vector add on <2 x i64>.
     28   // CHECK-LABEL: test_mm_add_epi64
     29   // CHECK: add <2 x i64>
     30   return _mm_add_epi64(A, B);
     31 }
     32 
     33 __m128d test_mm_add_pd(__m128d A, __m128d B) {
          // Expects the IR for _mm_add_pd to contain a vector fadd on <2 x double>.
     34   // CHECK-LABEL: test_mm_add_pd
     35   // CHECK: fadd <2 x double>
     36   return _mm_add_pd(A, B);
     37 }
     38 
     39 __m128d test_mm_add_sd(__m128d A, __m128d B) {
          // Expects the IR for _mm_add_sd to contain a scalar fadd on double.
     40   // CHECK-LABEL: test_mm_add_sd
     41   // CHECK: fadd double
     42   return _mm_add_sd(A, B);
     43 }
     44 
     45 __m128i test_mm_adds_epi8(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.padds.b returning <16 x i8>.
     46   // CHECK-LABEL: test_mm_adds_epi8
     47   // CHECK: call <16 x i8> @llvm.x86.sse2.padds.b
     48   return _mm_adds_epi8(A, B);
     49 }
     50 
     51 __m128i test_mm_adds_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.padds.w returning <8 x i16>.
     52   // CHECK-LABEL: test_mm_adds_epi16
     53   // CHECK: call <8 x i16> @llvm.x86.sse2.padds.w
     54   return _mm_adds_epi16(A, B);
     55 }
     56 
     57 __m128i test_mm_adds_epu8(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.paddus.b returning <16 x i8>.
     58   // CHECK-LABEL: test_mm_adds_epu8
     59   // CHECK: call <16 x i8> @llvm.x86.sse2.paddus.b
     60   return _mm_adds_epu8(A, B);
     61 }
     62 
     63 __m128i test_mm_adds_epu16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.paddus.w returning <8 x i16>.
     64   // CHECK-LABEL: test_mm_adds_epu16
     65   // CHECK: call <8 x i16> @llvm.x86.sse2.paddus.w
     66   return _mm_adds_epu16(A, B);
     67 }
     68 
     69 __m128d test_mm_and_pd(__m128d A, __m128d B) {
          // Expects the bitwise and to be emitted on <4 x i32>, although the
          // C-level operands are __m128d.
     70   // CHECK-LABEL: test_mm_and_pd
     71   // CHECK: and <4 x i32>
     72   return _mm_and_pd(A, B);
     73 }
     74 
     75 __m128i test_mm_and_si128(__m128i A, __m128i B) {
          // Expects the IR for _mm_and_si128 to contain a bitwise and on <2 x i64>.
     76   // CHECK-LABEL: test_mm_and_si128
     77   // CHECK: and <2 x i64>
     78   return _mm_and_si128(A, B);
     79 }
     80 
     81 __m128i test_mm_avg_epu8(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pavg.b returning <16 x i8>.
     82   // CHECK-LABEL: test_mm_avg_epu8
     83   // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b
     84   return _mm_avg_epu8(A, B);
     85 }
     86 
     87 __m128i test_mm_avg_epu16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pavg.w returning <8 x i16>.
     88   // CHECK-LABEL: test_mm_avg_epu16
     89   // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w
     90   return _mm_avg_epu16(A, B);
     91 }
     92 
     93 __m128i test_mm_bslli_si128(__m128i A) {
          // With a byte-shift count of 5, expects a <16 x i8> shufflevector whose
          // mask runs from index 11 through index 26.
     94   // CHECK-LABEL: test_mm_bslli_si128
     95   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
     96   return _mm_bslli_si128(A, 5);
     97 }
     98 
     99 __m128i test_mm_bsrli_si128(__m128i A) {
          // With a byte-shift count of 5, expects a <16 x i8> shufflevector whose
          // mask runs from index 5 through index 20.
    100   // CHECK-LABEL: test_mm_bsrli_si128
    101   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
    102   return _mm_bsrli_si128(A, 5);
    103 }
    104 
    105 void test_mm_clflush(void* A) {
          // Expects a call to @llvm.x86.sse2.clflush that receives the pointer as i8*.
    106   // CHECK-LABEL: test_mm_clflush
    107   // CHECK: call void @llvm.x86.sse2.clflush(i8* %{{.*}})
    108   _mm_clflush(A);
    109 }
    110 
    111 __m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
          // Expects an icmp eq on <16 x i8>.
    112   // CHECK-LABEL: test_mm_cmpeq_epi8
    113   // CHECK: icmp eq <16 x i8>
    114   return _mm_cmpeq_epi8(A, B);
    115 }
    116 
    117 __m128i test_mm_cmpeq_epi16(__m128i A, __m128i B) {
          // Expects an icmp eq on <8 x i16>.
    118   // CHECK-LABEL: test_mm_cmpeq_epi16
    119   // CHECK: icmp eq <8 x i16>
    120   return _mm_cmpeq_epi16(A, B);
    121 }
    122 
    123 __m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
          // Expects an icmp eq on <4 x i32>.
    124   // CHECK-LABEL: test_mm_cmpeq_epi32
    125   // CHECK: icmp eq <4 x i32>
    126   return _mm_cmpeq_epi32(A, B);
    127 }
    128 
    129 __m128d test_mm_cmpeq_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 0.
    130   // CHECK-LABEL: test_mm_cmpeq_pd
    131   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
    132   return _mm_cmpeq_pd(A, B);
    133 }
    134 
    135 __m128d test_mm_cmpeq_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 0.
    136   // CHECK-LABEL: test_mm_cmpeq_sd
    137   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
    138   return _mm_cmpeq_sd(A, B);
    139 }
    140 
    141 __m128d test_mm_cmpge_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 2,
          // the same immediate the test_mm_cmple_pd case expects.
    142   // CHECK-LABEL: test_mm_cmpge_pd
    143   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
    144   return _mm_cmpge_pd(A, B);
    145 }
    146 
    147 __m128d test_mm_cmpge_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 2,
          // the same immediate the test_mm_cmple_sd case expects.
    148   // CHECK-LABEL: test_mm_cmpge_sd
    149   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
    150   return _mm_cmpge_sd(A, B);
    151 }
    152 
    153 __m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
          // Expects a signed greater-than compare (icmp sgt) on <16 x i8>.
    154   // CHECK-LABEL: test_mm_cmpgt_epi8
    155   // CHECK: icmp sgt <16 x i8>
    156   return _mm_cmpgt_epi8(A, B);
    157 }
    158 
    159 __m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
          // Expects a signed greater-than compare (icmp sgt) on <8 x i16>.
    160   // CHECK-LABEL: test_mm_cmpgt_epi16
    161   // CHECK: icmp sgt <8 x i16>
    162   return _mm_cmpgt_epi16(A, B);
    163 }
    164 
    165 __m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
          // Expects a signed greater-than compare (icmp sgt) on <4 x i32>.
    166   // CHECK-LABEL: test_mm_cmpgt_epi32
    167   // CHECK: icmp sgt <4 x i32>
    168   return _mm_cmpgt_epi32(A, B);
    169 }
    170 
    171 __m128d test_mm_cmpgt_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 1,
          // the same immediate the test_mm_cmplt_pd case expects.
    172   // CHECK-LABEL: test_mm_cmpgt_pd
    173   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
    174   return _mm_cmpgt_pd(A, B);
    175 }
    176 
    177 __m128d test_mm_cmpgt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 1,
          // the same immediate the test_mm_cmplt_sd case expects.
    178   // CHECK-LABEL: test_mm_cmpgt_sd
    179   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
    180   return _mm_cmpgt_sd(A, B);
    181 }
    182 
    183 __m128d test_mm_cmple_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 2.
    184   // CHECK-LABEL: test_mm_cmple_pd
    185   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
    186   return _mm_cmple_pd(A, B);
    187 }
    188 
    189 __m128d test_mm_cmple_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 2.
    190   // CHECK-LABEL: test_mm_cmple_sd
    191   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
    192   return _mm_cmple_sd(A, B);
    193 }
    194 
    195 __m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
          // Expects an icmp sgt on <16 x i8>, the same comparison opcode the
          // test_mm_cmpgt_epi8 case expects.
    196   // CHECK-LABEL: test_mm_cmplt_epi8
    197   // CHECK: icmp sgt <16 x i8>
    198   return _mm_cmplt_epi8(A, B);
    199 }
    200 
    201 __m128i test_mm_cmplt_epi16(__m128i A, __m128i B) {
          // Expects an icmp sgt on <8 x i16>, the same comparison opcode the
          // test_mm_cmpgt_epi16 case expects.
    202   // CHECK-LABEL: test_mm_cmplt_epi16
    203   // CHECK: icmp sgt <8 x i16>
    204   return _mm_cmplt_epi16(A, B);
    205 }
    206 
    207 __m128i test_mm_cmplt_epi32(__m128i A, __m128i B) {
          // Expects an icmp sgt on <4 x i32>, the same comparison opcode the
          // test_mm_cmpgt_epi32 case expects.
    208   // CHECK-LABEL: test_mm_cmplt_epi32
    209   // CHECK: icmp sgt <4 x i32>
    210   return _mm_cmplt_epi32(A, B);
    211 }
    212 
    213 __m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 1.
    214   // CHECK-LABEL: test_mm_cmplt_pd
    215   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
    216   return _mm_cmplt_pd(A, B);
    217 }
    218 
    219 __m128d test_mm_cmplt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 1.
    220   // CHECK-LABEL: test_mm_cmplt_sd
    221   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
    222   return _mm_cmplt_sd(A, B);
    223 }
    224 
    225 __m128d test_mm_cmpneq_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 4.
    226   // CHECK-LABEL: test_mm_cmpneq_pd
    227   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
    228   return _mm_cmpneq_pd(A, B);
    229 }
    230 
    231 __m128d test_mm_cmpneq_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 4.
    232   // CHECK-LABEL: test_mm_cmpneq_sd
    233   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
    234   return _mm_cmpneq_sd(A, B);
    235 }
    236 
    237 __m128d test_mm_cmpnge_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 6,
          // the same immediate the test_mm_cmpnle_pd case expects.
    238   // CHECK-LABEL: test_mm_cmpnge_pd
    239   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
    240   return _mm_cmpnge_pd(A, B);
    241 }
    242 
    243 __m128d test_mm_cmpnge_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 6,
          // the same immediate the test_mm_cmpnle_sd case expects.
    244   // CHECK-LABEL: test_mm_cmpnge_sd
    245   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
    246   return _mm_cmpnge_sd(A, B);
    247 }
    248 
    249 __m128d test_mm_cmpngt_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 5,
          // the same immediate the test_mm_cmpnlt_pd case expects.
    250   // CHECK-LABEL: test_mm_cmpngt_pd
    251   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
    252   return _mm_cmpngt_pd(A, B);
    253 }
    254 
    255 __m128d test_mm_cmpngt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 5,
          // the same immediate the test_mm_cmpnlt_sd case expects.
    256   // CHECK-LABEL: test_mm_cmpngt_sd
    257   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
    258   return _mm_cmpngt_sd(A, B);
    259 }
    260 
    261 __m128d test_mm_cmpnle_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 6.
    262   // CHECK-LABEL: test_mm_cmpnle_pd
    263   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
    264   return _mm_cmpnle_pd(A, B);
    265 }
    266 
    267 __m128d test_mm_cmpnle_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 6.
    268   // CHECK-LABEL: test_mm_cmpnle_sd
    269   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
    270   return _mm_cmpnle_sd(A, B);
    271 }
    272 
    273 __m128d test_mm_cmpnlt_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 5.
    274   // CHECK-LABEL: test_mm_cmpnlt_pd
    275   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
    276   return _mm_cmpnlt_pd(A, B);
    277 }
    278 
    279 __m128d test_mm_cmpnlt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 5.
    280   // CHECK-LABEL: test_mm_cmpnlt_sd
    281   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
    282   return _mm_cmpnlt_sd(A, B);
    283 }
    284 
    285 __m128d test_mm_cmpord_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 7.
    286   // CHECK-LABEL: test_mm_cmpord_pd
    287   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
    288   return _mm_cmpord_pd(A, B);
    289 }
    290 
    291 __m128d test_mm_cmpord_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 7.
    292   // CHECK-LABEL: test_mm_cmpord_sd
    293   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
    294   return _mm_cmpord_sd(A, B);
    295 }
    296 
    297 __m128d test_mm_cmpunord_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.pd with predicate immediate 3.
    298   // CHECK-LABEL: test_mm_cmpunord_pd
    299   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
    300   return _mm_cmpunord_pd(A, B);
    301 }
    302 
    303 __m128d test_mm_cmpunord_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.cmp.sd with predicate immediate 3.
    304   // CHECK-LABEL: test_mm_cmpunord_sd
    305   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
    306   return _mm_cmpunord_sd(A, B);
    307 }
    308 
    309 int test_mm_comieq_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.comieq.sd returning i32.
    310   // CHECK-LABEL: test_mm_comieq_sd
    311   // CHECK: call i32 @llvm.x86.sse2.comieq.sd
    312   return _mm_comieq_sd(A, B);
    313 }
    314 
    315 int test_mm_comige_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.comige.sd returning i32.
    316   // CHECK-LABEL: test_mm_comige_sd
    317   // CHECK: call i32 @llvm.x86.sse2.comige.sd
    318   return _mm_comige_sd(A, B);
    319 }
    320 
    321 int test_mm_comigt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.comigt.sd returning i32.
    322   // CHECK-LABEL: test_mm_comigt_sd
    323   // CHECK: call i32 @llvm.x86.sse2.comigt.sd
    324   return _mm_comigt_sd(A, B);
    325 }
    326 
    327 int test_mm_comile_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.comile.sd returning i32.
    328   // CHECK-LABEL: test_mm_comile_sd
    329   // CHECK: call i32 @llvm.x86.sse2.comile.sd
    330   return _mm_comile_sd(A, B);
    331 }
    332 
    333 int test_mm_comilt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.comilt.sd returning i32.
    334   // CHECK-LABEL: test_mm_comilt_sd
    335   // CHECK: call i32 @llvm.x86.sse2.comilt.sd
    336   return _mm_comilt_sd(A, B);
    337 }
    338 
    339 int test_mm_comineq_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.comineq.sd returning i32.
    340   // CHECK-LABEL: test_mm_comineq_sd
    341   // CHECK: call i32 @llvm.x86.sse2.comineq.sd
    342   return _mm_comineq_sd(A, B);
    343 }
    344 
    345 __m128d test_mm_cvtepi32_pd(__m128i A) {
          // Expects a call to @llvm.x86.sse2.cvtdq2pd returning <2 x double>.
    346   // CHECK-LABEL: test_mm_cvtepi32_pd
    347   // CHECK: call <2 x double> @llvm.x86.sse2.cvtdq2pd
    348   return _mm_cvtepi32_pd(A);
    349 }
    350 
    351 __m128 test_mm_cvtepi32_ps(__m128i A) {
          // Expects a call to @llvm.x86.sse2.cvtdq2ps returning <4 x float>.
    352   // CHECK-LABEL: test_mm_cvtepi32_ps
    353   // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps
    354   return _mm_cvtepi32_ps(A);
    355 }
    356 
    357 __m128i test_mm_cvtpd_epi32(__m128d A) {
          // Expects a call to @llvm.x86.sse2.cvtpd2dq returning <4 x i32>.
    358   // CHECK-LABEL: test_mm_cvtpd_epi32
    359   // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq
    360   return _mm_cvtpd_epi32(A);
    361 }
    362 
    363 __m128 test_mm_cvtpd_ps(__m128d A) {
          // Expects a call to @llvm.x86.sse2.cvtpd2ps returning <4 x float>.
    364   // CHECK-LABEL: test_mm_cvtpd_ps
    365   // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps
    366   return _mm_cvtpd_ps(A);
    367 }
    368 
    369 __m128i test_mm_cvtps_epi32(__m128 A) {
          // Expects a call to @llvm.x86.sse2.cvtps2dq returning <4 x i32>.
    370   // CHECK-LABEL: test_mm_cvtps_epi32
    371   // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq
    372   return _mm_cvtps_epi32(A);
    373 }
    374 
    375 __m128d test_mm_cvtps_pd(__m128 A) {
          // Expects a call to @llvm.x86.sse2.cvtps2pd returning <2 x double>.
    376   // CHECK-LABEL: test_mm_cvtps_pd
    377   // CHECK: call <2 x double> @llvm.x86.sse2.cvtps2pd
    378   return _mm_cvtps_pd(A);
    379 }
    380 
    381 double test_mm_cvtsd_f64(__m128d A) {
          // Expects element 0 to be extracted from a <2 x double>.
    382   // CHECK-LABEL: test_mm_cvtsd_f64
    383   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    384   return _mm_cvtsd_f64(A);
    385 }
    386 
    387 int test_mm_cvtsd_si32(__m128d A) {
          // Expects a call to @llvm.x86.sse2.cvtsd2si returning i32.
    388   // CHECK-LABEL: test_mm_cvtsd_si32
    389   // CHECK: call i32 @llvm.x86.sse2.cvtsd2si
    390   return _mm_cvtsd_si32(A);
    391 }
    392 
    393 long long test_mm_cvtsd_si64(__m128d A) {
          // Expects a call to @llvm.x86.sse2.cvtsd2si64 returning i64.
    394   // CHECK-LABEL: test_mm_cvtsd_si64
    395   // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64
    396   return _mm_cvtsd_si64(A);
    397 }
    398 
    399 __m128 test_mm_cvtsd_ss(__m128 A, __m128d B) {
          // Expects an fptrunc from double to float.
    400   // CHECK-LABEL: test_mm_cvtsd_ss
    401   // CHECK: fptrunc double %{{.*}} to float
    402   return _mm_cvtsd_ss(A, B);
    403 }
    404 
    405 int test_mm_cvtsi128_si32(__m128i A) {
          // Expects element 0 to be extracted from a <4 x i32>.
    406   // CHECK-LABEL: test_mm_cvtsi128_si32
    407   // CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    408   return _mm_cvtsi128_si32(A);
    409 }
    410 
    411 long long test_mm_cvtsi128_si64(__m128i A) {
          // Expects element 0 to be extracted from a <2 x i64>.
    412   // CHECK-LABEL: test_mm_cvtsi128_si64
    413   // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
    414   return _mm_cvtsi128_si64(A);
    415 }
    416 
    417 __m128d test_mm_cvtsi32_sd(__m128d A, int B) {
          // Expects a sitofp from i32 to double, followed by an insert into
          // element 0 of a <2 x double>.
    418   // CHECK-LABEL: test_mm_cvtsi32_sd
    419   // CHECK: sitofp i32 %{{.*}} to double
    420   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    421   return _mm_cvtsi32_sd(A, B);
    422 }
    423 
    424 __m128i test_mm_cvtsi32_si128(int A) {
          // Expects an i32 to be inserted into element 0 of an undef <4 x i32>.
    425   // CHECK-LABEL: test_mm_cvtsi32_si128
    426   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
    427   return _mm_cvtsi32_si128(A);
    428 }
    429 
    430 __m128d test_mm_cvtsi64_sd(__m128d A, long long B) {
          // Expects a sitofp from i64 to double, followed by an insert into
          // element 0 of a <2 x double>.
    431   // CHECK-LABEL: test_mm_cvtsi64_sd
    432   // CHECK: sitofp i64 %{{.*}} to double
    433   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    434   return _mm_cvtsi64_sd(A, B);
    435 }
    436 
    437 __m128i test_mm_cvtsi64_si128(long long A) {
          // Expects an i64 to be inserted into element 0 of an undef <2 x i64>.
    438   // CHECK-LABEL: test_mm_cvtsi64_si128
    439   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
    440   return _mm_cvtsi64_si128(A);
    441 }
    442 
    443 __m128d test_mm_cvtss_sd(__m128d A, __m128 B) {
          // Expects, in order: an extract of element 0 from a <4 x float>, an
          // fpext from float to double, and an insert into element 0 of a
          // <2 x double>.
    444   // CHECK-LABEL: test_mm_cvtss_sd
    445   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
    446   // CHECK: fpext float %{{.*}} to double
    447   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    448   return _mm_cvtss_sd(A, B);
    449 }
    450 
    451 __m128i test_mm_cvttpd_epi32(__m128d A) {
          // Expects a call to @llvm.x86.sse2.cvttpd2dq returning <4 x i32>.
    452   // CHECK-LABEL: test_mm_cvttpd_epi32
    453   // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq
    454   return _mm_cvttpd_epi32(A);
    455 }
    456 
    457 __m128i test_mm_cvttps_epi32(__m128 A) {
          // Expects a call to @llvm.x86.sse2.cvttps2dq returning <4 x i32>.
    458   // CHECK-LABEL: test_mm_cvttps_epi32
    459   // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq
    460   return _mm_cvttps_epi32(A);
    461 }
    462 
    463 int test_mm_cvttsd_si32(__m128d A) {
          // Expects an extract of element 0 from a <2 x double>, followed by an
          // fptosi from double to i32.
    464   // CHECK-LABEL: test_mm_cvttsd_si32
    465   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    466   // CHECK: fptosi double %{{.*}} to i32
    467   return _mm_cvttsd_si32(A);
    468 }
    469 
    470 long long test_mm_cvttsd_si64(__m128d A) {
          // Expects an extract of element 0 from a <2 x double>, followed by an
          // fptosi from double to i64.
    471   // CHECK-LABEL: test_mm_cvttsd_si64
    472   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    473   // CHECK: fptosi double %{{.*}} to i64
    474   return _mm_cvttsd_si64(A);
    475 }
    476 
    477 __m128d test_mm_div_pd(__m128d A, __m128d B) {
          // Expects the IR for _mm_div_pd to contain a vector fdiv on <2 x double>.
    478   // CHECK-LABEL: test_mm_div_pd
    479   // CHECK: fdiv <2 x double>
    480   return _mm_div_pd(A, B);
    481 }
    482 
    483 __m128d test_mm_div_sd(__m128d A, __m128d B) {
          // Expects the IR for _mm_div_sd to contain a scalar fdiv on double.
    484   // CHECK-LABEL: test_mm_div_sd
    485   // CHECK: fdiv double
    486   return _mm_div_sd(A, B);
    487 }
    488 
    489 // Lowering to pextrw requires optimization.
    490 int test_mm_extract_epi16(__m128i A) {
          // Passes lane index 8 and expects the index to be masked with 7; the
          // masked value must then be the index used by the <8 x i16>
          // extractelement.
    491   // CHECK-LABEL: test_mm_extract_epi16
    492   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
    493   // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
    494   return _mm_extract_epi16(A, 8);
    495 }
    496 
    497 __m128i test_mm_insert_epi16(__m128i A, short B) {
          // Passes lane index 8 and expects the index to be masked with 7; the
          // masked value must then be the index used by the <8 x i16>
          // insertelement.
    498   // CHECK-LABEL: test_mm_insert_epi16
    499   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
    500   // CHECK: insertelement <8 x i16> %{{.*}}, i32 [[x]]
    501   return _mm_insert_epi16(A, B, 8);
    502 }
    503 
    504 void test_mm_lfence(void) {
          // Expects a call to @llvm.x86.sse2.lfence with no arguments.
          // Declared with (void) so the definition carries a real prototype.
    505   // CHECK-LABEL: test_mm_lfence
    506   // CHECK: call void @llvm.x86.sse2.lfence()
    507   _mm_lfence();
    508 }
    509 
    510 __m128d test_mm_load_pd(double const* A) {
          // Expects a <2 x double> load with alignment 16.
    511   // CHECK-LABEL: test_mm_load_pd
    512   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 16
    513   return _mm_load_pd(A);
    514 }
    515 
    516 __m128d test_mm_load_sd(double const* A) {
          // Expects a scalar double load with alignment exactly 1. The pattern is
          // anchored at end of line so a wider alignment such as 16 cannot
          // satisfy it by prefix match.
    517   // CHECK-LABEL: test_mm_load_sd
    518   // CHECK: load double, double* %{{.*}}, align 1{{$}}
    519   return _mm_load_sd(A);
    520 }
    521 
    522 __m128i test_mm_load_si128(__m128i const* A) {
          // Expects a <2 x i64> load with alignment 16.
    523   // CHECK-LABEL: test_mm_load_si128
    524   // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
    525   return _mm_load_si128(A);
    526 }
    527 
    528 __m128d test_mm_load1_pd(double const* A) {
          // Expects a scalar double load with alignment 8, then an insert into
          // element 0 of an undef <2 x double> and an insert into element 1.
    529   // CHECK-LABEL: test_mm_load1_pd
    530   // CHECK: load double, double* %{{.*}}, align 8
    531   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    532   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    533   return _mm_load1_pd(A);
    534 }
    535 
    536 __m128d test_mm_loadh_pd(__m128d x, void* y) {
          // Expects a scalar double load with alignment exactly 1; the pattern is
          // anchored at end of line, so a longer alignment value does not match.
    537   // CHECK-LABEL: test_mm_loadh_pd
    538   // CHECK: load double, double* %{{.*}}, align 1{{$}}
    539   return _mm_loadh_pd(x, y);
    540 }
    541 
    542 __m128d test_mm_loadr_pd(double const* A) {
          // Expects a <2 x double> load with alignment 16, followed by a
          // shufflevector whose mask is <1, 0>.
    543   // CHECK-LABEL: test_mm_loadr_pd
    544   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 16
    545   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
    546   return _mm_loadr_pd(A);
    547 }
    548 
    549 __m128d test_mm_loadu_pd(double const* A) {
          // Expects a <2 x double> load with alignment exactly 1. The pattern is
          // anchored at end of line; without the anchor an aligned load
          // ("align 16") would also satisfy it by prefix match.
    550   // CHECK-LABEL: test_mm_loadu_pd
    551   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
    552   return _mm_loadu_pd(A);
    553 }
    554 
    555 __m128i test_mm_loadu_si128(__m128i const* A) {
          // Expects a <2 x i64> load with alignment exactly 1. The pattern is
          // anchored at end of line; without the anchor an aligned load
          // ("align 16") would also satisfy it by prefix match.
    556   // CHECK-LABEL: test_mm_loadu_si128
    557   // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
    558   return _mm_loadu_si128(A);
    559 }
    560 
    561 __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmadd.wd taking two <8 x i16>
          // operands and returning <4 x i32>.
    562   // CHECK-LABEL: test_mm_madd_epi16
    563   // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    564   return _mm_madd_epi16(A, B);
    565 }
    566 
    567 void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
          // Expects a call to @llvm.x86.sse2.maskmov.dqu taking two <16 x i8>
          // operands and an i8* pointer.
    568   // CHECK-LABEL: test_mm_maskmoveu_si128
    569   // CHECK: call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8* %{{.*}})
    570   _mm_maskmoveu_si128(A, B, C);
    571 }
    572 
    573 __m128i test_mm_max_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmaxs.w on two <8 x i16> operands.
    574   // CHECK-LABEL: test_mm_max_epi16
    575   // CHECK: call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    576   return _mm_max_epi16(A, B);
    577 }
    578 
    579 __m128i test_mm_max_epu8(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmaxu.b on two <16 x i8> operands.
    580   // CHECK-LABEL: test_mm_max_epu8
    581   // CHECK: call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
    582   return _mm_max_epu8(A, B);
    583 }
    584 
    585 __m128d test_mm_max_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.max.pd on two <2 x double> operands.
    586   // CHECK-LABEL: test_mm_max_pd
    587   // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    588   return _mm_max_pd(A, B);
    589 }
    590 
    591 __m128d test_mm_max_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.max.sd on two <2 x double> operands.
    592   // CHECK-LABEL: test_mm_max_sd
    593   // CHECK: call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    594   return _mm_max_sd(A, B);
    595 }
    596 
    597 void test_mm_mfence(void) {
          // Expects a call to @llvm.x86.sse2.mfence with no arguments.
          // Declared with (void) so the definition carries a real prototype.
    598   // CHECK-LABEL: test_mm_mfence
    599   // CHECK: call void @llvm.x86.sse2.mfence()
    600   _mm_mfence();
    601 }
    602 
    603 __m128i test_mm_min_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmins.w on two <8 x i16> operands.
    604   // CHECK-LABEL: test_mm_min_epi16
    605   // CHECK: call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    606   return _mm_min_epi16(A, B);
    607 }
    608 
    609 __m128i test_mm_min_epu8(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pminu.b on two <16 x i8> operands.
    610   // CHECK-LABEL: test_mm_min_epu8
    611   // CHECK: call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
    612   return _mm_min_epu8(A, B);
    613 }
    614 
    615 __m128d test_mm_min_pd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.min.pd on two <2 x double> operands.
    616   // CHECK-LABEL: test_mm_min_pd
    617   // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    618   return _mm_min_pd(A, B);
    619 }
    620 
    621 __m128d test_mm_min_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.min.sd on two <2 x double> operands.
    622   // CHECK-LABEL: test_mm_min_sd
    623   // CHECK: call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    624   return _mm_min_sd(A, B);
    625 }
    626 
    627 int test_mm_movemask_epi8(__m128i A) {
          // Expects a call to @llvm.x86.sse2.pmovmskb.128 taking a <16 x i8> and
          // returning i32.
    628   // CHECK-LABEL: test_mm_movemask_epi8
    629   // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
    630   return _mm_movemask_epi8(A);
    631 }
    632 
    633 int test_mm_movemask_pd(__m128d A) {
          // Expects a call to @llvm.x86.sse2.movmsk.pd taking a <2 x double> and
          // returning i32.
    634   // CHECK-LABEL: test_mm_movemask_pd
    635   // CHECK: call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}})
    636   return _mm_movemask_pd(A);
    637 }
    638 
    639 __m128i test_mm_mul_epu32(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmulu.dq taking two <4 x i32>
          // operands and returning <2 x i64>.
    640   // CHECK-LABEL: test_mm_mul_epu32
    641   // CHECK: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
    642   return _mm_mul_epu32(A, B);
    643 }
    644 
    645 __m128d test_mm_mul_pd(__m128d A, __m128d B) {
          // Expects the IR for _mm_mul_pd to contain a vector fmul on <2 x double>.
    646   // CHECK-LABEL: test_mm_mul_pd
    647   // CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
    648   return _mm_mul_pd(A, B);
    649 }
    650 
    651 __m128d test_mm_mul_sd(__m128d A, __m128d B) {
          // Expects the IR for _mm_mul_sd to contain a scalar fmul on double.
    652   // CHECK-LABEL: test_mm_mul_sd
    653   // CHECK: fmul double %{{.*}}, %{{.*}}
    654   return _mm_mul_sd(A, B);
    655 }
    656 
    657 __m128i test_mm_mulhi_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmulh.w on two <8 x i16> operands.
    658   // CHECK-LABEL: test_mm_mulhi_epi16
    659   // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    660   return _mm_mulhi_epi16(A, B);
    661 }
    662 
    663 __m128i test_mm_mulhi_epu16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.pmulhu.w on two <8 x i16> operands.
    664   // CHECK-LABEL: test_mm_mulhi_epu16
    665   // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    666   return _mm_mulhi_epu16(A, B);
    667 }
    668 
    669 __m128i test_mm_mullo_epi16(__m128i A, __m128i B) {
          // Expects the IR for _mm_mullo_epi16 to contain a vector mul on <8 x i16>.
    670   // CHECK-LABEL: test_mm_mullo_epi16
    671   // CHECK: mul <8 x i16> %{{.*}}, %{{.*}}
    672   return _mm_mullo_epi16(A, B);
    673 }
    674 
    675 __m128d test_mm_or_pd(__m128d A, __m128d B) {
          // Expects the bitwise or to be emitted on <4 x i32>, although the
          // C-level operands are __m128d.
    676   // CHECK-LABEL: test_mm_or_pd
    677   // CHECK: or <4 x i32> %{{.*}}, %{{.*}}
    678   return _mm_or_pd(A, B);
    679 }
    680 
    681 __m128i test_mm_or_si128(__m128i A, __m128i B) {
          // Expects the IR for _mm_or_si128 to contain a bitwise or on <2 x i64>.
    682   // CHECK-LABEL: test_mm_or_si128
    683   // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
    684   return _mm_or_si128(A, B);
    685 }
    686 
    687 __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.packsswb.128 taking two <8 x i16>
          // operands and returning <16 x i8>.
    688   // CHECK-LABEL: test_mm_packs_epi16
    689   // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    690   return _mm_packs_epi16(A, B);
    691 }
    692 
    693 __m128i test_mm_packs_epi32(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.packssdw.128 taking two <4 x i32>
          // operands and returning <8 x i16>.
    694   // CHECK-LABEL: test_mm_packs_epi32
    695   // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
    696   return _mm_packs_epi32(A, B);
    697 }
    698 
    699 __m128i test_mm_packus_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.packuswb.128 taking two <8 x i16>
          // operands and returning <16 x i8>.
    700   // CHECK-LABEL: test_mm_packus_epi16
    701   // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    702   return _mm_packus_epi16(A, B);
    703 }
    704 
    705 void test_mm_pause(void) {
          // Expects a call to @llvm.x86.sse2.pause with no arguments.
          // The intrinsic is invoked as a plain statement, matching the lfence
          // and mfence tests: ISO C does not allow a return statement with an
          // expression in a function returning void.
    706   // CHECK-LABEL: test_mm_pause
    707   // CHECK: call void @llvm.x86.sse2.pause()
    708   _mm_pause();
    709 }
    710 
    711 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.psad.bw taking two <16 x i8>
          // operands and returning <2 x i64>.
    712   // CHECK-LABEL: test_mm_sad_epu8
    713   // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
    714   return _mm_sad_epu8(A, B);
    715 }
    716 
    717 __m128d test_mm_setzero_pd(void) {
          // Expects a store of a zeroinitializer <2 x double>.
          // Declared with (void) so the definition carries a real prototype.
    718   // CHECK-LABEL: test_mm_setzero_pd
    719   // CHECK: store <2 x double> zeroinitializer
    720   return _mm_setzero_pd();
    721 }
    722 
    723 __m128i test_mm_setzero_si128(void) {
          // Expects a store of a zeroinitializer <2 x i64>.
          // Declared with (void) so the definition carries a real prototype.
    724   // CHECK-LABEL: test_mm_setzero_si128
    725   // CHECK: store <2 x i64> zeroinitializer
    726   return _mm_setzero_si128();
    727 }
    728 
    729 __m128i test_mm_shuffle_epi32(__m128i A) {
          // With immediate 0, expects a <4 x i32> shufflevector whose mask is
          // zeroinitializer.
    730   // CHECK-LABEL: test_mm_shuffle_epi32
    731   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
    732   return _mm_shuffle_epi32(A, 0);
    733 }
    734 
    735 __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
          // With immediate 1, expects a <2 x double> shufflevector whose mask is
          // <1, 2>.
    736   // CHECK-LABEL: test_mm_shuffle_pd
    737   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
    738   return _mm_shuffle_pd(A, B, 1);
    739 }
    740 
    741 __m128i test_mm_shufflehi_epi16(__m128i A) {
          // With immediate 0, expects an <8 x i16> shufflevector whose mask keeps
          // indices 0..3 in place and uses index 4 for all four upper lanes.
    742   // CHECK-LABEL: test_mm_shufflehi_epi16
    743   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
    744   return _mm_shufflehi_epi16(A, 0);
    745 }
    746 
    747 __m128i test_mm_shufflelo_epi16(__m128i A) {
          // With immediate 0, expects an <8 x i16> shufflevector whose mask uses
          // index 0 for all four lower lanes and keeps indices 4..7 in place.
    748   // CHECK-LABEL: test_mm_shufflelo_epi16
    749   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
    750   return _mm_shufflelo_epi16(A, 0);
    751 }
    752 
    753 __m128i test_mm_sll_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.psll.w returning <8 x i16>.
    754   // CHECK-LABEL: test_mm_sll_epi16
    755   // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w
    756   return _mm_sll_epi16(A, B);
    757 }
    758 
    759 __m128i test_mm_sll_epi32(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.psll.d returning <4 x i32>.
    760   // CHECK-LABEL: test_mm_sll_epi32
    761   // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d
    762   return _mm_sll_epi32(A, B);
    763 }
    764 
    765 __m128i test_mm_sll_epi64(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.psll.q returning <2 x i64>.
    766   // CHECK-LABEL: test_mm_sll_epi64
    767   // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q
    768   return _mm_sll_epi64(A, B);
    769 }
    770 
    771 __m128i test_mm_slli_epi16(__m128i A) {
          // With shift count 1, expects a call to @llvm.x86.sse2.pslli.w
          // returning <8 x i16>.
    772   // CHECK-LABEL: test_mm_slli_epi16
    773   // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w
    774   return _mm_slli_epi16(A, 1);
    775 }
    776 
    777 __m128i test_mm_slli_epi32(__m128i A) {
          // With shift count 1, expects a call to @llvm.x86.sse2.pslli.d
          // returning <4 x i32>.
    778   // CHECK-LABEL: test_mm_slli_epi32
    779   // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d
    780   return _mm_slli_epi32(A, 1);
    781 }
    782 
    783 __m128i test_mm_slli_epi64(__m128i A) {
          // With shift count 1, expects a call to @llvm.x86.sse2.pslli.q
          // returning <2 x i64>.
    784   // CHECK-LABEL: test_mm_slli_epi64
    785   // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q
    786   return _mm_slli_epi64(A, 1);
    787 }
    788 
    789 __m128i test_mm_slli_si128(__m128i A) {
          // With a byte-shift count of 5, expects a <16 x i8> shufflevector whose
          // mask runs from index 11 through index 26 (the same mask the
          // test_mm_bslli_si128 case expects).
    790   // CHECK-LABEL: test_mm_slli_si128
    791   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
    792   return _mm_slli_si128(A, 5);
    793 }
    794 
    795 __m128d test_mm_sqrt_pd(__m128d A) {
          // Expects a call to @llvm.x86.sse2.sqrt.pd on a <2 x double> operand.
    796   // CHECK-LABEL: test_mm_sqrt_pd
    797   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
    798   return _mm_sqrt_pd(A);
    799 }
    800 
    801 __m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
          // Expects a call to @llvm.x86.sse2.sqrt.sd taking a single <2 x double>
          // operand, even though the C-level intrinsic receives both A and B.
    802   // CHECK-LABEL: test_mm_sqrt_sd
    803   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %{{.*}})
    804   return _mm_sqrt_sd(A, B);
    805 }
    806 
    807 __m128i test_mm_sra_epi16(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.psra.w returning <8 x i16>.
    808   // CHECK-LABEL: test_mm_sra_epi16
    809   // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w
    810   return _mm_sra_epi16(A, B);
    811 }
    812 
    813 __m128i test_mm_sra_epi32(__m128i A, __m128i B) {
          // Expects a call to @llvm.x86.sse2.psra.d returning <4 x i32>.
    814   // CHECK-LABEL: test_mm_sra_epi32
    815   // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d
    816   return _mm_sra_epi32(A, B);
    817 }
    818 
    819 __m128i test_mm_srai_epi16(__m128i A) {
          // With shift count 1, expects a call to @llvm.x86.sse2.psrai.w
          // returning <8 x i16>.
    820   // CHECK-LABEL: test_mm_srai_epi16
    821   // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w
    822   return _mm_srai_epi16(A, 1);
    823 }
    824 
    825 __m128i test_mm_srai_epi32(__m128i A) {
          // Exercises _mm_srai_epi32 with the literal 1 as its second argument.
          // The emitted IR is expected to call @llvm.x86.sse2.psrai.d (<4 x i32>).
    826   // CHECK-LABEL: test_mm_srai_epi32
    827   // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d
    828   return _mm_srai_epi32(A, 1);
    829 }
    830 
    831 __m128i test_mm_srl_epi16(__m128i A, __m128i B) {
          // Exercises _mm_srl_epi16(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psrl.w returning <8 x i16>.
    832   // CHECK-LABEL: test_mm_srl_epi16
    833   // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w
    834   return _mm_srl_epi16(A, B);
    835 }
    836 
    837 __m128i test_mm_srl_epi32(__m128i A, __m128i B) {
          // Exercises _mm_srl_epi32(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psrl.d returning <4 x i32>.
    838   // CHECK-LABEL: test_mm_srl_epi32
    839   // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d
    840   return _mm_srl_epi32(A, B);
    841 }
    842 
    843 __m128i test_mm_srl_epi64(__m128i A, __m128i B) {
          // Exercises _mm_srl_epi64(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psrl.q returning <2 x i64>.
    844   // CHECK-LABEL: test_mm_srl_epi64
    845   // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q
    846   return _mm_srl_epi64(A, B);
    847 }
    848 
    849 __m128i test_mm_srli_epi16(__m128i A) {
          // Exercises _mm_srli_epi16 with the literal 1 as its second argument.
          // The emitted IR is expected to call @llvm.x86.sse2.psrli.w (<8 x i16>).
    850   // CHECK-LABEL: test_mm_srli_epi16
    851   // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w
    852   return _mm_srli_epi16(A, 1);
    853 }
    854 
    855 __m128i test_mm_srli_epi32(__m128i A) {
          // Exercises _mm_srli_epi32 with the literal 1 as its second argument.
          // The emitted IR is expected to call @llvm.x86.sse2.psrli.d (<4 x i32>).
    856   // CHECK-LABEL: test_mm_srli_epi32
    857   // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d
    858   return _mm_srli_epi32(A, 1);
    859 }
    860 
    861 __m128i test_mm_srli_epi64(__m128i A) {
          // Exercises _mm_srli_epi64 with the literal 1 as its second argument.
          // The emitted IR is expected to call @llvm.x86.sse2.psrli.q (<2 x i64>).
    862   // CHECK-LABEL: test_mm_srli_epi64
    863   // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q
    864   return _mm_srli_epi64(A, 1);
    865 }
    866 
    867 __m128i test_mm_srli_si128(__m128i A) {
          // Exercises _mm_srli_si128 with the literal 5 as its second argument.
          // No intrinsic call is expected here: the pattern looks for a
          // shufflevector over two <16 x i8> operands selecting indices 5..20.
    868   // CHECK-LABEL: test_mm_srli_si128
    869   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
    870   return _mm_srli_si128(A, 5);
    871 }
    872 
    873 void test_mm_store_pd(double* A, __m128d B) {
          // Exercises _mm_store_pd(A, B). The emitted IR is expected to contain a
          // <2 x double> store whose alignment is 16.
    874   // CHECK-LABEL: test_mm_store_pd
    875   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
    876   _mm_store_pd(A, B);
    877 }
    878 
    879 void test_mm_store_sd(double* A, __m128d B) {
          // Exercises _mm_store_sd(A, B). The emitted IR is expected to contain a
          // scalar double store with alignment 1; the {{$}} end-of-line anchor
          // stops the pattern from also accepting a longer value such as 16.
    880   // CHECK-LABEL: test_mm_store_sd
    881   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
    882   _mm_store_sd(A, B);
    883 }
    884 
    885 void test_mm_store_si128(__m128i* A, __m128i B) {
          // Exercises _mm_store_si128(A, B). The emitted IR is expected to contain
          // a <2 x i64> store whose alignment is 16.
    886   // CHECK-LABEL: test_mm_store_si128
    887   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
    888   _mm_store_si128(A, B);
    889 }
    890 
    891 void test_mm_storeh_pd(double* A, __m128d B) {
          // Exercises _mm_storeh_pd(A, B). The emitted IR is expected to contain a
          // scalar double store with alignment 1. The {{$}} end-of-line anchor is
          // required because an unanchored "align 1" is matched as a substring
          // and would also accept a larger alignment such as 16.
    892   // CHECK-LABEL: test_mm_storeh_pd
    893   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
    894   _mm_storeh_pd(A, B);
    895 }
    896 
    897 void test_mm_storel_pd(double* A, __m128d B) {
          // Exercises _mm_storel_pd(A, B). The emitted IR is expected to contain a
          // scalar double store with alignment 1. The {{$}} end-of-line anchor is
          // required because an unanchored "align 1" is matched as a substring
          // and would also accept a larger alignment such as 16.
    898   // CHECK-LABEL: test_mm_storel_pd
    899   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
    900   _mm_storel_pd(A, B);
    901 }
    902 
    903 void test_mm_storeu_pd(double* A, __m128d B) {
          // Exercises _mm_storeu_pd(A, B). The emitted IR is expected to contain a
          // <2 x double> store with alignment 1. The {{$}} end-of-line anchor is
          // required because an unanchored "align 1" is matched as a substring
          // and would also accept the 16-byte alignment of an aligned store.
    904   // CHECK-LABEL: test_mm_storeu_pd
    905   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
    906   _mm_storeu_pd(A, B);
    907 }
    908 
    909 void test_mm_storeu_si128(__m128i* A, __m128i B) {
          // Exercises _mm_storeu_si128(A, B). The emitted IR is expected to contain
          // a <2 x i64> store with alignment 1. The {{$}} end-of-line anchor is
          // required because an unanchored "align 1" is matched as a substring
          // and would also accept the 16-byte alignment of an aligned store.
    910   // CHECK-LABEL: test_mm_storeu_si128
    911   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
    912   _mm_storeu_si128(A, B);
    913 }
    914 
    915 void test_mm_stream_pd(double *A, __m128d B) {
          // Exercises _mm_stream_pd(A, B). The emitted IR is expected to contain a
          // <2 x double> store with alignment 16 tagged with !nontemporal metadata.
    916   // CHECK-LABEL: test_mm_stream_pd
    917   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16, !nontemporal
    918   _mm_stream_pd(A, B);
    919 }
    920 
    921 void test_mm_stream_si32(int *A, int B) {
          // Exercises _mm_stream_si32(A, B). The emitted IR is expected to contain
          // an i32 store with alignment 1 tagged with !nontemporal metadata.
    922   // CHECK-LABEL: test_mm_stream_si32
    923   // CHECK: store i32 %{{.*}}, i32* %{{.*}}, align 1, !nontemporal
    924   _mm_stream_si32(A, B);
    925 }
    926 
    927 void test_mm_stream_si64(long long *A, long long B) {
          // Exercises _mm_stream_si64(A, B). The emitted IR is expected to contain
          // an i64 store with alignment 1 tagged with !nontemporal metadata.
    928   // CHECK-LABEL: test_mm_stream_si64
    929   // CHECK: store i64 %{{.*}}, i64* %{{.*}}, align 1, !nontemporal
    930   _mm_stream_si64(A, B);
    931 }
    932 
    933 void test_mm_stream_si128(__m128i *A, __m128i B) {
          // Exercises _mm_stream_si128(A, B). The emitted IR is expected to contain
          // a <2 x i64> store with alignment 16 tagged with !nontemporal metadata.
    934   // CHECK-LABEL: test_mm_stream_si128
    935   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16, !nontemporal
    936   _mm_stream_si128(A, B);
    937 }
    938 
    939 __m128i test_mm_sub_epi8(__m128i A, __m128i B) {
          // Exercises _mm_sub_epi8(A, B). The emitted IR is expected to contain a
          // `sub` instruction on <16 x i8> vectors.
    940   // CHECK-LABEL: test_mm_sub_epi8
    941   // CHECK: sub <16 x i8>
    942   return _mm_sub_epi8(A, B);
    943 }
    944 
    945 __m128i test_mm_sub_epi16(__m128i A, __m128i B) {
          // Exercises _mm_sub_epi16(A, B). The emitted IR is expected to contain a
          // `sub` instruction on <8 x i16> vectors.
    946   // CHECK-LABEL: test_mm_sub_epi16
    947   // CHECK: sub <8 x i16>
    948   return _mm_sub_epi16(A, B);
    949 }
    950 
    951 __m128i test_mm_sub_epi32(__m128i A, __m128i B) {
          // Exercises _mm_sub_epi32(A, B). The emitted IR is expected to contain a
          // `sub` instruction on <4 x i32> vectors.
    952   // CHECK-LABEL: test_mm_sub_epi32
    953   // CHECK: sub <4 x i32>
    954   return _mm_sub_epi32(A, B);
    955 }
    956 
    957 __m128i test_mm_sub_epi64(__m128i A, __m128i B) {
          // Exercises _mm_sub_epi64(A, B). The emitted IR is expected to contain a
          // `sub` instruction on <2 x i64> vectors.
    958   // CHECK-LABEL: test_mm_sub_epi64
    959   // CHECK: sub <2 x i64>
    960   return _mm_sub_epi64(A, B);
    961 }
    962 
    963 __m128d test_mm_sub_pd(__m128d A, __m128d B) {
          // Exercises _mm_sub_pd(A, B). The emitted IR is expected to contain an
          // `fsub` instruction on <2 x double> vectors.
    964   // CHECK-LABEL: test_mm_sub_pd
    965   // CHECK: fsub <2 x double>
    966   return _mm_sub_pd(A, B);
    967 }
    968 
    969 __m128d test_mm_sub_sd(__m128d A, __m128d B) {
          // Exercises _mm_sub_sd(A, B). The emitted IR is expected to contain an
          // `fsub` on a scalar double rather than on a whole vector.
    970   // CHECK-LABEL: test_mm_sub_sd
    971   // CHECK: fsub double
    972   return _mm_sub_sd(A, B);
    973 }
    974 
    975 __m128i test_mm_subs_epi8(__m128i A, __m128i B) {
          // Exercises _mm_subs_epi8(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psubs.b returning <16 x i8>.
    976   // CHECK-LABEL: test_mm_subs_epi8
    977   // CHECK: call <16 x i8> @llvm.x86.sse2.psubs.b
    978   return _mm_subs_epi8(A, B);
    979 }
    980 
    981 __m128i test_mm_subs_epi16(__m128i A, __m128i B) {
          // Exercises _mm_subs_epi16(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psubs.w returning <8 x i16>.
    982   // CHECK-LABEL: test_mm_subs_epi16
    983   // CHECK: call <8 x i16> @llvm.x86.sse2.psubs.w
    984   return _mm_subs_epi16(A, B);
    985 }
    986 
    987 __m128i test_mm_subs_epu8(__m128i A, __m128i B) {
          // Exercises _mm_subs_epu8(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psubus.b returning <16 x i8>.
    988   // CHECK-LABEL: test_mm_subs_epu8
    989   // CHECK: call <16 x i8> @llvm.x86.sse2.psubus.b
    990   return _mm_subs_epu8(A, B);
    991 }
    992 
    993 __m128i test_mm_subs_epu16(__m128i A, __m128i B) {
          // Exercises _mm_subs_epu16(A, B). The emitted IR is expected to call
          // @llvm.x86.sse2.psubus.w returning <8 x i16>.
    994   // CHECK-LABEL: test_mm_subs_epu16
    995   // CHECK: call <8 x i16> @llvm.x86.sse2.psubus.w
    996   return _mm_subs_epu16(A, B);
    997 }
    998 
    999 int test_mm_ucomieq_sd(__m128d A, __m128d B) {
          // Exercises _mm_ucomieq_sd(A, B) and returns its int result. The emitted
          // IR is expected to call @llvm.x86.sse2.ucomieq.sd returning i32.
   1000   // CHECK-LABEL: test_mm_ucomieq_sd
   1001   // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd
   1002   return _mm_ucomieq_sd(A, B);
   1003 }
   1004 
   1005 int test_mm_ucomige_sd(__m128d A, __m128d B) {
          // Exercises _mm_ucomige_sd(A, B) and returns its int result. The emitted
          // IR is expected to call @llvm.x86.sse2.ucomige.sd returning i32.
   1006   // CHECK-LABEL: test_mm_ucomige_sd
   1007   // CHECK: call i32 @llvm.x86.sse2.ucomige.sd
   1008   return _mm_ucomige_sd(A, B);
   1009 }
   1010 
   1011 int test_mm_ucomigt_sd(__m128d A, __m128d B) {
          // Exercises _mm_ucomigt_sd(A, B) and returns its int result. The emitted
          // IR is expected to call @llvm.x86.sse2.ucomigt.sd returning i32.
   1012   // CHECK-LABEL: test_mm_ucomigt_sd
   1013   // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd
   1014   return _mm_ucomigt_sd(A, B);
   1015 }
   1016 
   1017 int test_mm_ucomile_sd(__m128d A, __m128d B) {
          // Exercises _mm_ucomile_sd(A, B) and returns its int result. The emitted
          // IR is expected to call @llvm.x86.sse2.ucomile.sd returning i32.
   1018   // CHECK-LABEL: test_mm_ucomile_sd
   1019   // CHECK: call i32 @llvm.x86.sse2.ucomile.sd
   1020   return _mm_ucomile_sd(A, B);
   1021 }
   1022 
   1023 int test_mm_ucomilt_sd(__m128d A, __m128d B) {
          // Exercises _mm_ucomilt_sd(A, B) and returns its int result. The emitted
          // IR is expected to call @llvm.x86.sse2.ucomilt.sd returning i32.
   1024   // CHECK-LABEL: test_mm_ucomilt_sd
   1025   // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd
   1026   return _mm_ucomilt_sd(A, B);
   1027 }
   1028 
   1029 int test_mm_ucomineq_sd(__m128d A, __m128d B) {
          // Exercises _mm_ucomineq_sd(A, B) and returns its int result. The emitted
          // IR is expected to call @llvm.x86.sse2.ucomineq.sd returning i32.
   1030   // CHECK-LABEL: test_mm_ucomineq_sd
   1031   // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd
   1032   return _mm_ucomineq_sd(A, B);
   1033 }
   1034 
   1035 __m128i test_mm_unpackhi_epi8(__m128i A, __m128i B) {
          // Exercises _mm_unpackhi_epi8(A, B). The emitted IR is expected to be a
          // shufflevector over two <16 x i8> operands whose mask alternates
          // indices 8..15 with indices 24..31.
   1036   // CHECK-LABEL: test_mm_unpackhi_epi8
   1037   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   1038   return _mm_unpackhi_epi8(A, B);
   1039 }
   1040 
   1041 __m128i test_mm_unpackhi_epi16(__m128i A, __m128i B) {
          // Exercises _mm_unpackhi_epi16(A, B). The emitted IR is expected to be a
          // shufflevector over two <8 x i16> operands whose mask alternates
          // indices 4..7 with indices 12..15.
   1042   // CHECK-LABEL: test_mm_unpackhi_epi16
   1043   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   1044   return _mm_unpackhi_epi16(A, B);
   1045 }
   1046 
   1047 __m128i test_mm_unpackhi_epi32(__m128i A, __m128i B) {
          // Exercises _mm_unpackhi_epi32(A, B). The emitted IR is expected to be a
          // shufflevector over two <4 x i32> operands with mask <2, 6, 3, 7>.
   1048   // CHECK-LABEL: test_mm_unpackhi_epi32
   1049   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   1050   return _mm_unpackhi_epi32(A, B);
   1051 }
   1052 
   1053 __m128i test_mm_unpackhi_epi64(__m128i A, __m128i B) {
          // Exercises _mm_unpackhi_epi64(A, B). The emitted IR is expected to be a
          // shufflevector over two <2 x i64> operands with mask <1, 3>.
   1054   // CHECK-LABEL: test_mm_unpackhi_epi64
   1055   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
   1056   return _mm_unpackhi_epi64(A, B);
   1057 }
   1058 
   1059 __m128d test_mm_unpackhi_pd(__m128d A, __m128d B) {
          // Exercises _mm_unpackhi_pd(A, B). The emitted IR is expected to be a
          // shufflevector over two <2 x double> operands with mask <1, 3>.
   1060   // CHECK-LABEL: test_mm_unpackhi_pd
   1061   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
   1062   return _mm_unpackhi_pd(A, B);
   1063 }
   1064 
   1065 __m128i test_mm_unpacklo_epi8(__m128i A, __m128i B) {
          // Exercises _mm_unpacklo_epi8(A, B). The emitted IR is expected to be a
          // shufflevector over two <16 x i8> operands whose mask alternates
          // indices 0..7 with indices 16..23.
   1066   // CHECK-LABEL: test_mm_unpacklo_epi8
   1067   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   1068   return _mm_unpacklo_epi8(A, B);
   1069 }
   1070 
   1071 __m128i test_mm_unpacklo_epi16(__m128i A, __m128i B) {
          // Exercises _mm_unpacklo_epi16(A, B). The emitted IR is expected to be a
          // shufflevector over two <8 x i16> operands whose mask alternates
          // indices 0..3 with indices 8..11.
   1072   // CHECK-LABEL: test_mm_unpacklo_epi16
   1073   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   1074   return _mm_unpacklo_epi16(A, B);
   1075 }
   1076 
   1077 __m128i test_mm_unpacklo_epi32(__m128i A, __m128i B) {
          // Exercises _mm_unpacklo_epi32(A, B). The emitted IR is expected to be a
          // shufflevector over two <4 x i32> operands with mask <0, 4, 1, 5>.
   1078   // CHECK-LABEL: test_mm_unpacklo_epi32
   1079   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   1080   return _mm_unpacklo_epi32(A, B);
   1081 }
   1082 
   1083 __m128i test_mm_unpacklo_epi64(__m128i A, __m128i B) {
          // Exercises _mm_unpacklo_epi64(A, B). The emitted IR is expected to be a
          // shufflevector over two <2 x i64> operands with mask <0, 2>.
   1084   // CHECK-LABEL: test_mm_unpacklo_epi64
   1085   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
   1086   return _mm_unpacklo_epi64(A, B);
   1087 }
   1088 
   1089 __m128d test_mm_unpacklo_pd(__m128d A, __m128d B) {
          // Exercises _mm_unpacklo_pd(A, B). The emitted IR is expected to be a
          // shufflevector over two <2 x double> operands with mask <0, 2>.
   1090   // CHECK-LABEL: test_mm_unpacklo_pd
   1091   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
   1092   return _mm_unpacklo_pd(A, B);
   1093 }
   1094 
   1095 __m128d test_mm_xor_pd(__m128d A, __m128d B) {
          // Exercises _mm_xor_pd(A, B). Although the arguments are __m128d, the
          // pattern expects the xor itself to be emitted on <4 x i32> values.
   1096   // CHECK-LABEL: test_mm_xor_pd
   1097   // CHECK: xor <4 x i32> %{{.*}}, %{{.*}}
   1098   return _mm_xor_pd(A, B);
   1099 }
   1100 
   1101 __m128i test_mm_xor_si128(__m128i A, __m128i B) {
          // Exercises _mm_xor_si128(A, B). The emitted IR is expected to contain an
          // `xor` instruction on <2 x i64> values.
   1102   // CHECK-LABEL: test_mm_xor_si128
   1103   // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   1104   return _mm_xor_si128(A, B);
   1105 }
   1106