Home | History | Annotate | Download | only in CodeGen
      1 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Werror | FileCheck %s
      2 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
      3 
      4 // Don't include mm_malloc.h, it's system specific.
      5 #define __MM_MALLOC_H
      6 
      7 #include <x86intrin.h>
      8 
      9 // NOTE: This should match the tests in llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
     10 
     11 __m128i test_mm_add_epi8(__m128i A, __m128i B) {
          // Calls _mm_add_epi8(A, B); expected IR: an `add` on <16 x i8>.
     12   // CHECK-LABEL: test_mm_add_epi8
     13   // CHECK: add <16 x i8>
     14   return _mm_add_epi8(A, B);
     15 }
     16 
     17 __m128i test_mm_add_epi16(__m128i A, __m128i B) {
          // Calls _mm_add_epi16(A, B); expected IR: an `add` on <8 x i16>.
     18   // CHECK-LABEL: test_mm_add_epi16
     19   // CHECK: add <8 x i16>
     20   return _mm_add_epi16(A, B);
     21 }
     22 
     23 __m128i test_mm_add_epi32(__m128i A, __m128i B) {
          // Calls _mm_add_epi32(A, B); expected IR: an `add` on <4 x i32>.
     24   // CHECK-LABEL: test_mm_add_epi32
     25   // CHECK: add <4 x i32>
     26   return _mm_add_epi32(A, B);
     27 }
     28 
     29 __m128i test_mm_add_epi64(__m128i A, __m128i B) {
          // Calls _mm_add_epi64(A, B); expected IR: an `add` on <2 x i64>.
     30   // CHECK-LABEL: test_mm_add_epi64
     31   // CHECK: add <2 x i64>
     32   return _mm_add_epi64(A, B);
     33 }
     34 
     35 __m128d test_mm_add_pd(__m128d A, __m128d B) {
          // Calls _mm_add_pd(A, B); expected IR: an `fadd` on <2 x double>.
     36   // CHECK-LABEL: test_mm_add_pd
     37   // CHECK: fadd <2 x double>
     38   return _mm_add_pd(A, B);
     39 }
     40 
     41 __m128d test_mm_add_sd(__m128d A, __m128d B) {
          // Calls _mm_add_sd(A, B); expected IR: element 0 is extracted from two
          // <2 x double> values, added with a scalar `fadd double`, and the result
          // is inserted back at element 0.
     42   // CHECK-LABEL: test_mm_add_sd
     43   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
     44   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
     45   // CHECK: fadd double
     46   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
     47   return _mm_add_sd(A, B);
     48 }
     49 
     50 __m128i test_mm_adds_epi8(__m128i A, __m128i B) {
          // Calls _mm_adds_epi8(A, B); expected IR: a call to
          // @llvm.x86.sse2.padds.b on two <16 x i8> operands.
     51   // CHECK-LABEL: test_mm_adds_epi8
     52   // CHECK: call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
     53   return _mm_adds_epi8(A, B);
     54 }
     55 
     56 __m128i test_mm_adds_epi16(__m128i A, __m128i B) {
          // Calls _mm_adds_epi16(A, B); expected IR: a call to
          // @llvm.x86.sse2.padds.w on two <8 x i16> operands.
     57   // CHECK-LABEL: test_mm_adds_epi16
     58   // CHECK: call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
     59   return _mm_adds_epi16(A, B);
     60 }
     61 
     62 __m128i test_mm_adds_epu8(__m128i A, __m128i B) {
          // Calls _mm_adds_epu8(A, B); expected IR: a call to
          // @llvm.x86.sse2.paddus.b on two <16 x i8> operands.
     63   // CHECK-LABEL: test_mm_adds_epu8
     64   // CHECK: call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
     65   return _mm_adds_epu8(A, B);
     66 }
     67 
     68 __m128i test_mm_adds_epu16(__m128i A, __m128i B) {
          // Calls _mm_adds_epu16(A, B); expected IR: a call to
          // @llvm.x86.sse2.paddus.w on two <8 x i16> operands.
     69   // CHECK-LABEL: test_mm_adds_epu16
     70   // CHECK: call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
     71   return _mm_adds_epu16(A, B);
     72 }
     73 
     74 __m128d test_mm_and_pd(__m128d A, __m128d B) {
          // Calls _mm_and_pd(A, B); expected IR: an `and` on the integer vector
          // type <4 x i32>, even though the C operands are __m128d.
     75   // CHECK-LABEL: test_mm_and_pd
     76   // CHECK: and <4 x i32>
     77   return _mm_and_pd(A, B);
     78 }
     79 
     80 __m128i test_mm_and_si128(__m128i A, __m128i B) {
          // Calls _mm_and_si128(A, B); expected IR: an `and` on <2 x i64>.
     81   // CHECK-LABEL: test_mm_and_si128
     82   // CHECK: and <2 x i64>
     83   return _mm_and_si128(A, B);
     84 }
     85 
     86 __m128d test_mm_andnot_pd(__m128d A, __m128d B) {
          // Calls _mm_andnot_pd(A, B); expected IR: an `xor` of a <4 x i32> value
          // with all-ones (-1 in each lane), followed by an `and` on <4 x i32>.
     87   // CHECK-LABEL: test_mm_andnot_pd
     88   // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
     89   // CHECK: and <4 x i32>
     90   return _mm_andnot_pd(A, B);
     91 }
     92 
     93 __m128i test_mm_andnot_si128(__m128i A, __m128i B) {
          // Calls _mm_andnot_si128(A, B); expected IR: an `xor` of a <2 x i64>
          // value with all-ones, followed by an `and` on <2 x i64>.
     94   // CHECK-LABEL: test_mm_andnot_si128
     95   // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
     96   // CHECK: and <2 x i64>
     97   return _mm_andnot_si128(A, B);
     98 }
     99 
    100 __m128i test_mm_avg_epu8(__m128i A, __m128i B) {
          // Calls _mm_avg_epu8(A, B); expected IR: a call to
          // @llvm.x86.sse2.pavg.b on two <16 x i8> operands.
    101   // CHECK-LABEL: test_mm_avg_epu8
    102   // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
    103   return _mm_avg_epu8(A, B);
    104 }
    105 
    106 __m128i test_mm_avg_epu16(__m128i A, __m128i B) {
          // Calls _mm_avg_epu16(A, B); expected IR: a call to
          // @llvm.x86.sse2.pavg.w on two <8 x i16> operands.
    107   // CHECK-LABEL: test_mm_avg_epu16
    108   // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    109   return _mm_avg_epu16(A, B);
    110 }
    111 
    112 __m128i test_mm_bslli_si128(__m128i A) {
          // Calls _mm_bslli_si128(A, 5); expected IR: one <16 x i8> shufflevector
          // whose mask selects indices 11 through 26 of the two source vectors.
    113   // CHECK-LABEL: test_mm_bslli_si128
    114   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
    115   return _mm_bslli_si128(A, 5);
    116 }
    117 
    118 __m128i test_mm_bsrli_si128(__m128i A) {
          // Calls _mm_bsrli_si128(A, 5); expected IR: one <16 x i8> shufflevector
          // whose mask selects indices 5 through 20 of the two source vectors.
    119   // CHECK-LABEL: test_mm_bsrli_si128
    120   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
    121   return _mm_bsrli_si128(A, 5);
    122 }
    123 
    124 __m128 test_mm_castpd_ps(__m128d A) {
          // Calls _mm_castpd_ps(A); expected IR: a bitcast from <2 x double> to
          // <4 x float>.
    125   // CHECK-LABEL: test_mm_castpd_ps
    126   // CHECK: bitcast <2 x double> %{{.*}} to <4 x float>
    127   return _mm_castpd_ps(A);
    128 }
    129 
    130 __m128i test_mm_castpd_si128(__m128d A) {
          // Calls _mm_castpd_si128(A); expected IR: a bitcast from <2 x double> to
          // <2 x i64>.
    131   // CHECK-LABEL: test_mm_castpd_si128
    132   // CHECK: bitcast <2 x double> %{{.*}} to <2 x i64>
    133   return _mm_castpd_si128(A);
    134 }
    135 
    136 __m128d test_mm_castps_pd(__m128 A) {
          // Calls _mm_castps_pd(A); expected IR: a bitcast from <4 x float> to
          // <2 x double>.
    137   // CHECK-LABEL: test_mm_castps_pd
    138   // CHECK: bitcast <4 x float> %{{.*}} to <2 x double>
    139   return _mm_castps_pd(A);
    140 }
    141 
    142 __m128i test_mm_castps_si128(__m128 A) {
          // Calls _mm_castps_si128(A); expected IR: a bitcast from <4 x float> to
          // <2 x i64>.
    143   // CHECK-LABEL: test_mm_castps_si128
    144   // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
    145   return _mm_castps_si128(A);
    146 }
    147 
    148 __m128d test_mm_castsi128_pd(__m128i A) {
          // Calls _mm_castsi128_pd(A); expected IR: a bitcast from <2 x i64> to
          // <2 x double>.
    149   // CHECK-LABEL: test_mm_castsi128_pd
    150   // CHECK: bitcast <2 x i64> %{{.*}} to <2 x double>
    151   return _mm_castsi128_pd(A);
    152 }
    153 
    154 __m128 test_mm_castsi128_ps(__m128i A) {
          // Calls _mm_castsi128_ps(A); expected IR: a bitcast from <2 x i64> to
          // <4 x float>.
    155   // CHECK-LABEL: test_mm_castsi128_ps
    156   // CHECK: bitcast <2 x i64> %{{.*}} to <4 x float>
    157   return _mm_castsi128_ps(A);
    158 }
    159 
    160 void test_mm_clflush(void* A) {
          // Calls _mm_clflush(A); expected IR: a void call to
          // @llvm.x86.sse2.clflush taking an i8* argument.
    161   // CHECK-LABEL: test_mm_clflush
    162   // CHECK: call void @llvm.x86.sse2.clflush(i8* %{{.*}})
    163   _mm_clflush(A);
    164 }
    165 
    166 __m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
          // Calls _mm_cmpeq_epi8(A, B); expected IR: `icmp eq` on <16 x i8>.
    167   // CHECK-LABEL: test_mm_cmpeq_epi8
    168   // CHECK: icmp eq <16 x i8>
    169   return _mm_cmpeq_epi8(A, B);
    170 }
    171 
    172 __m128i test_mm_cmpeq_epi16(__m128i A, __m128i B) {
          // Calls _mm_cmpeq_epi16(A, B); expected IR: `icmp eq` on <8 x i16>.
    173   // CHECK-LABEL: test_mm_cmpeq_epi16
    174   // CHECK: icmp eq <8 x i16>
    175   return _mm_cmpeq_epi16(A, B);
    176 }
    177 
    178 __m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
          // Calls _mm_cmpeq_epi32(A, B); expected IR: `icmp eq` on <4 x i32>.
    179   // CHECK-LABEL: test_mm_cmpeq_epi32
    180   // CHECK: icmp eq <4 x i32>
    181   return _mm_cmpeq_epi32(A, B);
    182 }
    183 
    184 __m128d test_mm_cmpeq_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpeq_pd(A, B); expected IR: `fcmp oeq` on <2 x double>, then
          // on consecutive lines a sext of the <2 x i1> result to <2 x i64>, a
          // bitcast to <2 x double>, and the ret of that value.
    185   // CHECK-LABEL: test_mm_cmpeq_pd
    186   // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
    187   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    188   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    189   // CHECK-NEXT:    ret <2 x double> [[BC]]
    190   return _mm_cmpeq_pd(A, B);
    191 }
    192 
    193 __m128d test_mm_cmpeq_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpeq_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 0.
    194   // CHECK-LABEL: test_mm_cmpeq_sd
    195   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
    196   return _mm_cmpeq_sd(A, B);
    197 }
    198 
    199 __m128d test_mm_cmpge_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpge_pd(A, B); expected IR: `fcmp ole` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
          // NOTE(review): the predicate is the same one expected for
          // _mm_cmple_pd; presumably the header swaps the operands - confirm
          // against emmintrin.h.
    200   // CHECK-LABEL: test_mm_cmpge_pd
    201   // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
    202   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    203   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    204   // CHECK-NEXT:    ret <2 x double> [[BC]]
    205   return _mm_cmpge_pd(A, B);
    206 }
    207 
    208 __m128d test_mm_cmpge_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpge_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 2, then element 0 extracted and
          // inserted into an undef vector, and element 1 extracted and inserted at
          // index 1.
          // NOTE(review): immediate 2 is also what the _mm_cmple_sd test expects;
          // presumably the operands are swapped in the header - confirm.
    209   // CHECK-LABEL: test_mm_cmpge_sd
    210   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
    211   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    212   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    213   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
    214   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    215   return _mm_cmpge_sd(A, B);
    216 }
    217 
    218 __m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
          // Calls _mm_cmpgt_epi8(A, B); expected IR: `icmp sgt` on <16 x i8>.
    219   // CHECK-LABEL: test_mm_cmpgt_epi8
    220   // CHECK: icmp sgt <16 x i8>
    221   return _mm_cmpgt_epi8(A, B);
    222 }
    223 
    224 __m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
          // Calls _mm_cmpgt_epi16(A, B); expected IR: `icmp sgt` on <8 x i16>.
    225   // CHECK-LABEL: test_mm_cmpgt_epi16
    226   // CHECK: icmp sgt <8 x i16>
    227   return _mm_cmpgt_epi16(A, B);
    228 }
    229 
    230 __m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
          // Calls _mm_cmpgt_epi32(A, B); expected IR: `icmp sgt` on <4 x i32>.
    231   // CHECK-LABEL: test_mm_cmpgt_epi32
    232   // CHECK: icmp sgt <4 x i32>
    233   return _mm_cmpgt_epi32(A, B);
    234 }
    235 
    236 __m128d test_mm_cmpgt_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpgt_pd(A, B); expected IR: `fcmp olt` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
          // NOTE(review): the predicate is the same one expected for
          // _mm_cmplt_pd; presumably the header swaps the operands - confirm.
    237   // CHECK-LABEL: test_mm_cmpgt_pd
    238   // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
    239   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    240   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    241   // CHECK-NEXT:    ret <2 x double> [[BC]]
    242   return _mm_cmpgt_pd(A, B);
    243 }
    244 
    245 __m128d test_mm_cmpgt_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpgt_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 1, then element 0 extracted and
          // inserted into an undef vector, and element 1 extracted and inserted at
          // index 1.
          // NOTE(review): immediate 1 is also what the _mm_cmplt_sd test expects;
          // presumably the operands are swapped in the header - confirm.
    246   // CHECK-LABEL: test_mm_cmpgt_sd
    247   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
    248   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    249   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    250   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
    251   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    252   return _mm_cmpgt_sd(A, B);
    253 }
    254 
    255 __m128d test_mm_cmple_pd(__m128d A, __m128d B) {
          // Calls _mm_cmple_pd(A, B); expected IR: `fcmp ole` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
    256   // CHECK-LABEL: test_mm_cmple_pd
    257   // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
    258   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    259   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    260   // CHECK-NEXT:    ret <2 x double> [[BC]]
    261   return _mm_cmple_pd(A, B);
    262 }
    263 
    264 __m128d test_mm_cmple_sd(__m128d A, __m128d B) {
          // Calls _mm_cmple_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 2.
    265   // CHECK-LABEL: test_mm_cmple_sd
    266   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
    267   return _mm_cmple_sd(A, B);
    268 }
    269 
    270 __m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
          // Calls _mm_cmplt_epi8(A, B); expected IR: `icmp sgt` on <16 x i8>.
          // NOTE(review): same predicate as the _mm_cmpgt_epi8 test; presumably
          // the header swaps the operands - confirm.
    271   // CHECK-LABEL: test_mm_cmplt_epi8
    272   // CHECK: icmp sgt <16 x i8>
    273   return _mm_cmplt_epi8(A, B);
    274 }
    275 
    276 __m128i test_mm_cmplt_epi16(__m128i A, __m128i B) {
          // Calls _mm_cmplt_epi16(A, B); expected IR: `icmp sgt` on <8 x i16>.
          // NOTE(review): same predicate as the _mm_cmpgt_epi16 test; presumably
          // the header swaps the operands - confirm.
    277   // CHECK-LABEL: test_mm_cmplt_epi16
    278   // CHECK: icmp sgt <8 x i16>
    279   return _mm_cmplt_epi16(A, B);
    280 }
    281 
    282 __m128i test_mm_cmplt_epi32(__m128i A, __m128i B) {
          // Calls _mm_cmplt_epi32(A, B); expected IR: `icmp sgt` on <4 x i32>.
          // NOTE(review): same predicate as the _mm_cmpgt_epi32 test; presumably
          // the header swaps the operands - confirm.
    283   // CHECK-LABEL: test_mm_cmplt_epi32
    284   // CHECK: icmp sgt <4 x i32>
    285   return _mm_cmplt_epi32(A, B);
    286 }
    287 
    288 __m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
          // Calls _mm_cmplt_pd(A, B); expected IR: `fcmp olt` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
    289   // CHECK-LABEL: test_mm_cmplt_pd
    290   // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
    291   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    292   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    293   // CHECK-NEXT:    ret <2 x double> [[BC]]
    294   return _mm_cmplt_pd(A, B);
    295 }
    296 
    297 __m128d test_mm_cmplt_sd(__m128d A, __m128d B) {
          // Calls _mm_cmplt_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 1.
    298   // CHECK-LABEL: test_mm_cmplt_sd
    299   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
    300   return _mm_cmplt_sd(A, B);
    301 }
    302 
    303 __m128d test_mm_cmpneq_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpneq_pd(A, B); expected IR: `fcmp une` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
    304   // CHECK-LABEL: test_mm_cmpneq_pd
    305   // CHECK:         [[CMP:%.*]] = fcmp une <2 x double>
    306   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    307   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    308   // CHECK-NEXT:    ret <2 x double> [[BC]]
    309   return _mm_cmpneq_pd(A, B);
    310 }
    311 
    312 __m128d test_mm_cmpneq_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpneq_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 4.
    313   // CHECK-LABEL: test_mm_cmpneq_sd
    314   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
    315   return _mm_cmpneq_sd(A, B);
    316 }
    317 
    318 __m128d test_mm_cmpnge_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpnge_pd(A, B); expected IR: `fcmp ugt` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
          // NOTE(review): the predicate is the same one expected for
          // _mm_cmpnle_pd; presumably the header swaps the operands - confirm.
    319   // CHECK-LABEL: test_mm_cmpnge_pd
    320   // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
    321   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    322   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    323   // CHECK-NEXT:    ret <2 x double> [[BC]]
    324   return _mm_cmpnge_pd(A, B);
    325 }
    326 
    327 __m128d test_mm_cmpnge_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpnge_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 6, then element 0 extracted and
          // inserted into an undef vector, and element 1 extracted and inserted at
          // index 1.
          // NOTE(review): immediate 6 is also what the _mm_cmpnle_sd test expects;
          // presumably the operands are swapped in the header - confirm.
    328   // CHECK-LABEL: test_mm_cmpnge_sd
    329   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
    330   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    331   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    332   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
    333   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    334   return _mm_cmpnge_sd(A, B);
    335 }
    336 
    337 __m128d test_mm_cmpngt_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpngt_pd(A, B); expected IR: `fcmp uge` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
          // NOTE(review): the predicate is the same one expected for
          // _mm_cmpnlt_pd; presumably the header swaps the operands - confirm.
    338   // CHECK-LABEL: test_mm_cmpngt_pd
    339   // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
    340   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    341   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    342   // CHECK-NEXT:    ret <2 x double> [[BC]]
    343   return _mm_cmpngt_pd(A, B);
    344 }
    345 
    346 __m128d test_mm_cmpngt_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpngt_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 5, then element 0 extracted and
          // inserted into an undef vector, and element 1 extracted and inserted at
          // index 1.
          // NOTE(review): immediate 5 is also what the _mm_cmpnlt_sd test expects;
          // presumably the operands are swapped in the header - confirm.
    347   // CHECK-LABEL: test_mm_cmpngt_sd
    348   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
    349   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    350   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    351   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
    352   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    353   return _mm_cmpngt_sd(A, B);
    354 }
    355 
    356 __m128d test_mm_cmpnle_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpnle_pd(A, B); expected IR: `fcmp ugt` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
    357   // CHECK-LABEL: test_mm_cmpnle_pd
    358   // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
    359   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    360   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    361   // CHECK-NEXT:    ret <2 x double> [[BC]]
    362   return _mm_cmpnle_pd(A, B);
    363 }
    364 
    365 __m128d test_mm_cmpnle_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpnle_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 6.
    366   // CHECK-LABEL: test_mm_cmpnle_sd
    367   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
    368   return _mm_cmpnle_sd(A, B);
    369 }
    370 
    371 __m128d test_mm_cmpnlt_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpnlt_pd(A, B); expected IR: `fcmp uge` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
    372   // CHECK-LABEL: test_mm_cmpnlt_pd
    373   // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
    374   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    375   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    376   // CHECK-NEXT:    ret <2 x double> [[BC]]
    377   return _mm_cmpnlt_pd(A, B);
    378 }
    379 
    380 __m128d test_mm_cmpnlt_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpnlt_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 5.
    381   // CHECK-LABEL: test_mm_cmpnlt_sd
    382   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
    383   return _mm_cmpnlt_sd(A, B);
    384 }
    385 
    386 __m128d test_mm_cmpord_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpord_pd(A, B); expected IR: `fcmp ord` on <2 x double>, then
          // on consecutive lines sext to <2 x i64>, bitcast to <2 x double>, ret.
    387   // CHECK-LABEL: test_mm_cmpord_pd
    388   // CHECK:         [[CMP:%.*]] = fcmp ord <2 x double>
    389   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    390   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    391   // CHECK-NEXT:    ret <2 x double> [[BC]]
    392   return _mm_cmpord_pd(A, B);
    393 }
    394 
    395 __m128d test_mm_cmpord_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpord_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 7.
    396   // CHECK-LABEL: test_mm_cmpord_sd
    397   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
    398   return _mm_cmpord_sd(A, B);
    399 }
    400 
    401 __m128d test_mm_cmpunord_pd(__m128d A, __m128d B) {
          // Calls _mm_cmpunord_pd(A, B); expected IR: `fcmp uno` on <2 x double>,
          // then on consecutive lines sext to <2 x i64>, bitcast to <2 x double>,
          // ret.
    402   // CHECK-LABEL: test_mm_cmpunord_pd
    403   // CHECK:         [[CMP:%.*]] = fcmp uno <2 x double>
    404   // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
    405   // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
    406   // CHECK-NEXT:    ret <2 x double> [[BC]]
    407   return _mm_cmpunord_pd(A, B);
    408 }
    409 
    410 __m128d test_mm_cmpunord_sd(__m128d A, __m128d B) {
          // Calls _mm_cmpunord_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.cmp.sd with immediate i8 3.
    411   // CHECK-LABEL: test_mm_cmpunord_sd
    412   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
    413   return _mm_cmpunord_sd(A, B);
    414 }
    415 
    416 int test_mm_comieq_sd(__m128d A, __m128d B) {
          // Calls _mm_comieq_sd(A, B); expected IR: an i32-returning call to
          // @llvm.x86.sse2.comieq.sd on two <2 x double> operands.
    417   // CHECK-LABEL: test_mm_comieq_sd
    418   // CHECK: call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    419   return _mm_comieq_sd(A, B);
    420 }
    421 
    422 int test_mm_comige_sd(__m128d A, __m128d B) {
          // Calls _mm_comige_sd(A, B); expected IR: an i32-returning call to
          // @llvm.x86.sse2.comige.sd on two <2 x double> operands.
    423   // CHECK-LABEL: test_mm_comige_sd
    424   // CHECK: call i32 @llvm.x86.sse2.comige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    425   return _mm_comige_sd(A, B);
    426 }
    427 
    428 int test_mm_comigt_sd(__m128d A, __m128d B) {
          // Calls _mm_comigt_sd(A, B); expected IR: an i32-returning call to
          // @llvm.x86.sse2.comigt.sd on two <2 x double> operands.
    429   // CHECK-LABEL: test_mm_comigt_sd
    430   // CHECK: call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    431   return _mm_comigt_sd(A, B);
    432 }
    433 
    434 int test_mm_comile_sd(__m128d A, __m128d B) {
          // Calls _mm_comile_sd(A, B); expected IR: an i32-returning call to
          // @llvm.x86.sse2.comile.sd on two <2 x double> operands.
    435   // CHECK-LABEL: test_mm_comile_sd
    436   // CHECK: call i32 @llvm.x86.sse2.comile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    437   return _mm_comile_sd(A, B);
    438 }
    439 
    440 int test_mm_comilt_sd(__m128d A, __m128d B) {
          // Calls _mm_comilt_sd(A, B); expected IR: an i32-returning call to
          // @llvm.x86.sse2.comilt.sd on two <2 x double> operands.
    441   // CHECK-LABEL: test_mm_comilt_sd
    442   // CHECK: call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    443   return _mm_comilt_sd(A, B);
    444 }
    445 
    446 int test_mm_comineq_sd(__m128d A, __m128d B) {
          // Calls _mm_comineq_sd(A, B); expected IR: an i32-returning call to
          // @llvm.x86.sse2.comineq.sd on two <2 x double> operands.
    447   // CHECK-LABEL: test_mm_comineq_sd
    448   // CHECK: call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    449   return _mm_comineq_sd(A, B);
    450 }
    451 
    452 __m128d test_mm_cvtepi32_pd(__m128i A) {
          // Calls _mm_cvtepi32_pd(A); expected IR: a shufflevector picking
          // elements 0 and 1 of a <4 x i32>, then sitofp from <2 x i32> to
          // <2 x double>.
    453   // CHECK-LABEL: test_mm_cvtepi32_pd
    454   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1>
    455   // CHECK: sitofp <2 x i32> %{{.*}} to <2 x double>
    456   return _mm_cvtepi32_pd(A);
    457 }
    458 
    459 __m128 test_mm_cvtepi32_ps(__m128i A) {
          // Calls _mm_cvtepi32_ps(A); expected IR: a call to
          // @llvm.x86.sse2.cvtdq2ps taking <4 x i32> and returning <4 x float>.
    460   // CHECK-LABEL: test_mm_cvtepi32_ps
    461   // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %{{.*}})
    462   return _mm_cvtepi32_ps(A);
    463 }
    464 
    465 __m128i test_mm_cvtpd_epi32(__m128d A) {
          // Calls _mm_cvtpd_epi32(A); expected IR: a call to
          // @llvm.x86.sse2.cvtpd2dq taking <2 x double> and returning <4 x i32>.
    466   // CHECK-LABEL: test_mm_cvtpd_epi32
    467   // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %{{.*}})
    468   return _mm_cvtpd_epi32(A);
    469 }
    470 
    471 __m128 test_mm_cvtpd_ps(__m128d A) {
          // Calls _mm_cvtpd_ps(A); expected IR: a call to
          // @llvm.x86.sse2.cvtpd2ps taking <2 x double> and returning <4 x float>.
    472   // CHECK-LABEL: test_mm_cvtpd_ps
    473   // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %{{.*}})
    474   return _mm_cvtpd_ps(A);
    475 }
    476 
    477 __m128i test_mm_cvtps_epi32(__m128 A) {
          // Calls _mm_cvtps_epi32(A); expected IR: a call to
          // @llvm.x86.sse2.cvtps2dq taking <4 x float> and returning <4 x i32>.
    478   // CHECK-LABEL: test_mm_cvtps_epi32
    479   // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %{{.*}})
    480   return _mm_cvtps_epi32(A);
    481 }
    482 
    483 __m128d test_mm_cvtps_pd(__m128 A) {
          // Calls _mm_cvtps_pd(A); expected IR: a shufflevector picking elements 0
          // and 1 of a <4 x float>, then fpext from <2 x float> to <2 x double>.
    484   // CHECK-LABEL: test_mm_cvtps_pd
    485   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 0, i32 1>
    486   // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
    487   return _mm_cvtps_pd(A);
    488 }
    489 
    490 double test_mm_cvtsd_f64(__m128d A) {
          // Calls _mm_cvtsd_f64(A); expected IR: an extractelement of element 0
          // from a <2 x double>.
    491   // CHECK-LABEL: test_mm_cvtsd_f64
    492   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    493   return _mm_cvtsd_f64(A);
    494 }
    495 
    496 int test_mm_cvtsd_si32(__m128d A) {
          // Calls _mm_cvtsd_si32(A); expected IR: an i32-returning call to
          // @llvm.x86.sse2.cvtsd2si on a <2 x double>.
    497   // CHECK-LABEL: test_mm_cvtsd_si32
    498   // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
    499   return _mm_cvtsd_si32(A);
    500 }
    501 
    502 long long test_mm_cvtsd_si64(__m128d A) {
          // Calls _mm_cvtsd_si64(A); expected IR: an i64-returning call to
          // @llvm.x86.sse2.cvtsd2si64 on a <2 x double>.
    503   // CHECK-LABEL: test_mm_cvtsd_si64
    504   // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
    505   return _mm_cvtsd_si64(A);
    506 }
    507 
    508 __m128 test_mm_cvtsd_ss(__m128 A, __m128d B) {
          // Calls _mm_cvtsd_ss(A, B); expected IR: an fptrunc from double to float.
    509   // CHECK-LABEL: test_mm_cvtsd_ss
    510   // CHECK: fptrunc double %{{.*}} to float
    511   return _mm_cvtsd_ss(A, B);
    512 }
    513 
    514 int test_mm_cvtsi128_si32(__m128i A) {
          // Calls _mm_cvtsi128_si32(A); expected IR: an extractelement of element 0
          // from a <4 x i32>.
    515   // CHECK-LABEL: test_mm_cvtsi128_si32
    516   // CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    517   return _mm_cvtsi128_si32(A);
    518 }
    519 
    520 long long test_mm_cvtsi128_si64(__m128i A) {
          // Calls _mm_cvtsi128_si64(A); expected IR: an extractelement of element 0
          // from a <2 x i64>.
    521   // CHECK-LABEL: test_mm_cvtsi128_si64
    522   // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
    523   return _mm_cvtsi128_si64(A);
    524 }
    525 
    526 __m128d test_mm_cvtsi32_sd(__m128d A, int B) {
          // Calls _mm_cvtsi32_sd(A, B); expected IR: sitofp from i32 to double,
          // then insertelement of that double at element 0 of a <2 x double>.
    527   // CHECK-LABEL: test_mm_cvtsi32_sd
    528   // CHECK: sitofp i32 %{{.*}} to double
    529   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    530   return _mm_cvtsi32_sd(A, B);
    531 }
    532 
    533 __m128i test_mm_cvtsi32_si128(int A) {
          // Calls _mm_cvtsi32_si128(A); expected IR: four insertelements building
          // a <4 x i32> - the value at element 0 of an undef vector, then the
          // constant 0 at elements 1, 2 and 3.
    534   // CHECK-LABEL: test_mm_cvtsi32_si128
    535   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
    536   // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1
    537   // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2
    538   // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3
    539   return _mm_cvtsi32_si128(A);
    540 }
    541 
    542 __m128d test_mm_cvtsi64_sd(__m128d A, long long B) {
          // Calls _mm_cvtsi64_sd(A, B); expected IR: sitofp from i64 to double,
          // then insertelement of that double at element 0 of a <2 x double>.
    543   // CHECK-LABEL: test_mm_cvtsi64_sd
    544   // CHECK: sitofp i64 %{{.*}} to double
    545   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    546   return _mm_cvtsi64_sd(A, B);
    547 }
    548 
    549 __m128i test_mm_cvtsi64_si128(long long A) {
          // Calls _mm_cvtsi64_si128(A); expected IR: the value inserted at element
          // 0 of an undef <2 x i64>, then the constant 0 inserted at element 1.
    550   // CHECK-LABEL: test_mm_cvtsi64_si128
    551   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
    552   // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
    553   return _mm_cvtsi64_si128(A);
    554 }
    555 
    556 __m128d test_mm_cvtss_sd(__m128d A, __m128 B) {
          // Calls _mm_cvtss_sd(A, B); expected IR: element 0 extracted from a
          // <4 x float>, fpext from float to double, and the result inserted at
          // element 0 of a <2 x double>.
    557   // CHECK-LABEL: test_mm_cvtss_sd
    558   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
    559   // CHECK: fpext float %{{.*}} to double
    560   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    561   return _mm_cvtss_sd(A, B);
    562 }
    563 
    564 __m128i test_mm_cvttpd_epi32(__m128d A) {
          // Calls _mm_cvttpd_epi32(A); expected IR: a call to
          // @llvm.x86.sse2.cvttpd2dq taking <2 x double> and returning <4 x i32>.
    565   // CHECK-LABEL: test_mm_cvttpd_epi32
    566   // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %{{.*}})
    567   return _mm_cvttpd_epi32(A);
    568 }
    569 
    570 __m128i test_mm_cvttps_epi32(__m128 A) {
          // Calls _mm_cvttps_epi32(A); expected IR: a generic fptosi from
          // <4 x float> to <4 x i32> (no target intrinsic call is matched).
    571   // CHECK-LABEL: test_mm_cvttps_epi32
    572   // CHECK: fptosi <4 x float> %{{.*}} to <4 x i32>
    573   return _mm_cvttps_epi32(A);
    574 }
    575 
    576 int test_mm_cvttsd_si32(__m128d A) {
          // Calls _mm_cvttsd_si32(A); expected IR: element 0 extracted from a
          // <2 x double>, then fptosi from double to i32.
    577   // CHECK-LABEL: test_mm_cvttsd_si32
    578   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    579   // CHECK: fptosi double %{{.*}} to i32
    580   return _mm_cvttsd_si32(A);
    581 }
    582 
    583 long long test_mm_cvttsd_si64(__m128d A) {
          // Calls _mm_cvttsd_si64(A); expected IR: element 0 extracted from a
          // <2 x double>, then fptosi from double to i64.
    584   // CHECK-LABEL: test_mm_cvttsd_si64
    585   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    586   // CHECK: fptosi double %{{.*}} to i64
    587   return _mm_cvttsd_si64(A);
    588 }
    589 
    590 __m128d test_mm_div_pd(__m128d A, __m128d B) {
          // Calls _mm_div_pd(A, B); expected IR: an `fdiv` on <2 x double>.
    591   // CHECK-LABEL: test_mm_div_pd
    592   // CHECK: fdiv <2 x double>
    593   return _mm_div_pd(A, B);
    594 }
    595 
    596 __m128d test_mm_div_sd(__m128d A, __m128d B) {
          // Calls _mm_div_sd(A, B); expected IR: element 0 is extracted from two
          // <2 x double> values, divided with a scalar `fdiv double`, and the
          // result is inserted back at element 0.
    597   // CHECK-LABEL: test_mm_div_sd
    598   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    599   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    600   // CHECK: fdiv double
    601   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    602   return _mm_div_sd(A, B);
    603 }
    604 
    605 // Lowering to pextrw requires optimization.
    606 int test_mm_extract_epi16(__m128i A) {
          // Calls _mm_extract_epi16(A, 9). The index 9 is larger than the highest
          // lane of an 8-element vector; expected IR: an i32 `and` with 7, an
          // extractelement from <8 x i16> at that masked index, and a zext of the
          // i16 result to i32.
    607   // CHECK-LABEL: test_mm_extract_epi16
    608   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
    609   // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
    610   // CHECK: zext i16 %{{.*}} to i32
    611   return _mm_extract_epi16(A, 9);
    612 }
    613 
    614 __m128i test_mm_insert_epi16(__m128i A, int B) {
          // Calls _mm_insert_epi16(A, B, 8). The index 8 is larger than the highest
          // lane of an 8-element vector; expected IR: an i32 `and` with 7, then an
          // insertelement into <8 x i16> at that masked index.
    615   // CHECK-LABEL: test_mm_insert_epi16
    616   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
    617   // CHECK: insertelement <8 x i16> %{{.*}}, i32 [[x]]
    618   return _mm_insert_epi16(A, B, 8);
    619 }
    620 
    621 void test_mm_lfence(void) {
          // Calls _mm_lfence(); expected IR: a void call to
          // @llvm.x86.sse2.lfence with no arguments.
          // The parameter list is spelled (void) so the definition provides a
          // real prototype instead of an old-style empty parameter list.
    622   // CHECK-LABEL: test_mm_lfence
    623   // CHECK: call void @llvm.x86.sse2.lfence()
    624   _mm_lfence();
    625 }
    626 
    627 __m128d test_mm_load_pd(double const* A) {
          // Calls _mm_load_pd(A); expected IR: a <2 x double> load with align 16.
    628   // CHECK-LABEL: test_mm_load_pd
    629   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 16
    630   return _mm_load_pd(A);
    631 }
    632 
    633 __m128d test_mm_load_pd1(double const* A) {
          // Calls _mm_load_pd1(A); expected IR: a scalar double load with align 8,
          // inserted into an undef <2 x double> at element 0 and then at element 1.
    634   // CHECK-LABEL: test_mm_load_pd1
    635   // CHECK: load double, double* %{{.*}}, align 8
    636   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    637   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    638   return _mm_load_pd1(A);
    639 }
    640 
    641 __m128d test_mm_load_sd(double const* A) {
          // Calls _mm_load_sd(A); expected IR: a scalar double load whose alignment
          // is exactly 1 (the pattern is anchored at end of line, so e.g. align 16
          // would not match).
    642   // CHECK-LABEL: test_mm_load_sd
    643   // CHECK: load double, double* %{{.*}}, align 1{{$}}
    644   return _mm_load_sd(A);
    645 }
    646 
    647 __m128i test_mm_load_si128(__m128i const* A) {
          // Calls _mm_load_si128(A); expected IR: a <2 x i64> load with align 16.
    648   // CHECK-LABEL: test_mm_load_si128
    649   // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16
    650   return _mm_load_si128(A);
    651 }
    652 
    653 __m128d test_mm_load1_pd(double const* A) {
          // Calls _mm_load1_pd(A); expected IR: a scalar double load with align 8,
          // inserted into an undef <2 x double> at element 0 and then at element 1.
    654   // CHECK-LABEL: test_mm_load1_pd
    655   // CHECK: load double, double* %{{.*}}, align 8
    656   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    657   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    658   return _mm_load1_pd(A);
    659 }
    660 
    661 __m128d test_mm_loadh_pd(__m128d x, void* y) {
          // Calls _mm_loadh_pd(x, y); expected IR: a scalar double load with
          // alignment exactly 1, inserted at element 1 of a <2 x double>.
    662   // CHECK-LABEL: test_mm_loadh_pd
    663   // CHECK: load double, double* %{{.*}}, align 1{{$}}
    664   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    665   return _mm_loadh_pd(x, y);
    666 }
    667 
    668 __m128i test_mm_loadl_epi64(__m128i* y) {
          // Calls _mm_loadl_epi64(y); expected IR: an i64 load with alignment
          // exactly 1, inserted at element 0 of an undef <2 x i64>, then the
          // constant 0 inserted at element 1.
          // The function name is matched with the LABEL form of the directive so
          // that the matches below are confined to this function's IR, as in the
          // other tests of this file.
    669   // CHECK-LABEL: test_mm_loadl_epi64
    670   // CHECK: load i64, i64* {{.*}}, align 1{{$}}
    671   // CHECK: insertelement <2 x i64> undef, i64 {{.*}}, i32 0
    672   // CHECK: insertelement <2 x i64> {{.*}}, i64 0, i32 1
    673   return _mm_loadl_epi64(y);
    674 }
    675 
    676 __m128d test_mm_loadl_pd(__m128d x, void* y) {
          // Calls _mm_loadl_pd(x, y); expected IR: a scalar double load with
          // alignment exactly 1 inserted at element 0 of an undef <2 x double>,
          // then element 1 extracted from a <2 x double> and inserted at element 1.
    677   // CHECK-LABEL: test_mm_loadl_pd
    678   // CHECK: load double, double* %{{.*}}, align 1{{$}}
    679   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    680   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
    681   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    682   return _mm_loadl_pd(x, y);
    683 }
    684 
    685 __m128d test_mm_loadr_pd(double const* A) {
          // Calls _mm_loadr_pd(A); expected IR: a <2 x double> load with align 16,
          // then a shufflevector with mask <1, 0> (the two elements swapped).
    686   // CHECK-LABEL: test_mm_loadr_pd
    687   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 16
    688   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
    689   return _mm_loadr_pd(A);
    690 }
    691 
    692 __m128d test_mm_loadu_pd(double const* A) {
          // Calls _mm_loadu_pd(A); expected IR: a <2 x double> load with alignment
          // exactly 1.
    693   // CHECK-LABEL: test_mm_loadu_pd
    694   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
    695   return _mm_loadu_pd(A);
    696 }
    697 
    698 __m128i test_mm_loadu_si128(__m128i const* A) {
          // Calls _mm_loadu_si128(A); expected IR: a <2 x i64> load with alignment
          // exactly 1.
    699   // CHECK-LABEL: test_mm_loadu_si128
    700   // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
    701   return _mm_loadu_si128(A);
    702 }
    703 
    704 __m128i test_mm_loadu_si64(void const* A) {
          // Calls _mm_loadu_si64(A); expected IR: an i64 load with alignment
          // exactly 1, inserted at element 0 of an undef <2 x i64>, then the
          // constant 0 inserted at element 1.
    705   // CHECK-LABEL: test_mm_loadu_si64
    706   // CHECK: load i64, i64* %{{.*}}, align 1{{$}}
    707   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
    708   // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
    709   return _mm_loadu_si64(A);
    710 }
    711 
    712 __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
          // Calls _mm_madd_epi16(A, B); expected IR: a call to
          // @llvm.x86.sse2.pmadd.wd taking two <8 x i16> and returning <4 x i32>.
    713   // CHECK-LABEL: test_mm_madd_epi16
    714   // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    715   return _mm_madd_epi16(A, B);
    716 }
    717 
    718 void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
          // Calls _mm_maskmoveu_si128(A, B, C); expected IR: a void call to
          // @llvm.x86.sse2.maskmov.dqu with two <16 x i8> operands and an i8*.
    719   // CHECK-LABEL: test_mm_maskmoveu_si128
    720   // CHECK: call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8* %{{.*}})
    721   _mm_maskmoveu_si128(A, B, C);
    722 }
    723 
    724 __m128i test_mm_max_epi16(__m128i A, __m128i B) {
          // Calls _mm_max_epi16(A, B); expected IR: `icmp sgt` on <8 x i16>,
          // immediately followed by a select between the same two operands using
          // that comparison result.
    725   // CHECK-LABEL: test_mm_max_epi16
    726   // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
    727   // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
    728   return _mm_max_epi16(A, B);
    729 }
    730 
    731 __m128i test_mm_max_epu8(__m128i A, __m128i B) {
          // Calls _mm_max_epu8(A, B); expected IR: `icmp ugt` on <16 x i8>,
          // immediately followed by a select between the same two operands using
          // that comparison result.
    732   // CHECK-LABEL: test_mm_max_epu8
    733   // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
    734   // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
    735   return _mm_max_epu8(A, B);
    736 }
    737 
    738 __m128d test_mm_max_pd(__m128d A, __m128d B) {
          // Calls _mm_max_pd(A, B); expected IR: a call to
          // @llvm.x86.sse2.max.pd on two <2 x double> operands.
    739   // CHECK-LABEL: test_mm_max_pd
    740   // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    741   return _mm_max_pd(A, B);
    742 }
    743 
    744 __m128d test_mm_max_sd(__m128d A, __m128d B) {
          // Calls _mm_max_sd(A, B); expected IR: a call to
          // @llvm.x86.sse2.max.sd on two <2 x double> operands.
    745   // CHECK-LABEL: test_mm_max_sd
    746   // CHECK: call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    747   return _mm_max_sd(A, B);
    748 }
    749 
    750 void test_mm_mfence(void) {
          // Calls _mm_mfence(); expected IR: a void call to
          // @llvm.x86.sse2.mfence with no arguments.
          // The parameter list is spelled (void) so the definition provides a
          // real prototype instead of an old-style empty parameter list.
    751   // CHECK-LABEL: test_mm_mfence
    752   // CHECK: call void @llvm.x86.sse2.mfence()
    753   _mm_mfence();
    754 }
    755 
    756 __m128i test_mm_min_epi16(__m128i A, __m128i B) {
          // Calls _mm_min_epi16(A, B); expected IR: `icmp slt` on <8 x i16>,
          // immediately followed by a select between the same two operands using
          // that comparison result.
    757   // CHECK-LABEL: test_mm_min_epi16
    758   // CHECK:       [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
    759   // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
    760   return _mm_min_epi16(A, B);
    761 }
    762 
    763 __m128i test_mm_min_epu8(__m128i A, __m128i B) {
          // _mm_min_epu8 should be emitted as an unsigned less-than compare on
          // <16 x i8>, immediately followed by a select over the same two operands.
    764   // CHECK-LABEL: test_mm_min_epu8
    765   // CHECK:       [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
    766   // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
    767   return _mm_min_epu8(A, B);
    768 }
    769 
    770 __m128d test_mm_min_pd(__m128d A, __m128d B) {
          // _mm_min_pd should lower to a call to llvm.x86.sse2.min.pd.
    771   // CHECK-LABEL: test_mm_min_pd
    772   // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    773   return _mm_min_pd(A, B);
    774 }
    775 
    776 __m128d test_mm_min_sd(__m128d A, __m128d B) {
          // _mm_min_sd should lower to a call to llvm.x86.sse2.min.sd.
    777   // CHECK-LABEL: test_mm_min_sd
    778   // CHECK: call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
    779   return _mm_min_sd(A, B);
    780 }
    781 
    782 __m128i test_mm_move_epi64(__m128i A) {
          // _mm_move_epi64 should be emitted as a <2 x i64> shufflevector with mask <0, 2>.
    783   // CHECK-LABEL: test_mm_move_epi64
    784   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
    785   return _mm_move_epi64(A);
    786 }
    787 
    788 __m128d test_mm_move_sd(__m128d A, __m128d B) {
          // _mm_move_sd should be emitted as two extract/insert pairs: element 0 is
          // inserted into an undef <2 x double>, then element 1 is inserted into that.
    789   // CHECK-LABEL: test_mm_move_sd
    790   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    791   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    792   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
    793   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    794   return _mm_move_sd(A, B);
    795 }
    796 
    797 int test_mm_movemask_epi8(__m128i A) {
          // _mm_movemask_epi8 should lower to a call to llvm.x86.sse2.pmovmskb.128 returning i32.
    798   // CHECK-LABEL: test_mm_movemask_epi8
    799   // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
    800   return _mm_movemask_epi8(A);
    801 }
    802 
    803 int test_mm_movemask_pd(__m128d A) {
          // _mm_movemask_pd should lower to a call to llvm.x86.sse2.movmsk.pd returning i32.
    804   // CHECK-LABEL: test_mm_movemask_pd
    805   // CHECK: call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}})
    806   return _mm_movemask_pd(A);
    807 }
    808 
    809 __m128i test_mm_mul_epu32(__m128i A, __m128i B) {
          // _mm_mul_epu32 should lower to a call to llvm.x86.sse2.pmulu.dq, which takes
          // two <4 x i32> vectors and yields <2 x i64>.
    810   // CHECK-LABEL: test_mm_mul_epu32
    811   // CHECK: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
    812   return _mm_mul_epu32(A, B);
    813 }
    814 
    815 __m128d test_mm_mul_pd(__m128d A, __m128d B) {
          // _mm_mul_pd should be emitted as a generic fmul on <2 x double>.
    816   // CHECK-LABEL: test_mm_mul_pd
    817   // CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
    818   return _mm_mul_pd(A, B);
    819 }
    820 
    821 __m128d test_mm_mul_sd(__m128d A, __m128d B) {
          // _mm_mul_sd should be emitted as two element-0 extracts, a scalar fmul,
          // and an insert of a double back at element 0.
    822   // CHECK-LABEL: test_mm_mul_sd
    823   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    824   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
    825   // CHECK: fmul double
    826   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
    827   return _mm_mul_sd(A, B);
    828 }
    829 
    830 __m128i test_mm_mulhi_epi16(__m128i A, __m128i B) {
          // _mm_mulhi_epi16 should lower to a call to llvm.x86.sse2.pmulh.w.
    831   // CHECK-LABEL: test_mm_mulhi_epi16
    832   // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    833   return _mm_mulhi_epi16(A, B);
    834 }
    835 
    836 __m128i test_mm_mulhi_epu16(__m128i A, __m128i B) {
          // _mm_mulhi_epu16 should lower to a call to llvm.x86.sse2.pmulhu.w.
    837   // CHECK-LABEL: test_mm_mulhi_epu16
    838   // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    839   return _mm_mulhi_epu16(A, B);
    840 }
    841 
    842 __m128i test_mm_mullo_epi16(__m128i A, __m128i B) {
          // _mm_mullo_epi16 should be emitted as a generic mul on <8 x i16>.
    843   // CHECK-LABEL: test_mm_mullo_epi16
    844   // CHECK: mul <8 x i16> %{{.*}}, %{{.*}}
    845   return _mm_mullo_epi16(A, B);
    846 }
    847 
    848 __m128d test_mm_or_pd(__m128d A, __m128d B) {
          // _mm_or_pd should be emitted as an integer or on <4 x i32>.
    849   // CHECK-LABEL: test_mm_or_pd
    850   // CHECK: or <4 x i32> %{{.*}}, %{{.*}}
    851   return _mm_or_pd(A, B);
    852 }
    853 
    854 __m128i test_mm_or_si128(__m128i A, __m128i B) {
          // _mm_or_si128 should be emitted as an or on <2 x i64>.
    855   // CHECK-LABEL: test_mm_or_si128
    856   // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
    857   return _mm_or_si128(A, B);
    858 }
    859 
    860 __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
          // _mm_packs_epi16 should lower to a call to llvm.x86.sse2.packsswb.128
          // (two <8 x i16> inputs, <16 x i8> result).
    861   // CHECK-LABEL: test_mm_packs_epi16
    862   // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    863   return _mm_packs_epi16(A, B);
    864 }
    865 
    866 __m128i test_mm_packs_epi32(__m128i A, __m128i B) {
          // _mm_packs_epi32 should lower to a call to llvm.x86.sse2.packssdw.128
          // (two <4 x i32> inputs, <8 x i16> result).
    867   // CHECK-LABEL: test_mm_packs_epi32
    868   // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
    869   return _mm_packs_epi32(A, B);
    870 }
    871 
    872 __m128i test_mm_packus_epi16(__m128i A, __m128i B) {
          // _mm_packus_epi16 should lower to a call to llvm.x86.sse2.packuswb.128
          // (two <8 x i16> inputs, <16 x i8> result).
    873   // CHECK-LABEL: test_mm_packus_epi16
    874   // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
    875   return _mm_packus_epi16(A, B);
    876 }
    877 
    878 void test_mm_pause() {
          // _mm_pause should lower to a call to llvm.x86.sse2.pause.
          // The intrinsic is invoked as a plain statement: ISO C does not allow a
          // return statement with an expression in a function returning void.
    879   // CHECK-LABEL: test_mm_pause
    880   // CHECK: call void @llvm.x86.sse2.pause()
    881   _mm_pause();
    882 }
    883 
    884 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
          // _mm_sad_epu8 should lower to a call to llvm.x86.sse2.psad.bw
          // (two <16 x i8> inputs, <2 x i64> result).
    885   // CHECK-LABEL: test_mm_sad_epu8
    886   // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
    887   return _mm_sad_epu8(A, B);
    888 }
    889 
    890 __m128i test_mm_set_epi8(char A, char B, char C, char D,
    891                          char E, char F, char G, char H,
    892                          char I, char J, char K, char L,
    893                          char M, char N, char O, char P) {
          // _mm_set_epi8 should be emitted as a chain of sixteen insertelement
          // instructions filling lanes 0..15 of a <16 x i8>, starting from undef.
    894   // CHECK-LABEL: test_mm_set_epi8
    895   // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
    896   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
    897   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
    898   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
    899   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
    900   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
    901   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
    902   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
    903   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
    904   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
    905   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
    906   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
    907   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
    908   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
    909   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
    910   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
    911   return _mm_set_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
    912 }
    913 
    914 __m128i test_mm_set_epi16(short A, short B, short C, short D,
    915                           short E, short F, short G, short H) {
          // _mm_set_epi16 should be emitted as a chain of eight insertelement
          // instructions filling lanes 0..7 of an <8 x i16>, starting from undef.
    916   // CHECK-LABEL: test_mm_set_epi16
    917   // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
    918   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
    919   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
    920   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
    921   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
    922   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
    923   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
    924   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
    925   return _mm_set_epi16(A, B, C, D, E, F, G, H);
    926 }
    927 
    928 __m128i test_mm_set_epi32(int A, int B, int C, int D) {
          // _mm_set_epi32 should be emitted as a chain of four insertelement
          // instructions filling lanes 0..3 of a <4 x i32>, starting from undef.
    929   // CHECK-LABEL: test_mm_set_epi32
    930   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
    931   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
    932   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
    933   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
    934   return _mm_set_epi32(A, B, C, D);
    935 }
    936 
    937 __m128i test_mm_set_epi64(__m64 A, __m64 B) {
          // _mm_set_epi64 should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x i64>, starting from undef.
    938   // CHECK-LABEL: test_mm_set_epi64
    939   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
    940   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
    941   return _mm_set_epi64(A, B);
    942 }
    943 
    944 __m128i test_mm_set_epi64x(long long A, long long B) {
          // _mm_set_epi64x should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x i64>, starting from undef.
    945   // CHECK-LABEL: test_mm_set_epi64x
    946   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
    947   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
    948   return _mm_set_epi64x(A, B);
    949 }
    950 
    951 __m128d test_mm_set_pd(double A, double B) {
          // _mm_set_pd should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x double>, starting from undef.
    952   // CHECK-LABEL: test_mm_set_pd
    953   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    954   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
    955   return _mm_set_pd(A, B);
    956 }
    957 
    958 __m128d test_mm_set_sd(double A) {
          // _mm_set_sd should insert a double value at lane 0 of an undef
          // <2 x double> and the constant 0.0 at lane 1.
    959   // CHECK-LABEL: test_mm_set_sd
    960   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
    961   // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
    962   return _mm_set_sd(A);
    963 }
    964 
    965 __m128i test_mm_set1_epi8(char A) {
          // _mm_set1_epi8 should be emitted as a chain of sixteen insertelement
          // instructions filling lanes 0..15 of a <16 x i8>, starting from undef.
    966   // CHECK-LABEL: test_mm_set1_epi8
    967   // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
    968   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
    969   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
    970   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
    971   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
    972   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
    973   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
    974   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
    975   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
    976   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
    977   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
    978   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
    979   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
    980   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
    981   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
    982   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
    983   return _mm_set1_epi8(A);
    984 }
    985 
    986 __m128i test_mm_set1_epi16(short A) {
          // _mm_set1_epi16 should be emitted as a chain of eight insertelement
          // instructions filling lanes 0..7 of an <8 x i16>, starting from undef.
    987   // CHECK-LABEL: test_mm_set1_epi16
    988   // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
    989   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
    990   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
    991   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
    992   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
    993   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
    994   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
    995   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
    996   return _mm_set1_epi16(A);
    997 }
    998 
    999 __m128i test_mm_set1_epi32(int A) {
          // _mm_set1_epi32 should be emitted as a chain of four insertelement
          // instructions filling lanes 0..3 of a <4 x i32>, starting from undef.
   1000   // CHECK-LABEL: test_mm_set1_epi32
   1001   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
   1002   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
   1003   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
   1004   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
   1005   return _mm_set1_epi32(A);
   1006 }
   1007 
   1008 __m128i test_mm_set1_epi64(__m64 A) {
          // _mm_set1_epi64 should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x i64>, starting from undef.
   1009   // CHECK-LABEL: test_mm_set1_epi64
   1010   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
   1011   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   1012   return _mm_set1_epi64(A);
   1013 }
   1014 
   1015 __m128i test_mm_set1_epi64x(long long A) {
          // _mm_set1_epi64x should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x i64>, starting from undef.
   1016   // CHECK-LABEL: test_mm_set1_epi64x
   1017   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
   1018   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   1019   return _mm_set1_epi64x(A);
   1020 }
   1021 
   1022 __m128d test_mm_set1_pd(double A) {
          // _mm_set1_pd should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x double>, starting from undef.
   1023   // CHECK-LABEL: test_mm_set1_pd
   1024   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
   1025   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   1026   return _mm_set1_pd(A);
   1027 }
   1028 
   1029 __m128i test_mm_setr_epi8(char A, char B, char C, char D,
   1030                           char E, char F, char G, char H,
   1031                           char I, char J, char K, char L,
   1032                           char M, char N, char O, char P) {
          // _mm_setr_epi8 should be emitted as a chain of sixteen insertelement
          // instructions filling lanes 0..15 of a <16 x i8>, starting from undef.
   1033   // CHECK-LABEL: test_mm_setr_epi8
   1034   // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
   1035   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
   1036   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
   1037   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
   1038   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
   1039   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
   1040   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
   1041   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
   1042   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
   1043   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
   1044   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
   1045   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
   1046   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
   1047   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
   1048   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
   1049   // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
   1050   return _mm_setr_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
   1051 }
   1052 
   1053 __m128i test_mm_setr_epi16(short A, short B, short C, short D,
   1054                            short E, short F, short G, short H) {
          // _mm_setr_epi16 should be emitted as a chain of eight insertelement
          // instructions filling lanes 0..7 of an <8 x i16>, starting from undef.
   1055   // CHECK-LABEL: test_mm_setr_epi16
   1056   // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
   1057   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
   1058   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
   1059   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
   1060   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
   1061   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
   1062   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
   1063   // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
   1064   return _mm_setr_epi16(A, B, C, D, E, F, G, H);
   1065 }
   1066 
   1067 __m128i test_mm_setr_epi32(int A, int B, int C, int D) {
          // _mm_setr_epi32 should be emitted as a chain of four insertelement
          // instructions filling lanes 0..3 of a <4 x i32>, starting from undef.
   1068   // CHECK-LABEL: test_mm_setr_epi32
   1069   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
   1070   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
   1071   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
   1072   // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
   1073   return _mm_setr_epi32(A, B, C, D);
   1074 }
   1075 
   1076 __m128i test_mm_setr_epi64(__m64 A, __m64 B) {
          // _mm_setr_epi64 should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x i64>, starting from undef.
   1077   // CHECK-LABEL: test_mm_setr_epi64
   1078   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
   1079   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   1080   return _mm_setr_epi64(A, B);
   1081 }
   1082 
   1083 __m128d test_mm_setr_pd(double A, double B) {
          // _mm_setr_pd should be emitted as two insertelement instructions
          // filling lanes 0 and 1 of a <2 x double>, starting from undef.
   1084   // CHECK-LABEL: test_mm_setr_pd
   1085   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
   1086   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   1087   return _mm_setr_pd(A, B);
   1088 }
   1089 
   1090 __m128d test_mm_setzero_pd() {
          // _mm_setzero_pd should be emitted as a store of a zeroinitializer <2 x double>.
   1091   // CHECK-LABEL: test_mm_setzero_pd
   1092   // CHECK: store <2 x double> zeroinitializer
   1093   return _mm_setzero_pd();
   1094 }
   1095 
   1096 __m128i test_mm_setzero_si128() {
          // _mm_setzero_si128 should be emitted as a store of a zeroinitializer <2 x i64>.
   1097   // CHECK-LABEL: test_mm_setzero_si128
   1098   // CHECK: store <2 x i64> zeroinitializer
   1099   return _mm_setzero_si128();
   1100 }
   1101 
   1102 __m128i test_mm_shuffle_epi32(__m128i A) {
          // With an immediate of 0, _mm_shuffle_epi32 should be emitted as a
          // <4 x i32> shufflevector whose mask is all zeros.
   1103   // CHECK-LABEL: test_mm_shuffle_epi32
   1104   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
   1105   return _mm_shuffle_epi32(A, 0);
   1106 }
   1107 
   1108 __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
          // With an immediate of 1, _mm_shuffle_pd should be emitted as a
          // <2 x double> shufflevector with mask <1, 2>.
   1109   // CHECK-LABEL: test_mm_shuffle_pd
   1110   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
   1111   return _mm_shuffle_pd(A, B, 1);
   1112 }
   1113 
   1114 __m128i test_mm_shufflehi_epi16(__m128i A) {
          // With an immediate of 0, _mm_shufflehi_epi16 should be emitted as an
          // <8 x i16> shufflevector that keeps lanes 0..3 and uses index 4 for lanes 4..7.
   1115   // CHECK-LABEL: test_mm_shufflehi_epi16
   1116   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
   1117   return _mm_shufflehi_epi16(A, 0);
   1118 }
   1119 
   1120 __m128i test_mm_shufflelo_epi16(__m128i A) {
          // With an immediate of 0, _mm_shufflelo_epi16 should be emitted as an
          // <8 x i16> shufflevector that uses index 0 for lanes 0..3 and keeps lanes 4..7.
   1121   // CHECK-LABEL: test_mm_shufflelo_epi16
   1122   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   1123   return _mm_shufflelo_epi16(A, 0);
   1124 }
   1125 
   1126 __m128i test_mm_sll_epi16(__m128i A, __m128i B) {
          // _mm_sll_epi16 should lower to a call to llvm.x86.sse2.psll.w.
   1127   // CHECK-LABEL: test_mm_sll_epi16
   1128   // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   1129   return _mm_sll_epi16(A, B);
   1130 }
   1131 
   1132 __m128i test_mm_sll_epi32(__m128i A, __m128i B) {
          // _mm_sll_epi32 should lower to a call to llvm.x86.sse2.psll.d.
   1133   // CHECK-LABEL: test_mm_sll_epi32
   1134   // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   1135   return _mm_sll_epi32(A, B);
   1136 }
   1137 
   1138 __m128i test_mm_sll_epi64(__m128i A, __m128i B) {
          // _mm_sll_epi64 should lower to a call to llvm.x86.sse2.psll.q.
   1139   // CHECK-LABEL: test_mm_sll_epi64
   1140   // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   1141   return _mm_sll_epi64(A, B);
   1142 }
   1143 
   1144 __m128i test_mm_slli_epi16(__m128i A) {
          // _mm_slli_epi16 should lower to a call to llvm.x86.sse2.pslli.w with an i32 count.
   1145   // CHECK-LABEL: test_mm_slli_epi16
   1146   // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   1147   return _mm_slli_epi16(A, 1);
   1148 }
   1149 
   1150 __m128i test_mm_slli_epi32(__m128i A) {
          // _mm_slli_epi32 should lower to a call to llvm.x86.sse2.pslli.d with an i32 count.
   1151   // CHECK-LABEL: test_mm_slli_epi32
   1152   // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   1153   return _mm_slli_epi32(A, 1);
   1154 }
   1155 
   1156 __m128i test_mm_slli_epi64(__m128i A) {
          // _mm_slli_epi64 should lower to a call to llvm.x86.sse2.pslli.q with an i32 count.
   1157   // CHECK-LABEL: test_mm_slli_epi64
   1158   // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   1159   return _mm_slli_epi64(A, 1);
   1160 }
   1161 
   1162 __m128i test_mm_slli_si128(__m128i A) {
          // With a byte count of 5, _mm_slli_si128 should be emitted as a
          // <16 x i8> shufflevector whose mask runs from 11 to 26.
   1163   // CHECK-LABEL: test_mm_slli_si128
   1164   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
   1165   return _mm_slli_si128(A, 5);
   1166 }
   1167 
   1168 __m128i test_mm_slli_si128_2(__m128i A) {
          // With a byte count of 17 (more than the 16 lanes), the expected shuffle mask
          // is 0..15, i.e. every lane is taken from the first shufflevector operand.
   1169   // CHECK-LABEL: test_mm_slli_si128_2
   1170   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1171   return _mm_slli_si128(A, 17);
   1172 }
   1173 
   1174 __m128d test_mm_sqrt_pd(__m128d A) {
          // _mm_sqrt_pd should lower to a call to llvm.x86.sse2.sqrt.pd.
   1175   // CHECK-LABEL: test_mm_sqrt_pd
   1176   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
   1177   return _mm_sqrt_pd(A);
   1178 }
   1179 
   1180 __m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
          // _mm_sqrt_sd should be emitted as a call to llvm.x86.sse2.sqrt.sd followed by
          // two extract/insert pairs that build the result: element 0 into undef, then element 1.
   1181   // CHECK-LABEL: test_mm_sqrt_sd
   1182   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %{{.*}})
   1183   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   1184   // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
   1185   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
   1186   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   1187   return _mm_sqrt_sd(A, B);
   1188 }
   1189 
   1190 __m128i test_mm_sra_epi16(__m128i A, __m128i B) {
          // _mm_sra_epi16 should lower to a call to llvm.x86.sse2.psra.w.
   1191   // CHECK-LABEL: test_mm_sra_epi16
   1192   // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   1193   return _mm_sra_epi16(A, B);
   1194 }
   1195 
   1196 __m128i test_mm_sra_epi32(__m128i A, __m128i B) {
          // _mm_sra_epi32 should lower to a call to llvm.x86.sse2.psra.d.
   1197   // CHECK-LABEL: test_mm_sra_epi32
   1198   // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   1199   return _mm_sra_epi32(A, B);
   1200 }
   1201 
   1202 __m128i test_mm_srai_epi16(__m128i A) {
          // _mm_srai_epi16 should lower to a call to llvm.x86.sse2.psrai.w with an i32 count.
   1203   // CHECK-LABEL: test_mm_srai_epi16
   1204   // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   1205   return _mm_srai_epi16(A, 1);
   1206 }
   1207 
   1208 __m128i test_mm_srai_epi32(__m128i A) {
          // _mm_srai_epi32 should lower to a call to llvm.x86.sse2.psrai.d with an i32 count.
   1209   // CHECK-LABEL: test_mm_srai_epi32
   1210   // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   1211   return _mm_srai_epi32(A, 1);
   1212 }
   1213 
   1214 __m128i test_mm_srl_epi16(__m128i A, __m128i B) {
          // _mm_srl_epi16 should lower to a call to llvm.x86.sse2.psrl.w.
   1215   // CHECK-LABEL: test_mm_srl_epi16
   1216   // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   1217   return _mm_srl_epi16(A, B);
   1218 }
   1219 
   1220 __m128i test_mm_srl_epi32(__m128i A, __m128i B) {
          // _mm_srl_epi32 should lower to a call to llvm.x86.sse2.psrl.d.
   1221   // CHECK-LABEL: test_mm_srl_epi32
   1222   // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   1223   return _mm_srl_epi32(A, B);
   1224 }
   1225 
   1226 __m128i test_mm_srl_epi64(__m128i A, __m128i B) {
          // _mm_srl_epi64 should lower to a call to llvm.x86.sse2.psrl.q.
   1227   // CHECK-LABEL: test_mm_srl_epi64
   1228   // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   1229   return _mm_srl_epi64(A, B);
   1230 }
   1231 
   1232 __m128i test_mm_srli_epi16(__m128i A) {
          // _mm_srli_epi16 should lower to a call to llvm.x86.sse2.psrli.w with an i32 count.
   1233   // CHECK-LABEL: test_mm_srli_epi16
   1234   // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   1235   return _mm_srli_epi16(A, 1);
   1236 }
   1237 
   1238 __m128i test_mm_srli_epi32(__m128i A) {
          // _mm_srli_epi32 should lower to a call to llvm.x86.sse2.psrli.d with an i32 count.
   1239   // CHECK-LABEL: test_mm_srli_epi32
   1240   // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   1241   return _mm_srli_epi32(A, 1);
   1242 }
   1243 
   1244 __m128i test_mm_srli_epi64(__m128i A) {
          // _mm_srli_epi64 should lower to a call to llvm.x86.sse2.psrli.q with an i32 count.
   1245   // CHECK-LABEL: test_mm_srli_epi64
   1246   // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   1247   return _mm_srli_epi64(A, 1);
   1248 }
   1249 
   1250 __m128i test_mm_srli_si128(__m128i A) {
          // With a byte count of 5, _mm_srli_si128 should be emitted as a
          // <16 x i8> shufflevector whose mask runs from 5 to 20.
   1251   // CHECK-LABEL: test_mm_srli_si128
   1252   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
   1253   return _mm_srli_si128(A, 5);
   1254 }
   1255 
   1256 __m128i test_mm_srli_si128_2(__m128i A) {
          // With a byte count of 17 (more than the 16 lanes), the expected shuffle mask
          // is 16..31, i.e. every lane is taken from the second shufflevector operand.
   1257   // CHECK-LABEL: test_mm_srli_si128_2
   1258   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   1259   return _mm_srli_si128(A, 17);
   1260 }
   1261 
   1262 void test_mm_store_pd(double* A, __m128d B) {
          // _mm_store_pd should be emitted as a 16-byte-aligned store of <2 x double>.
   1263   // CHECK-LABEL: test_mm_store_pd
   1264   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
   1265   _mm_store_pd(A, B);
   1266 }
   1267 
   1268 void test_mm_store_pd1(double* x, __m128d y) {
          // _mm_store_pd1 should be emitted as a <2 x double> shufflevector with an
          // all-zero mask followed by a 16-byte-aligned vector store.
   1269   // CHECK-LABEL: test_mm_store_pd1
   1270   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
   1271   // CHECK: store <2 x double> %{{.*}}, <2 x double>* {{.*}}, align 16
   1272   _mm_store_pd1(x, y);
   1273 }
   1274 
   1275 void test_mm_store_sd(double* A, __m128d B) {
          // _mm_store_sd should extract element 0 and store it as a double with
          // alignment 1; the pattern is anchored at end of line so "align 16" cannot match.
   1276   // CHECK-LABEL: test_mm_store_sd
   1277   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   1278   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   1279   _mm_store_sd(A, B);
   1280 }
   1281 
   1282 void test_mm_store_si128(__m128i* A, __m128i B) {
          // _mm_store_si128 should be emitted as a 16-byte-aligned store of <2 x i64>.
   1283   // CHECK-LABEL: test_mm_store_si128
   1284   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16
   1285   _mm_store_si128(A, B);
   1286 }
   1287 
   1288 void test_mm_store1_pd(double* x, __m128d y) {
          // _mm_store1_pd should be emitted as a <2 x double> shufflevector with an
          // all-zero mask followed by a 16-byte-aligned vector store.
   1289   // CHECK-LABEL: test_mm_store1_pd
   1290   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
   1291   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
   1292   _mm_store1_pd(x, y);
   1293 }
   1294 
   1295 void test_mm_storeh_pd(double* A, __m128d B) {
          // _mm_storeh_pd should extract element 1 and store it as a double with
          // alignment 1 (pattern anchored at end of line).
   1296   // CHECK-LABEL: test_mm_storeh_pd
   1297   // CHECK: extractelement <2 x double> %{{.*}}, i32 1
   1298   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   1299   _mm_storeh_pd(A, B);
   1300 }
   1301 
   1302 void test_mm_storel_epi64(__m128i x, void* y) {
          // _mm_storel_epi64 should extract element 0 of a <2 x i64> and store through
          // an i64* with alignment 1 (pattern anchored at end of line).
          // Note the wrapper takes (value, pointer) while the intrinsic takes (pointer, value).
   1303   // CHECK-LABEL: test_mm_storel_epi64
   1304   // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
   1305   // CHECK: store {{.*}} i64* {{.*}}, align 1{{$}}
   1306   _mm_storel_epi64(y, x);
   1307 }
   1308 
   1309 void test_mm_storel_pd(double* A, __m128d B) {
          // _mm_storel_pd should extract element 0 and store it as a double with
          // alignment 1 (pattern anchored at end of line).
   1310   // CHECK-LABEL: test_mm_storel_pd
   1311   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   1312   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   1313   _mm_storel_pd(A, B);
   1314 }
   1315 
   1316 void test_mm_storer_pd(__m128d A, double* B) {
          // _mm_storer_pd should be emitted as a <2 x double> shufflevector with mask
          // <1, 0> followed by a 16-byte-aligned vector store.
          // Note the wrapper takes (value, pointer) while the intrinsic takes (pointer, value).
   1317   // CHECK-LABEL: test_mm_storer_pd
   1318   // CHECK: shufflevector <2 x double> {{.*}}, <2 x double> {{.*}}, <2 x i32> <i32 1, i32 0>
   1319   // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}}
   1320   _mm_storer_pd(B, A);
   1321 }
   1322 
   1323 void test_mm_storeu_pd(double* A, __m128d B) {
          // _mm_storeu_pd should be emitted as a store through a <2 x double>* with
          // alignment 1, with "ret void" as the very next line of the output.
   1324   // CHECK-LABEL: test_mm_storeu_pd
   1325   // CHECK: store {{.*}} <2 x double>* {{.*}}, align 1{{$}}
   1326   // CHECK-NEXT: ret void
   1327   _mm_storeu_pd(A, B);
   1328 }
   1329 
   1330 void test_mm_storeu_si128(__m128i* A, __m128i B) {
          // _mm_storeu_si128 should be emitted as a <2 x i64> store with alignment 1,
          // with "ret void" as the very next line of the output.
   1331   // CHECK-LABEL: test_mm_storeu_si128
   1332   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
   1333   // CHECK-NEXT: ret void
   1334   _mm_storeu_si128(A, B);
   1335 }
   1336 
   1337 void test_mm_stream_pd(double *A, __m128d B) {
          // _mm_stream_pd should be emitted as a 16-byte-aligned <2 x double> store
          // carrying !nontemporal metadata.
   1338   // CHECK-LABEL: test_mm_stream_pd
   1339   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16, !nontemporal
   1340   _mm_stream_pd(A, B);
   1341 }
   1342 
   1343 void test_mm_stream_si32(int *A, int B) {
          // _mm_stream_si32 should be emitted as an i32 store with alignment 1
          // carrying !nontemporal metadata.
   1344   // CHECK-LABEL: test_mm_stream_si32
   1345   // CHECK: store i32 %{{.*}}, i32* %{{.*}}, align 1, !nontemporal
   1346   _mm_stream_si32(A, B);
   1347 }
   1348 
   1349 void test_mm_stream_si64(long long *A, long long B) {
          // _mm_stream_si64 should be emitted as an i64 store with alignment 1
          // carrying !nontemporal metadata.
   1350   // CHECK-LABEL: test_mm_stream_si64
   1351   // CHECK: store i64 %{{.*}}, i64* %{{.*}}, align 1, !nontemporal
   1352   _mm_stream_si64(A, B);
   1353 }
   1354 
   1355 void test_mm_stream_si128(__m128i *A, __m128i B) {
          // _mm_stream_si128 should be emitted as a 16-byte-aligned <2 x i64> store
          // carrying !nontemporal metadata.
   1356   // CHECK-LABEL: test_mm_stream_si128
   1357   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16, !nontemporal
   1358   _mm_stream_si128(A, B);
   1359 }
   1360 
   1361 __m128i test_mm_sub_epi8(__m128i A, __m128i B) {
          // _mm_sub_epi8 should be emitted as a generic sub on <16 x i8>.
   1362   // CHECK-LABEL: test_mm_sub_epi8
   1363   // CHECK: sub <16 x i8>
   1364   return _mm_sub_epi8(A, B);
   1365 }
   1366 
   1367 __m128i test_mm_sub_epi16(__m128i A, __m128i B) {
          // _mm_sub_epi16 should be emitted as a generic sub on <8 x i16>.
   1368   // CHECK-LABEL: test_mm_sub_epi16
   1369   // CHECK: sub <8 x i16>
   1370   return _mm_sub_epi16(A, B);
   1371 }
   1372 
   1373 __m128i test_mm_sub_epi32(__m128i A, __m128i B) {
          // _mm_sub_epi32 should be emitted as a generic sub on <4 x i32>.
   1374   // CHECK-LABEL: test_mm_sub_epi32
   1375   // CHECK: sub <4 x i32>
   1376   return _mm_sub_epi32(A, B);
   1377 }
   1378 
   1379 __m128i test_mm_sub_epi64(__m128i A, __m128i B) {
          // _mm_sub_epi64 should be emitted as a generic sub on <2 x i64>.
   1380   // CHECK-LABEL: test_mm_sub_epi64
   1381   // CHECK: sub <2 x i64>
   1382   return _mm_sub_epi64(A, B);
   1383 }
   1384 
   1385 __m128d test_mm_sub_pd(__m128d A, __m128d B) {
          // _mm_sub_pd should be emitted as a generic fsub on <2 x double>.
   1386   // CHECK-LABEL: test_mm_sub_pd
   1387   // CHECK: fsub <2 x double>
   1388   return _mm_sub_pd(A, B);
   1389 }
   1390 
   1391 __m128d test_mm_sub_sd(__m128d A, __m128d B) {
          // _mm_sub_sd should be emitted as two element-0 extracts, a scalar fsub,
          // and an insert of a double back at element 0.
   1392   // CHECK-LABEL: test_mm_sub_sd
   1393   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   1394   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   1395   // CHECK: fsub double
   1396   // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   1397   return _mm_sub_sd(A, B);
   1398 }
   1399 
   1400 __m128i test_mm_subs_epi8(__m128i A, __m128i B) {
          // _mm_subs_epi8 should lower to a call to llvm.x86.sse2.psubs.b.
   1401   // CHECK-LABEL: test_mm_subs_epi8
   1402   // CHECK: call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   1403   return _mm_subs_epi8(A, B);
   1404 }
   1405 
   1406 __m128i test_mm_subs_epi16(__m128i A, __m128i B) {
          // _mm_subs_epi16 should lower to a call to llvm.x86.sse2.psubs.w.
   1407   // CHECK-LABEL: test_mm_subs_epi16
   1408   // CHECK: call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   1409   return _mm_subs_epi16(A, B);
   1410 }
   1411 
   1412 __m128i test_mm_subs_epu8(__m128i A, __m128i B) {
          // _mm_subs_epu8 should lower to a call to llvm.x86.sse2.psubus.b.
   1413   // CHECK-LABEL: test_mm_subs_epu8
   1414   // CHECK: call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   1415   return _mm_subs_epu8(A, B);
   1416 }
   1417 
   1418 __m128i test_mm_subs_epu16(__m128i A, __m128i B) {
          // _mm_subs_epu16 should lower to a call to llvm.x86.sse2.psubus.w.
   1419   // CHECK-LABEL: test_mm_subs_epu16
   1420   // CHECK: call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   1421   return _mm_subs_epu16(A, B);
   1422 }
   1423 
   1424 int test_mm_ucomieq_sd(__m128d A, __m128d B) {
          // _mm_ucomieq_sd should lower to a call to llvm.x86.sse2.ucomieq.sd returning i32.
   1425   // CHECK-LABEL: test_mm_ucomieq_sd
   1426   // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   1427   return _mm_ucomieq_sd(A, B);
   1428 }
   1429 
   1430 int test_mm_ucomige_sd(__m128d A, __m128d B) {
          // _mm_ucomige_sd should lower to a call to llvm.x86.sse2.ucomige.sd returning i32.
   1431   // CHECK-LABEL: test_mm_ucomige_sd
   1432   // CHECK: call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   1433   return _mm_ucomige_sd(A, B);
   1434 }
   1435 
   1436 int test_mm_ucomigt_sd(__m128d A, __m128d B) {
          // _mm_ucomigt_sd should lower to a call to llvm.x86.sse2.ucomigt.sd returning i32.
   1437   // CHECK-LABEL: test_mm_ucomigt_sd
   1438   // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   1439   return _mm_ucomigt_sd(A, B);
   1440 }
   1441 
   1442 int test_mm_ucomile_sd(__m128d A, __m128d B) {
          // _mm_ucomile_sd should lower to a call to llvm.x86.sse2.ucomile.sd returning i32.
   1443   // CHECK-LABEL: test_mm_ucomile_sd
   1444   // CHECK: call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   1445   return _mm_ucomile_sd(A, B);
   1446 }
   1447 
   1448 int test_mm_ucomilt_sd(__m128d A, __m128d B) {
          // _mm_ucomilt_sd should lower to a call to llvm.x86.sse2.ucomilt.sd returning i32.
   1449   // CHECK-LABEL: test_mm_ucomilt_sd
   1450   // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   1451   return _mm_ucomilt_sd(A, B);
   1452 }
   1453 
   1454 int test_mm_ucomineq_sd(__m128d A, __m128d B) {
          // _mm_ucomineq_sd should lower to a call to llvm.x86.sse2.ucomineq.sd returning i32.
   1455   // CHECK-LABEL: test_mm_ucomineq_sd
   1456   // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   1457   return _mm_ucomineq_sd(A, B);
   1458 }
   1459 
   1460 __m128d test_mm_undefined_pd() {
          // _mm_undefined_pd should make the function return an undef <2 x double> directly.
   1461   // CHECK-LABEL: @test_mm_undefined_pd
   1462   // CHECK: ret <2 x double> undef
   1463   return _mm_undefined_pd();
   1464 }
   1465 
   1466 __m128i test_mm_undefined_si128() {
          // _mm_undefined_si128 should make the function return an undef <2 x i64> directly.
   1467   // CHECK-LABEL: @test_mm_undefined_si128
   1468   // CHECK: ret <2 x i64> undef
   1469   return _mm_undefined_si128();
   1470 }
   1471 
   1472 __m128i test_mm_unpackhi_epi8(__m128i A, __m128i B) {
          // _mm_unpackhi_epi8 should be emitted as a <16 x i8> shufflevector whose mask
          // alternates indices 8..15 with indices 24..31.
   1473   // CHECK-LABEL: test_mm_unpackhi_epi8
   1474   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   1475   return _mm_unpackhi_epi8(A, B);
   1476 }
   1477 
   1478 __m128i test_mm_unpackhi_epi16(__m128i A, __m128i B) {
   1479   // CHECK-LABEL: test_mm_unpackhi_epi16
   1480   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   1481   return _mm_unpackhi_epi16(A, B);
   1482 }
   1483 
   1484 __m128i test_mm_unpackhi_epi32(__m128i A, __m128i B) {
   1485   // CHECK-LABEL: test_mm_unpackhi_epi32
   1486   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   1487   return _mm_unpackhi_epi32(A, B);
   1488 }
   1489 
   1490 __m128i test_mm_unpackhi_epi64(__m128i A, __m128i B) {
   1491   // CHECK-LABEL: test_mm_unpackhi_epi64
   1492   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
   1493   return _mm_unpackhi_epi64(A, B);
   1494 }
   1495 
   1496 __m128d test_mm_unpackhi_pd(__m128d A, __m128d B) {
   1497   // CHECK-LABEL: test_mm_unpackhi_pd
   1498   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
   1499   return _mm_unpackhi_pd(A, B);
   1500 }
   1501 
   1502 __m128i test_mm_unpacklo_epi8(__m128i A, __m128i B) {
   1503   // CHECK-LABEL: test_mm_unpacklo_epi8
   1504   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   1505   return _mm_unpacklo_epi8(A, B);
   1506 }
   1507 
   1508 __m128i test_mm_unpacklo_epi16(__m128i A, __m128i B) {
   1509   // CHECK-LABEL: test_mm_unpacklo_epi16
   1510   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   1511   return _mm_unpacklo_epi16(A, B);
   1512 }
   1513 
   1514 __m128i test_mm_unpacklo_epi32(__m128i A, __m128i B) {
   1515   // CHECK-LABEL: test_mm_unpacklo_epi32
   1516   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   1517   return _mm_unpacklo_epi32(A, B);
   1518 }
   1519 
   1520 __m128i test_mm_unpacklo_epi64(__m128i A, __m128i B) {
   1521   // CHECK-LABEL: test_mm_unpacklo_epi64
   1522   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
   1523   return _mm_unpacklo_epi64(A, B);
   1524 }
   1525 
   1526 __m128d test_mm_unpacklo_pd(__m128d A, __m128d B) {
   1527   // CHECK-LABEL: test_mm_unpacklo_pd
   1528   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
   1529   return _mm_unpacklo_pd(A, B);
   1530 }
   1531 
   1532 __m128d test_mm_xor_pd(__m128d A, __m128d B) {
   1533   // CHECK-LABEL: test_mm_xor_pd
   1534   // CHECK: xor <4 x i32> %{{.*}}, %{{.*}}
   1535   return _mm_xor_pd(A, B);
   1536 }
   1537 
   1538 __m128i test_mm_xor_si128(__m128i A, __m128i B) {
   1539   // CHECK-LABEL: test_mm_xor_si128
   1540   // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   1541   return _mm_xor_si128(A, B);
   1542 }
   1543