; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i16 @test1(float %f) {
entry:
; CHECK-LABEL: @test1(
; CHECK: fmul float
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
; CHECK: ret
  %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
  ret i16 %tmp69
}
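
; Illustrative sketch of the expected fold (not checked by FileCheck): only
; element 0 of %tmp59 reaches @llvm.x86.sse.cvttss2si, so demanded-elements
; analysis can drop the zero-fill insertelements and turn the sub/mul *.ss
; intrinsics into scalar FP ops, roughly:
;   %sub = fsub float %f, 1.000000e+00
;   %mul = fmul float %sub, 5.000000e-01
; The min/max *.ss calls keep their intrinsic form (their NaN semantics
; differ from a plain fcmp/select), which is why the CHECKs above only rule
; out the mul/sub calls.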

define i32 @test2(float %f) {
; CHECK-LABEL: @test2(
; CHECK-NOT: insertelement
; CHECK-NOT: extractelement
; CHECK: ret
  %tmp5 = fmul float %f, %f
  %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
  %tmp21 = extractelement <4 x i32> %tmp19, i32 0
  ret i32 %tmp21
}
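
; Sketch of the expected fold: lane 0 of the <4 x i32> bitcast is just the
; squared float reinterpreted as an integer, so the insert/extract pair
; should collapse to something like:
;   %tmp21 = bitcast float %tmp5 to i32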

define i64 @test3(float %f, double %d) {
; CHECK-LABEL: @test3(
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK: ret
entry:
  %v00 = insertelement <4 x float> undef, float %f, i32 0
  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
  %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
  %v10 = insertelement <4 x float> undef, float %f, i32 0
  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
  %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
  %v20 = insertelement <4 x float> undef, float %f, i32 0
  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
  %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
  %v30 = insertelement <4 x float> undef, float %f, i32 0
  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
  %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
  %v40 = insertelement <2 x double> undef, double %d, i32 0
  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
  %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
  %v50 = insertelement <2 x double> undef, double %d, i32 0
  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
  %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
  %v60 = insertelement <2 x double> undef, double %d, i32 0
  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
  %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
  %v70 = insertelement <2 x double> undef, double %d, i32 0
  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
  %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
  %tmp8 = add i32 %tmp0, %tmp2
  %tmp9 = add i32 %tmp4, %tmp6
  %tmp10 = add i32 %tmp8, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp1, %tmp3
  %tmp13 = add i64 %tmp5, %tmp7
  %tmp14 = add i64 %tmp12, %tmp13
  %tmp15 = add i64 %tmp11, %tmp14
  ret i64 %tmp15
}
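
; All eight cvt(t)s{s,d}2si(64) intrinsics above read only element 0 of
; their operand, so every insertelement of 0.0 into the upper lanes is dead;
; the CHECK-NOT above verifies that instcombine removes them while keeping
; the single insert of the scalar argument into lane 0.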

define void @get_image() nounwind {
; CHECK-LABEL: @get_image(
; CHECK-NOT: extractelement
; CHECK: unreachable
entry:
  %0 = call i32 @fgetc(i8* null) nounwind ; <i32> [#uses=1]
  %1 = trunc i32 %0 to i8 ; <i8> [#uses=1]
  %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1 ; <<100 x i8>> [#uses=1]
  %tmp1 = extractelement <100 x i8> %tmp2, i32 0 ; <i8> [#uses=1]
  %2 = icmp eq i8 %tmp1, 80 ; <i1> [#uses=1]
  br i1 %2, label %bb2, label %bb3

bb2: ; preds = %entry
  br label %bb3

bb3: ; preds = %bb2, %entry
  unreachable
}
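
; Sketch of the reasoning: the extract reads lane 0, but the insert wrote
; lane 1, so the extract looks through to the zeroinitializer base vector:
;   %tmp1 = extractelement <100 x i8> zeroinitializer, i32 0  ; folds to 0
; icmp eq i8 0, 80 is false, the branch becomes unconditional, and the
; function reduces to the lone unreachable that the CHECK lines expect.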

; PR4340
define void @vac(<4 x float>* nocapture %a) nounwind {
; CHECK-LABEL: @vac(
; CHECK-NOT: load
; CHECK: ret
entry:
  %tmp1 = load <4 x float>, <4 x float>* %a ; <<4 x float>> [#uses=1]
  %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0 ; <<4 x float>> [#uses=1]
  %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  store <4 x float> %vecins8, <4 x float>* %a
  ret void
}
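
; Every lane of the loaded vector is overwritten before the store, so the
; load is dead and the body should reduce to a single
;   store <4 x float> zeroinitializer, <4 x float>* %a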

declare i32 @fgetc(i8*)

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)

; <rdar://problem/6945110>
define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
entry:
  %tmp = load <4 x i16>, <4 x i16>* %src
  %tmp1 = load <8 x i16>, <8 x i16>* %foo
; CHECK: %tmp2 = shufflevector
  %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; pmovzxwd ignores the upper 64 bits of its input, so -instcombine should remove this shuffle:
; CHECK-NOT: shufflevector
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: pmovzxwd
  %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
  ret <4 x i32> %0
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone

define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
entry:
; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle9.i
}
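
; Only lanes 0 and 1 of %shuffle.i survive into %shuffle9.i, so the lane
; duplication is dead and the first shuffle's mask can use undef for the
; unused positions, as the CHECK line above expects.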

define <2 x float> @test_fptrunc(double %f) {
; CHECK-LABEL: @test_fptrunc(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x double> undef, double %f, i32 0
  %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
  %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
  %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %ret
}

define <2 x double> @test_fpext(float %f) {
; CHECK-LABEL: @test_fpext(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x float> undef, float %f, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
  %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %ret
}
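
; In both conversion tests above, the trailing shufflevector keeps only
; elements 0 and 1, so just two insertelements (the scalar into lane 0 and
; one zero into lane 1) are demanded; the inserts into lanes 2 and 3 should
; be deleted, matching the insertelement/insertelement/CHECK-NOT pattern.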

define <4 x float> @test_select(float %f, float %g) {
; CHECK-LABEL: @test_select(
; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
; CHECK-NOT: insertelement
; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
; CHECK-NOT: insertelement
; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
  %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
  %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
  %b0 = insertelement <4 x float> undef, float %g, i32 0
  %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
  %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
  %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
  %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
  ret <4 x float> %ret
}
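
; Sketch: the constant condition demands lanes {0,3} from %a3 and lanes
; {1,2} from %b3. The inserts into %a's undemanded lanes fold away (so %a3
; is rebuilt directly on top of %a0), and %b3's demanded lanes are the
; constants 4.0 and 5.0, so the whole second operand becomes the constant
; vector <undef, 4.0, 5.0, undef> shown in the CHECK above.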

; These two redundant insertqi calls should be combined into a single one.
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}
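
; For reference (a sketch of the SSE4a semantics these tests assume):
; insertqi(%v, %i, len, idx) writes the low `len` bits of %i into bits
; [idx, idx+len) of the low 64-bit element of %v, and leaves the upper
; 64 bits of the result undefined. In @testInsertTwice the second call
; rewrites exactly the bits the first one wrote, so one call suffices.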

; The result of this insert is just the second argument: the top 64 bits of
; the result are undefined, and the bottom 64 bits are copied verbatim from
; the second argument.
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}

; Test the various range overlaps and orderings that can occur between two
; insertqi calls.
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
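
; The contained/overlapping/adjacent tests above all expect the pair of
; insertqi calls (both taking their field from %i) to be merged into one
; call covering the union of the two bit ranges: [0,32) with [16,32) is
; already covered by (len 32, idx 0), while [0,32) with [16,48) and [0,32)
; with [32,48) both widen to (len 48, idx 0). Disjoint ranges, tested next,
; cannot be expressed as a single insertqi, so that pair must stay as two
; calls.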

; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK:  tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK:  tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
  ret <2 x i64> %1
}
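
; Encoding corner cases (per the SSE4a insertq definition these tests rely
; on): a length field of 0 means a full 64-bit insert, so (len 0, idx 0) in
; @testZeroLength simply yields %i. Any combination where idx + len (after
; that adjustment) runs past bit 64 -- (0,16), (48,32), and (64,16) above --
; produces an undefined result, which instcombine folds to undef.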

; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %a
}

declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %a
}

declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
  ret <4 x double> %a
}
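
; A constant vpermilvar selector can be lowered to a plain shufflevector.
; Each selector element picks within its own 128-bit lane: the low 2 bits
; of each i32 for the *.ps forms, and bit 1 of each i64 for the *.pd forms
; (hence <i64 2, i64 0> selecting <1, 0> in @test_vpermilvar_pd, and
; <3, 1, 2, 0> selecting <1, 0, 3, 2> in the 256-bit pd test).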

define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_zero(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  ret <4 x float> %a
}

define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  ret <8 x float> %a
}

define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_zero(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
  ret <2 x double> %a
}

define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
  ret <4 x double> %a
}
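
; With an all-zero selector, each element takes lane 0 of its own 128-bit
; half, which is why the 256-bit masks above are <0,0,0,0,4,4,4,4> (ps) and
; <0,0,2,2> (pd) rather than plain zeroinitializer.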

define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_1
; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
}
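
; Worked arithmetic for the constant above (a sketch): the chain applies
; six left shifts by 1 at 16/32/64-bit granularity, a net shift of 6. The
; i16 lanes <1..8> become <64,128,...,512>; no value reaches bit 15, so
; nothing leaks across lane boundaries, e.g. for the low half:
;   64 + (128 << 16) + (192 << 32) + (256 << 48) = 72058418680037440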

define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_1
; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
}

define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_0
; CHECK: ret <4 x i64> zeroinitializer
}
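
; In the two *_0 tests above the shift count is 128, which is >= the bit
; width of every element type involved, so each logical shift (and therefore
; the whole chain) folds to zero.
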
define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_1
; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
}
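
; Unlike the left-shift tests, this chain is not a clean per-lane >> 6: at
; the 32- and 64-bit steps a 1-bit right shift moves the low bit of an
; upper i16 lane into bit 15 of the lane below it (the final 64-bit shift
; is what deposits the 0x8000 term visible in the checked constant). The
; constant folder simply evaluates each step.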

define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_1
; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
}

define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_0
; CHECK: ret <4 x i64> zeroinitializer
}

declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1

attributes #1 = { nounwind readnone }
    606