; InstCombine demanded-vector-elements tests for x86 SSE/SSE2/SSE4a/AVX/AVX2
; intrinsics: scalar-lane intrinsics (.ss/.sd), insertqi merging, vpermilvar
; folding to shufflevector, and constant-folding of packed shift intrinsics.
; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i16 @test1(float %f) {
entry:
; CHECK-LABEL: @test1(
; CHECK: fmul float
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
; CHECK: ret
  %tmp = insertelement <4 x float> undef, float %f, i32 0   ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1   ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2   ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3   ; <<4 x float>> [#uses=1]
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )   ; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )   ; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )   ; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )   ; <<4 x float>> [#uses=1]
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )   ; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16   ; <i16> [#uses=1]
  ret i16 %tmp69
}

define i32 @test2(float %f) {
; CHECK-LABEL: @test2(
; CHECK-NOT: insertelement
; CHECK-NOT: extractelement
; CHECK: ret
  %tmp5 = fmul float %f, %f
  %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
  %tmp21 = extractelement <4 x i32> %tmp19, i32 0
  ret i32 %tmp21
}

define i64 @test3(float %f, double %d) {
; CHECK-LABEL: @test3(
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK: ret
entry:
  %v00 = insertelement <4 x float> undef, float %f, i32 0
  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
  %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
  %v10 = insertelement <4 x float> undef, float %f, i32 0
  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
  %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
  %v20 = insertelement <4 x float> undef, float %f, i32 0
  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
  %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
  %v30 = insertelement <4 x float> undef, float %f, i32 0
  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
  %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
  %v40 = insertelement <2 x double> undef, double %d, i32 0
  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
  %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
  %v50 = insertelement <2 x double> undef, double %d, i32 0
  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
  %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
  %v60 = insertelement <2 x double> undef, double %d, i32 0
  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
  %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
  %v70 = insertelement <2 x double> undef, double %d, i32 0
  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
  %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
  %tmp8 = add i32 %tmp0, %tmp2
  %tmp9 = add i32 %tmp4, %tmp6
  %tmp10 = add i32 %tmp8, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp1, %tmp3
  %tmp13 = add i64 %tmp5, %tmp7
  %tmp14 = add i64 %tmp12, %tmp13
  %tmp15 = add i64 %tmp11, %tmp14
  ret i64 %tmp15
}

define void @get_image() nounwind {
; CHECK-LABEL: @get_image(
; CHECK-NOT: extractelement
; CHECK: unreachable
entry:
  %0 = call i32 @fgetc(i8* null) nounwind   ; <i32> [#uses=1]
  %1 = trunc i32 %0 to i8   ; <i8> [#uses=1]
  %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1   ; <<100 x i8>> [#uses=1]
  %tmp1 = extractelement <100 x i8> %tmp2, i32 0   ; <i8> [#uses=1]
  %2 = icmp eq i8 %tmp1, 80   ; <i1> [#uses=1]
  br i1 %2, label %bb2, label %bb3

bb2:   ; preds = %entry
  br label %bb3

bb3:   ; preds = %bb2, %entry
  unreachable
}

; PR4340
define void @vac(<4 x float>* nocapture %a) nounwind {
; CHECK-LABEL: @vac(
; CHECK-NOT: load
; CHECK: ret
entry:
  %tmp1 = load <4 x float>* %a   ; <<4 x float>> [#uses=1]
  %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0   ; <<4 x float>> [#uses=1]
  %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1   ; <<4 x float>> [#uses=1]
  %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2   ; <<4 x float>> [#uses=1]
  %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3   ; <<4 x float>> [#uses=1]
  store <4 x float> %vecins8, <4 x float>* %a
  ret void
}

declare i32 @fgetc(i8*)

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)

; <rdar://problem/6945110>
define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
entry:
  %tmp = load <4 x i16>* %src
  %tmp1 = load <8 x i16>* %foo
; CHECK: %tmp2 = shufflevector
  %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
; CHECK-NOT: shufflevector
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: pmovzxwd
  %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
  ret <4 x i32> %0
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone

define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
entry:
; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle9.i
}

define <2 x float> @test_fptrunc(double %f) {
; CHECK-LABEL: @test_fptrunc(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x double> undef, double %f, i32 0
  %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
  %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
  %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %ret
}

define <2 x double> @test_fpext(float %f) {
; CHECK-LABEL: @test_fpext(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x float> undef, float %f, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
  %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %ret
}

define <4 x float> @test_select(float %f, float %g) {
; CHECK-LABEL: @test_select(
; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
; CHECK-NOT: insertelement
; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
; CHECK-NOT: insertelement
; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
  %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
  %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
  %b0 = insertelement <4 x float> undef, float %g, i32 0
  %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
  %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
  %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
  %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
  ret <4 x float> %ret
}

; We should optimize these two redundant insertqi into one
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}

; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}

; Test the several types of ranges and ordering that exist for two insertqi
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}


; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %a
}

declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i32>)
define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> <i32 2, i32 0>)
  ret <2 x double> %a
}

declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i32>)
define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> <i32 3, i32 1, i32 2, i32 0>)
  ret <4 x double> %a
}

define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_zero(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  ret <4 x float> %a
}

define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  ret <8 x float> %a
}

define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_zero(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> zeroinitializer)
  ret <2 x double> %a
}

define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> zeroinitializer)
  ret <4 x double> %a
}

define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_1
; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
}

define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_1
; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
}

define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_0
; CHECK: ret <4 x i64> zeroinitializer
}
define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_1
; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
}

define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_1
; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
}

define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_0
; CHECK: ret <4 x i64> zeroinitializer
}

declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1

attributes #1 = { nounwind readnone }