; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i16 @test1(float %f) {
entry:
; CHECK-LABEL: @test1(
; CHECK: fmul float
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
; CHECK: ret
  %tmp = insertelement <4 x float> undef, float %f, i32 0  ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3  ; <<4 x float>> [#uses=1]
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )  ; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )  ; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )  ; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )  ; <<4 x float>> [#uses=1]
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )  ; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16  ; <i16> [#uses=1]
  ret i16 %tmp69
}

define i32 @test2(float %f) {
; CHECK-LABEL: @test2(
; CHECK-NOT: insertelement
; CHECK-NOT: extractelement
; CHECK: ret
  %tmp5 = fmul float %f, %f
  %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
  %tmp21 = extractelement <4 x i32> %tmp19, i32 0
  ret i32 %tmp21
}
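
; Once the dead zero lanes are discarded, @test2 should collapse to straight
; scalar code. A sketch of the expected output (illustrative only; the
; FileCheck lines above are the authoritative contract):
;
;   %tmp5 = fmul float %f, %f
;   %tmp21 = bitcast float %tmp5 to i32
;   ret i32 %tmp21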

define i64 @test3(float %f, double %d) {
; CHECK-LABEL: @test3(
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK: ret
entry:
  %v00 = insertelement <4 x float> undef, float %f, i32 0
  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
  %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
  %v10 = insertelement <4 x float> undef, float %f, i32 0
  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
  %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
  %v20 = insertelement <4 x float> undef, float %f, i32 0
  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
  %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
  %v30 = insertelement <4 x float> undef, float %f, i32 0
  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
  %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
  %v40 = insertelement <2 x double> undef, double %d, i32 0
  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
  %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
  %v50 = insertelement <2 x double> undef, double %d, i32 0
  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
  %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
  %v60 = insertelement <2 x double> undef, double %d, i32 0
  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
  %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
  %v70 = insertelement <2 x double> undef, double %d, i32 0
  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
  %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
  %tmp8 = add i32 %tmp0, %tmp2
  %tmp9 = add i32 %tmp4, %tmp6
  %tmp10 = add i32 %tmp8, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp1, %tmp3
  %tmp13 = add i64 %tmp5, %tmp7
  %tmp14 = add i64 %tmp12, %tmp13
  %tmp15 = add i64 %tmp11, %tmp14
  ret i64 %tmp15
}
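
; The scalar conversion intrinsics (cvtss2si, cvttss2si, cvtsd2si, and their
; 64-bit variants) read only element 0 of their operand, so every zero-fill
; insertelement chain above is dead beyond its first insert. A sketch of the
; expected shape for the first conversion (illustrative, not a CHECK pattern):
;
;   %v00 = insertelement <4 x float> undef, float %f, i32 0
;   %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v00)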

define void @get_image() nounwind {
; CHECK-LABEL: @get_image(
; CHECK-NOT: extractelement
; CHECK: unreachable
entry:
  %0 = call i32 @fgetc(i8* null) nounwind  ; <i32> [#uses=1]
  %1 = trunc i32 %0 to i8  ; <i8> [#uses=1]
  %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1  ; <<100 x i8>> [#uses=1]
  %tmp1 = extractelement <100 x i8> %tmp2, i32 0  ; <i8> [#uses=1]
  %2 = icmp eq i8 %tmp1, 80  ; <i1> [#uses=1]
  br i1 %2, label %bb2, label %bb3

bb2:  ; preds = %entry
  br label %bb3

bb3:  ; preds = %bb2, %entry
  unreachable
}

; PR4340
define void @vac(<4 x float>* nocapture %a) nounwind {
; CHECK-LABEL: @vac(
; CHECK-NOT: load
; CHECK: ret
entry:
  %tmp1 = load <4 x float>, <4 x float>* %a  ; <<4 x float>> [#uses=1]
  %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0  ; <<4 x float>> [#uses=1]
  %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1  ; <<4 x float>> [#uses=1]
  %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2  ; <<4 x float>> [#uses=1]
  %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3  ; <<4 x float>> [#uses=1]
  store <4 x float> %vecins8, <4 x float>* %a
  ret void
}

declare i32 @fgetc(i8*)

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)

; <rdar://problem/6945110>
define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
entry:
  %tmp = load <4 x i16>, <4 x i16>* %src
  %tmp1 = load <8 x i16>, <8 x i16>* %foo
; CHECK: %tmp2 = shufflevector
  %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; pmovzxwd ignores the upper 64 bits of its input; -instcombine should remove this shuffle:
; CHECK-NOT: shufflevector
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: pmovzxwd
  %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
  ret <4 x i32> %0
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
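
; In @kernel3_vertical, indices 8-11 of %tmp3 select lanes 0-3 of %tmp2, and
; pmovzxwd only demands those four lanes, so the second shuffle should fold
; away and the call should use %tmp2 directly -- roughly (illustrative):
;
;   %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp2)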

define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
entry:
; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle9.i
}

define <2 x float> @test_fptrunc(double %f) {
; CHECK-LABEL: @test_fptrunc(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x double> undef, double %f, i32 0
  %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
  %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
  %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %ret
}

define <2 x double> @test_fpext(float %f) {
; CHECK-LABEL: @test_fpext(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x float> undef, float %f, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
  %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %ret
}

define <4 x float> @test_select(float %f, float %g) {
; CHECK-LABEL: @test_select(
; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
; CHECK-NOT: insertelement
; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
; CHECK-NOT: insertelement
; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
  %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
  %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
  %b0 = insertelement <4 x float> undef, float %g, i32 0
  %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
  %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
  %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
  %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
  ret <4 x float> %ret
}
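
; In @test_select the constant condition picks lanes 0 and 3 from %a3 and
; lanes 1 and 2 from %b3. The chosen %b3 lanes are the constants 4.0 and 5.0,
; and the never-chosen lanes can be anything, which is how the constant
; select operand in the CHECK line above arises:
;
;   <float undef, float 4.000000e+00, float 5.000000e+00, float undef>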

; We should optimize these two redundant insertqi calls into one
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}

; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}

; Test the various kinds of range overlap and ordering that can occur
; between two insertqi calls
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
  ret <2 x i64> %1
}

; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
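
; A reading of the insertqi semantics the tests above rely on (per the
; comments and CHECK lines in this file, not a formal ISA reference):
; insertqi(%v, %i, len, idx) forms its low 64 bits by writing bits
; [idx, idx+len) from the low len bits of %i and keeping the remaining low
; bits of %v; the upper 64 bits of the result are undefined. In particular:
;   len 64, idx 0   -> the low half is exactly %i, so the call folds to %i
;   len 0           -> treated as a full 64-bit length (@testZeroLength)
;   idx + len > 64  -> out of range; the tests expect undef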

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %a
}

declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %a
}

declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
  ret <4 x double> %a
}

define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_zero(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  ret <4 x float> %a
}

define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  ret <8 x float> %a
}

define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_zero(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
  ret <2 x double> %a
}

define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
  ret <4 x double> %a
}
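
; With a constant selector, vpermilvar is a compile-time-known lane
; rearrangement, so each call above should lower to a plain shufflevector as
; the CHECK lines state. Note that the pd variants read their selector from
; bit 1 of each i64 element, which is why <i64 2, i64 0> in @test_vpermilvar_pd
; becomes the mask <i32 1, i32 0>, and @test_vpermilvar_pd_256 folds to:
;
;   %a = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>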

define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_1
; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
}

define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_1
; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
}
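
; Worked arithmetic for the two tests above: each chain applies six
; shift-left-by-1 steps (psll.w/d/q, then pslli.w/d/q), which compound to a
; single shift by 6, i.e. a multiply by 64. The lane values stay far below
; 2^16, so no bit ever crosses a 16-bit lane boundary at the wider shift
; widths. For @test_sse2_1, <1,...,8> * 64 = <64,...,512>, which packed as
; two i64s is exactly the constant in the CHECK line; for @test_avx2_1,
; <1,2,3,4> * 64 = <64,128,192,256>.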

define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_0
; CHECK: ret <4 x i64> zeroinitializer
}

define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_1
; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
}

define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_1
; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
}
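
; The psrl chains fold the same way (six right shifts by 1), but a right
; shift at d or q width can move the low bit of an odd 16-bit lane into the
; lane below it. That bleed-through is why @test_sse2_psrl_1 does not fold to
; the simple lanewise <16,...,2048> >> 6; its CHECK constant encodes the
; crossed bits. @test_avx2_psrl_1's lanes stay even at every step, so it does
; fold to the plain lanewise result <16, 32, 64, 128>.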

define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_0
; CHECK: ret <4 x i64> zeroinitializer
}

declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1

attributes #1 = { nounwind readnone }