; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; Codegen tests for vector shuffles with AVX enabled. The command above
; compiles this file with llc and pipes the assembly to FileCheck, which
; matches it against the directives placed next to each function.

; PR11102
; Mask <2, 5, undef, undef>: lane 0 is element 2 of the zero vector, lane 1 is
; element 1 of %a (index 5 = 4 + 1); lanes 2 and 3 are undef.
; Expected output: vshufps followed by vpshufd.
define <4 x float> @test1(<4 x float> %a) nounwind {
  %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
  ret <4 x float> %b
; CHECK: test1:
; CHECK: vshufps
; CHECK: vpshufd
}

; rdar://10538417
; Widens %v to <3 x i64> (third lane undef), then builds {v[0], v[1], 0} by
; taking lanes 3 and 4 from the widened value and lane 2 from the zero vector.
; Expected output: vinsertf128, then ret.
define <3 x i64> @test2(<2 x i64> %v) nounwind readnone {
; CHECK: test2:
; CHECK: vinsertf128
  %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef>
  %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2>
  ret <3 x i64> %2
; CHECK: ret
}

; Mask <4, 5, 2, undef>: result is {b[0], b[1], a[2], undef}.
; Expected output: vperm2f128, then ret.
define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef>
  ret <4 x i64> %c
; CHECK: test3:
; CHECK: vperm2f128
; CHECK: ret
}

; Inserts the scalar %a into lane 0 of an all-zero <8 x float>.
; Expected output: vinsertf128.
define <8 x float> @test4(float %a) nounwind {
  %b = insertelement <8 x float> zeroinitializer, float %a, i32 0
  ret <8 x float> %b
; CHECK: test4:
; CHECK: vinsertf128
}

; rdar://10594409
; Loads four floats (align 16) and widens them to eight lanes: lanes 0-3 are
; the loaded values, lanes 4-7 all read element 0 of the second operand, which
; is 0.0. Expected output: a vmovaps with no vxorps and no vinsertf128 between
; it and the next positive match.
define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
entry:
  %0 = bitcast float* %f to <4 x float>*
  %1 = load <4 x float>* %0, align 16
; CHECK: test5
; CHECK: vmovaps
; CHECK-NOT: vxorps
; CHECK-NOT: vinsertf128
  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  ret <8 x float> %shuffle.i
}

; Double-precision counterpart of test5: loads two doubles (align 16) and
; widens to four lanes, with lanes 2 and 3 reading the 0.0 in element 0 of the
; second operand. Same expectations: vmovaps, no vxorps, no vinsertf128.
define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
entry:
  %0 = bitcast double* %d to <2 x double>*
  %1 = load <2 x double>* %0, align 16
; CHECK: test6
; CHECK: vmovaps
; CHECK-NOT: vxorps
; CHECK-NOT: vinsertf128
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x double> %shuffle.i
}

; Widens <4 x i16> to <16 x i16>: lanes 0 and 1 both read a[1], the remaining
; fourteen lanes are undef. No particular instruction is required; the test
; only expects the function to be emitted and to reach ret.
define <16 x i16> @test7(<4 x i16> %a) nounwind {
; CHECK: test7
  %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: ret
  ret <16 x i16> %b
}

; Two chained <16 x i64> shuffles that mix a partially-undef constant vector
; with a value loaded from address space 1; the result is stored through an
; undef address-space-1 pointer. No particular instruction is required; the
; test only expects the function to be emitted and to reach ret.
; CHECK: test8
define void @test8() {
entry:
  %0 = load <16 x i64> addrspace(1)* null, align 128
  %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26>
  %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15>
  store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128
; CHECK: ret
  ret void
}

; Extract a value from a shufflevector.
; Lane 2 of the widened shuffle has mask index 2, i.e. it is a[2].
; Expected output: vpextrd, then ret.
define i32 @test9(<4 x i32> %a) nounwind {
; CHECK: test9
; CHECK: vpextrd
  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
  %r = extractelement <8 x i32> %b, i32 2
; CHECK: ret
  ret i32 %r
}

; Extract a value which is the result of an undef mask.
; Lane 2 of the shuffle mask below is undef, so the extracted value is undef.
; The negative pattern rejects any output text between the "@test10" match and
; "ret" that has a lowercase letter before a '#'.
; NOTE(review): presumably this asserts that no instruction is emitted before
; ret -- confirm against the actual llc output.
define i32 @test10(<4 x i32> %a) nounwind {
; CHECK: @test10
; CHECK-NOT: {{^[^#]*[a-z]}}
; CHECK: ret
  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %r = extractelement <8 x i32> %b, i32 2
  ret i32 %r
}

; Reverses the four float lanes of a register argument (mask <3, 2, 1, 0>).
; Expected output: vpshufd with immediate 27 (0b00011011, consistent with the
; reversed lane order).
define <4 x float> @test11(<4 x float> %a) nounwind {
; CHECK: test11
; CHECK: vpshufd $27
  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %tmp1
}

; Same lane reversal as test11, but the source vector is loaded from memory.
; Only the vpshufd mnemonic is expected; the immediate is not pinned.
define <4 x float> @test12(<4 x float>* %a) nounwind {
; CHECK: test12
; CHECK: vpshufd
  %tmp0 = load <4 x float>* %a
  %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %tmp1
}

; Integer counterpart of test11: reverses the four i32 lanes of a register
; argument. Expected output: vpshufd with immediate 27.
define <4 x i32> @test13(<4 x i32> %a) nounwind {
; CHECK: test13
; CHECK: vpshufd $27
  %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %tmp1
}

; Integer lane reversal with the source loaded from memory. The expected text
; ends in "(", i.e. the operand after the immediate starts with a parenthesis.
; NOTE(review): presumably this verifies the load is folded into vpshufd as a
; memory operand -- confirm.
define <4 x i32> @test14(<4 x i32>* %a) nounwind {
; CHECK: test14
; CHECK: vpshufd $27, (
  %tmp0 = load <4 x i32>* %a
  %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %tmp1
}

; Widens <2 x i32> to <4 x i32>; lanes 0 and 1 are kept and lanes 2 and 3 are
; undef. Expected output: vpshufd with immediate 8, then ret.
; CHECK: test15
; CHECK: vpshufd $8
; CHECK: ret
define <4 x i32> @test15(<2 x i32>%x) nounwind readnone {
  %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i32>%x1
}

; rdar://10974078
; Loads four floats with only 8-byte alignment and widens them to eight lanes:
; lanes 0-3 are the loaded values, lanes 4-7 all read element 0 of the second
; operand, which is 0.0. Expected output: vmovups with no vxorps and no
; vinsertf128 between it and the next positive match.
define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp {
entry:
  %0 = bitcast float* %f to <4 x float>*
  %1 = load <4 x float>* %0, align 8
; CHECK: test16
; CHECK: vmovups
; CHECK-NOT: vxorps
; CHECK-NOT: vinsertf128
  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  ret <8 x float> %shuffle.i
}

; PR12413
; Selects every even-indexed byte (0, 2, ..., 62) of the 64-byte concatenation
; of %inval1 and %inval2. Expected output: four vpshufb instructions.
; CHECK: shuf1
; CHECK: vpshufb
; CHECK: vpshufb
; CHECK: vpshufb
; CHECK: vpshufb
define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
entry:
  %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  ret <32 x i8> %0
}

; Handle the case where only half of the 256 bits is splittable.
; Same mask as shuf1 except that result lane 16 reads index 31 instead of 32.
; Expected output: two vpshufb followed by two vpextrb.
; CHECK: shuf2
; CHECK: vpshufb
; CHECK: vpshufb
; CHECK: vpextrb
; CHECK: vpextrb
define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
entry:
  %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  ret <32 x i8> %0
}

; Mask <0, 1, 2, 7>: lanes 0-2 come from %a and lane 3 from %b, each staying in
; its own position. Expected output: vblendps, then ret.
; CHECK: blend1
; CHECK: vblendps
; CHECK: ret
define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %t
}

; Mask <0, 5, 2, 7>: even lanes from %a, odd lanes from %b, each staying in its
; own position. Expected output: vblendps, then ret.
; CHECK: blend2
; CHECK: vblendps
; CHECK: ret
define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %t
}

; Float counterpart of blend2 (same mask <0, 5, 2, 7>).
; Expected output: vblendps, then ret.
; CHECK: blend2a
; CHECK: vblendps
; CHECK: ret
define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
  %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %t
}

; Mask <1, 5, 2, 7>: lane 0 reads a[1], so the element changes position and the
; shuffle is not a position-preserving select. The test expects no vblendps
; before ret.
; CHECK: blend3
; CHECK-NOT: vblendps
; CHECK: ret
define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
  ret <4 x i32> %t
}

; 256-bit counterpart of blend1 on <4 x i64> (mask <0, 1, 2, 7>).
; Expected output: vblendpd, then ret.
; CHECK: blend4
; CHECK: vblendpd
; CHECK: ret
define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i64> %t
}

; Single-source <16 x i16> shuffle whose defined mask entries move elements in
; adjacent pairs: (2,3), (undef,1), (6,7), (4,5), (10,11), (8,undef), (14,15),
; (undef,undef). Expected output: vpermilps, then ret.
; NOTE(review): presumably the pairing lets the shuffle be handled on 32-bit
; lanes, which is why a float permute is expected -- confirm.
; CHECK: narrow
; CHECK: vpermilps
; CHECK: ret
define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
  %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
  ret <16 x i16> %t
}

; Widens <4 x float> to <8 x float>; only lane 1 is defined (it reads y[1]),
; every other lane is undef. The test expects no vinsertf128 before ret.
;CHECK: test17
;CHECK-NOT: vinsertf128
;CHECK: ret
define <8 x float> @test17(<4 x float> %y) {
  %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %x
}

; Mask <1, 9, 3, 11, 5, 13, 7, 15>: interleaves the odd-indexed elements of %A
; and %B. Expected output: vmovshdup, then vblendps, then ret.
; CHECK: test18
; CHECK: vmovshdup
; CHECK: vblendps
; CHECK: ret
define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x float>%S
}

; Mask <0, 8, 2, 10, 4, 12, 6, 14>: interleaves the even-indexed elements of %A
; and %B. Expected output: vmovsldup, then vblendps, then ret.
; CHECK: test19
; CHECK: vmovsldup
; CHECK: vblendps
; CHECK: ret
define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x float>%S
}

; rdar://12684358
; Make sure loads happen before stores.
; swap8doubles exchanges eight doubles between %A and %C. The expected output
; lists all six loads (four xmm loads from (%rdi), two ymm loads from (%rsi))
; ahead of the first store, then the four xmm stores to (%rdi) (the upper
; halves produced by vextractf128) and finally the two ymm stores to (%rsi).
; NOTE(review): presumably %rdi holds %A and %rsi holds %C under the x86-64
; calling convention used for this triple -- confirm.
; CHECK: swap8doubles
; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
; CHECK: vextractf128
; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
; CHECK: vextractf128
; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
entry:
  ; Build the first <4 x double> from A[0..1] (low half, via the widening
  ; shuffle) and A[2..3] (inserted as the upper half). Loads use align 1.
  %add.ptr = getelementptr inbounds double* %A, i64 2
  %v.i = bitcast double* %A to <2 x double>*
  %0 = load <2 x double>* %v.i, align 1
  %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %v1.i = bitcast double* %add.ptr to <2 x double>*
  %1 = load <2 x double>* %v1.i, align 1
  %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
  ; Build the second <4 x double> the same way from A[4..5] and A[6..7].
  %add.ptr1 = getelementptr inbounds double* %A, i64 6
  %add.ptr2 = getelementptr inbounds double* %A, i64 4
  %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
  %3 = load <2 x double>* %v.i27, align 1
  %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
  %4 = load <2 x double>* %v1.i29, align 1
  %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
  ; Load C[0..3] and C[4..7] as two 32-byte-aligned <4 x double> values.
  %6 = bitcast double* %C to <4 x double>*
  %7 = load <4 x double>* %6, align 32
  %add.ptr5 = getelementptr inbounds double* %C, i64 4
  %8 = bitcast double* %add.ptr5 to <4 x double>*
  %9 = load <4 x double>* %8, align 32
  ; Split each value loaded from C into its low half (shuffle of lanes 0-1)
  ; and its upper half (vextractf128 with index 1).
  %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
  %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
  ; Write the four halves taken from C over A[0..7]. These stores reuse the
  ; pointers that were loaded from above, but with align 16.
  store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
  store <2 x double> %10, <2 x double>* %v1.i, align 16
  store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
  store <2 x double> %11, <2 x double>* %v1.i29, align 16
  ; Write the two vectors assembled from A over C[0..7].
  store <4 x double> %2, <4 x double>* %6, align 32
  store <4 x double> %5, <4 x double>* %8, align 32
  ret void
}
; AVX intrinsics used by swap8doubles.
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone