Home | History | Annotate | Download | only in X86
      1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
      2 
      3 ; PR11102
        ; Shuffle mask <2, 5, undef, undef>: lane 0 takes element 2 of the zero
        ; vector, lane 1 takes element 1 of %a (mask index 5 - 4), and lanes 2-3
        ; are undefined. The checks expect a vxorps followed by a vinsertps.
      4 define <4 x float> @test1(<4 x float> %a) nounwind {
      5   %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
      6   ret <4 x float> %b
      7 ; CHECK-LABEL: test1:
      8 ;; TODO: This test could be improved by removing the xor instruction and
      9 ;; having vinsertps zero out the needed elements.
     10 ; CHECK: vxorps
     11 ; CHECK: vinsertps
     12 }
     13 
     14 ; rdar://10538417
        ; Widens a <2 x i64> to <3 x i64> and then fills the third lane with zero:
        ; the second shuffle's mask <3, 4, 2> takes lanes 0-1 from %1 (second
        ; operand, elements 0 and 1) and lane 2 from the zero vector.
        ; The checks expect a vinsertf128 somewhere before the ret.
     15 define <3 x i64> @test2(<2 x i64> %v) nounwind readnone {
     16 ; CHECK-LABEL: test2:
     17 ; CHECK: vinsertf128
     18   %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef>
     19   %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2>
     20   ret <3 x i64> %2
     21 ; CHECK: ret
     22 }
     23 
     24 define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
        ; Mask <4, 5, 2, undef>: lanes 0-1 come from %b (elements 0 and 1),
        ; lane 2 comes from %a (element 2), lane 3 is undefined. Every defined
        ; lane keeps its position, and the checks expect a single vblendpd.
     25   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef>
     26   ret <4 x i64> %c
     27 ; CHECK-LABEL: test3:
     28 ; CHECK: vblendpd
     29 ; CHECK: ret
     30 }
     31 
     32 define <8 x float> @test4(float %a) nounwind {
        ; Inserts the scalar %a into lane 0 of an all-zero <8 x float>.
        ; The check expects a vinsertf128 in the generated code.
     33   %b = insertelement <8 x float> zeroinitializer, float %a, i32 0
     34   ret <8 x float> %b
     35 ; CHECK-LABEL: test4:
     36 ; CHECK: vinsertf128
     37 }
     38 
     39 ; rdar://10594409
        ; Loads a 16-byte-aligned <4 x float> and widens it to <8 x float>:
        ; lanes 0-3 are the loaded values and lanes 4-7 all select mask index 4,
        ; i.e. element 0 of the second operand, which is 0.0.
        ; The checks expect a vmovaps with no vxorps or vinsertf128 after it.
     40 define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
     41 entry:
     42   %0 = bitcast float* %f to <4 x float>*
     43   %1 = load <4 x float>* %0, align 16
        ; CHECK-LABEL (rather than a plain CHECK) keeps the directives below from
        ; matching in another function's output.
     44 ; CHECK-LABEL: test5:
     45 ; CHECK: vmovaps
     46 ; CHECK-NOT: vxorps
     47 ; CHECK-NOT: vinsertf128
     48   %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
     49   ret <8 x float> %shuffle.i
     50 }
     51 
     52 define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
        ; Loads a 16-byte-aligned <2 x double> and widens it to <4 x double>:
        ; lanes 0-1 are the loaded values and lanes 2-3 both select mask index 2,
        ; i.e. element 0 of the second operand, which is 0.0.
        ; The checks expect a vmovaps with no vxorps or vinsertf128 after it.
     53 entry:
     54   %0 = bitcast double* %d to <2 x double>*
     55   %1 = load <2 x double>* %0, align 16
        ; CHECK-LABEL (rather than a plain CHECK) keeps the directives below from
        ; matching in another function's output.
     56 ; CHECK-LABEL: test6:
     57 ; CHECK: vmovaps
     58 ; CHECK-NOT: vxorps
     59 ; CHECK-NOT: vinsertf128
     60   %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
     61   ret <4 x double> %shuffle.i
     62 }
     63 
     64 define <16 x i16> @test7(<4 x i16> %a) nounwind {
        ; Widens a <4 x i16> to <16 x i16>: lanes 0 and 1 both take element 1 of
        ; %a and every other lane is undefined. The only expectation is that
        ; code is produced up to the ret.
        ; CHECK-LABEL anchors the match on this function's label line.
     65 ; CHECK-LABEL: test7:
     66   %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     67 ; CHECK: ret
     68   ret <16 x i16> %b
     69 }
     70 
     71 ; CHECK-LABEL: test8:
        ; Two chained <16 x i64> shuffles that mix a partly-undef constant vector
        ; with a vector loaded from null in address space 1; the result is stored
        ; to an undef pointer. The only expectation is that code is produced up
        ; to the ret.
        ; NOTE(review): presumably a crash regression test - confirm.
     72 define void @test8() {
     73 entry:
     74   %0 = load <16 x i64> addrspace(1)* null, align 128
     75   %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26>
     76   %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15>
     77   store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128
     78 ; CHECK: ret
     79   ret void
     80 }
     81 
     82 ; Extract a value from a shufflevector.
        ; Lane 2 of the widened vector uses mask index 2, i.e. element 2 of %a, so
        ; the extracted value is a defined element of the input. The checks
        ; expect a vpextrd before the ret.
     83 define i32 @test9(<4 x i32> %a) nounwind {
        ; CHECK-LABEL anchors the match on this function's label line.
     84 ; CHECK-LABEL: test9:
     85 ; CHECK: vpextrd
     86   %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
     87   %r = extractelement <8 x i32> %b, i32 2
     88 ; CHECK: ret
     89   ret i32 %r
     90 }
     91 
     92 ; Extract a value which is the result of an undef mask.
        ; Lane 2 of the shuffle mask is undef, so the extracted element is
        ; undefined. The CHECK-NOT regex rejects any line that has a lowercase
        ; letter before the first '#', between the `@test10` match and the ret.
        ; NOTE(review): `@test10` presumably matches the assembler comment that
        ; trails the function label, so the CHECK-NOT scan starts after it -
        ; confirm before converting this to CHECK-LABEL.
     93 define i32 @test10(<4 x i32> %a) nounwind {
     94 ; CHECK: @test10
     95 ; CHECK-NOT: {{^[^#]*[a-z]}}
     96 ; CHECK: ret
     97   %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     98   %r = extractelement <8 x i32> %b, i32 2
     99   ret i32 %r
    100 }
    101 
    102 define <4 x float> @test11(<4 x float> %a) nounwind  {
        ; Reverses the four float lanes (mask <3, 2, 1, 0>). The expected
        ; immediate $27 is 0b00011011, i.e. two-bit selectors 3, 2, 1, 0 for
        ; lanes 0-3.
        ; CHECK-LABEL anchors the match on this function's label line.
    103 ; CHECK-LABEL: test11:
    104 ; CHECK: vpshufd $27
    105   %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    106   ret <4 x float> %tmp1
    107 }
    108 
    109 define <4 x float> @test12(<4 x float>* %a) nounwind  {
        ; Same lane reversal as test11, but the input vector is loaded from
        ; memory. Only the vpshufd mnemonic is checked, not its operands.
        ; CHECK-LABEL anchors the match on this function's label line.
    110 ; CHECK-LABEL: test12:
    111 ; CHECK: vpshufd
    112   %tmp0 = load <4 x float>* %a
    113   %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    114   ret <4 x float> %tmp1
    115 }
    116 
    117 define <4 x i32> @test13(<4 x i32> %a) nounwind  {
        ; Reverses the four i32 lanes (mask <3, 2, 1, 0>). The expected
        ; immediate $27 is 0b00011011, i.e. two-bit selectors 3, 2, 1, 0 for
        ; lanes 0-3.
        ; CHECK-LABEL anchors the match on this function's label line.
    118 ; CHECK-LABEL: test13:
    119 ; CHECK: vpshufd $27
    120   %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    121   ret <4 x i32> %tmp1
    122 }
    123 
    124 define <4 x i32> @test14(<4 x i32>* %a) nounwind  {
        ; Same lane reversal as test13, but the input vector is loaded from
        ; memory. The trailing `(` in the pattern requires the source operand to
        ; start with a parenthesis, i.e. an AT&T-syntax memory operand, so the
        ; load is expected to be folded into the vpshufd.
        ; CHECK-LABEL anchors the match on this function's label line.
    125 ; CHECK-LABEL: test14:
    126 ; CHECK: vpshufd $27, (
    127   %tmp0 = load <4 x i32>* %a
    128   %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    129   ret <4 x i32> %tmp1
    130 }
    131 
    132 ; CHECK-LABEL: test15:
    133 ; CHECK: vpshufd $8
    134 ; CHECK: ret
        ; Widens a <2 x i32> to <4 x i32>: lanes 0-1 are the input elements and
        ; lanes 2-3 are undefined. The expected immediate $8 is 0b00001000, i.e.
        ; two-bit selectors 0, 2, 0, 0 for lanes 0-3.
    135 define <4 x i32> @test15(<2 x i32>%x) nounwind readnone {
    136   %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    137   ret <4 x i32>%x1
    138 }
    139 
    140 ; rdar://10974078
        ; Same widening shuffle as test5, but the load is only 8-byte aligned, so
        ; the checks expect the unaligned vmovups, again with no vxorps or
        ; vinsertf128 after it.
    141 define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp {
    142 entry:
    143   %0 = bitcast float* %f to <4 x float>*
    144   %1 = load <4 x float>* %0, align 8
        ; CHECK-LABEL (rather than a plain CHECK) keeps the directives below from
        ; matching in another function's output.
    145 ; CHECK-LABEL: test16:
    146 ; CHECK: vmovups
    147 ; CHECK-NOT: vxorps
    148 ; CHECK-NOT: vinsertf128
    149   %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
    150   ret <8 x float> %shuffle.i
    151 }
    152 
    153 ; PR12413
        ; Selects every even-indexed byte (0, 2, ..., 62) from the concatenation
        ; of %inval1 and %inval2: result lanes 0-15 come from %inval1 and lanes
        ; 16-31 from %inval2. The checks expect four vpshufb instructions.
    154 ; CHECK-LABEL: shuf1:
    155 ; CHECK: vpshufb
    156 ; CHECK: vpshufb
    157 ; CHECK: vpshufb
    158 ; CHECK: vpshufb
    159 define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
    160 entry:
    161  %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
    162  ret <32 x i8> %0
    163 }
    164 
    165 ; Handle the case where only half of the 256 bits is splittable.
        ; Same mask as shuf1 except that result lane 16 selects index 31 (the last
        ; byte of %inval1) instead of index 32. The checks expect two vpshufb
        ; followed by two vpextrb.
    166 ; CHECK-LABEL: shuf2:
    167 ; CHECK: vpshufb
    168 ; CHECK: vpshufb
    169 ; CHECK: vpextrb
    170 ; CHECK: vpextrb
    171 define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
    172 entry:
    173  %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
    174  ret <32 x i8> %0
    175 }
    176 
    177 ; CHECK-LABEL: blend1:
    178 ; CHECK: vblendps
    179 ; CHECK: ret
        ; Mask <0, 1, 2, 7>: lanes 0-2 stay from %a and lane 3 is taken from lane 3
        ; of %b, so every lane keeps its position. Expected to lower to vblendps.
    180 define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
    181   %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
    182   ret <4 x i32> %t
    183 }
    184 
    185 ; CHECK-LABEL: blend2:
    186 ; CHECK: vblendps
    187 ; CHECK: ret
        ; Mask <0, 5, 2, 7>: lanes 0 and 2 stay from %a, lanes 1 and 3 are taken
        ; from the same positions of %b. Expected to lower to vblendps.
        ; The trailing colon in the label keeps it from matching `blend2a`.
    188 define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
    189   %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    190   ret <4 x i32> %t
    191 }
    192 
    193 ; CHECK-LABEL: blend2a:
    194 ; CHECK: vblendps
    195 ; CHECK: ret
        ; Float variant of blend2: mask <0, 5, 2, 7> keeps lanes 0 and 2 from %a
        ; and takes lanes 1 and 3 from the same positions of %b.
    196 define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
    197   %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    198   ret <4 x float> %t
    199 }
    200 
    201 ; CHECK-LABEL: blend3:
    202 ; CHECK-NOT: vblendps
    203 ; CHECK: ret
        ; Mask <1, 5, 2, 7>: lane 0 takes element 1 of %a, so not every lane keeps
        ; its position. The checks require that no vblendps appears between the
        ; function label and the ret.
    204 define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
    205   %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
    206   ret <4 x i32> %t
    207 }
    208 
    209 ; CHECK-LABEL: blend4:
    210 ; CHECK: vblendpd
    211 ; CHECK: ret
        ; Mask <0, 1, 2, 7> on <4 x i64>: lanes 0-2 stay from %a and lane 3 is
        ; taken from lane 3 of %b. Expected to lower to vblendpd.
    212 define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
    213   %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
    214   ret <4 x i64> %t
    215 }
    216 
    217 ; CHECK-LABEL: narrow:
    218 ; CHECK: vpermilps
    219 ; CHECK: ret
        ; A <16 x i16> shuffle whose mask moves the i16 elements in adjacent,
        ; even-aligned pairs: (2,3) (undef,1) (6,7) (4,5) (10,11) (8,undef)
        ; (14,15) (undef,undef). Each pair therefore moves as one 32-bit unit,
        ; and the checks expect the shuffle to be done with vpermilps.
    220 define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
    221   %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
    222   ret <16 x i16> %t
    223 }
    224 
    225 ;CHECK-LABEL: test17:
    226 ;CHECK-NOT: vinsertf128
    227 ;CHECK: ret
        ; Widens a <4 x float> to <8 x float> where only lane 1 is defined (it
        ; takes element 1 of %y); every other lane is undef. The checks require
        ; that no vinsertf128 appears before the ret.
    228 define   <8 x float> @test17(<4 x float> %y) {
    229   %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    230   ret <8 x float> %x
    231 }
    232 
    233 ; CHECK-LABEL: test18:
    234 ; CHECK: vmovshdup
    235 ; CHECK: vblendps
    236 ; CHECK: ret
        ; Mask <1, 9, 3, 11, 5, 13, 7, 15>: even result lanes take the odd
        ; elements of %A (1, 3, 5, 7) and odd result lanes take the odd elements
        ; of %B in place. The checks expect a vmovshdup followed by a vblendps.
    237 define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
    238   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
    239   ret <8 x float>%S
    240 }
    241 
    242 ; CHECK-LABEL: test19:
    243 ; CHECK: vmovsldup
    244 ; CHECK: vblendps
    245 ; CHECK: ret
        ; Mask <0, 8, 2, 10, 4, 12, 6, 14>: even result lanes take the even
        ; elements of %A in place and odd result lanes take the even elements of
        ; %B (0, 2, 4, 6). The checks expect a vmovsldup followed by a vblendps.
    246 define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
    247   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
    248   ret <8 x float>%S
    249 }
    250 
    251 ; rdar://12684358
    252 ; Make sure loads happen before stores.
        ; Exchanges eight doubles between %A and %C. The eight doubles at %A are
        ; read as four <2 x double> halves (align 1) and assembled into two
        ; <4 x double> values with a widening shuffle plus the vinsertf128
        ; intrinsic. The eight doubles at %C are read as two <4 x double> values
        ; (align 32) and split into low/high halves with a shuffle and the
        ; vextractf128 intrinsic. The halves are then stored to %A and the
        ; assembled vectors to %C. The CHECK sequence lists every load from
        ; (%rdi) and (%rsi) before any store to them.
        ; CHECK-LABEL anchors the sequence on this function's label line.
    253 ; CHECK-LABEL: swap8doubles:
    254 ; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
    255 ; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
    256 ; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
    257 ; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
    258 ; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
    259 ; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
    260 ; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
    261 ; CHECK: vextractf128
    262 ; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
    263 ; CHECK: vextractf128
    264 ; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
    265 ; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
    266 define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
    267 entry:
        ; First <4 x double>: A[0..1] widened with a zero upper half, then A[2..3]
        ; inserted into the upper 128 bits.
    268   %add.ptr = getelementptr inbounds double* %A, i64 2
    269   %v.i = bitcast double* %A to <2 x double>*
    270   %0 = load <2 x double>* %v.i, align 1
    271   %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
    272   %v1.i = bitcast double* %add.ptr to <2 x double>*
    273   %1 = load <2 x double>* %v1.i, align 1
    274   %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
        ; Second <4 x double>: the same construction from A[4..5] and A[6..7].
    275   %add.ptr1 = getelementptr inbounds double* %A, i64 6
    276   %add.ptr2 = getelementptr inbounds double* %A, i64 4
    277   %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
    278   %3 = load <2 x double>* %v.i27, align 1
    279   %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
    280   %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
    281   %4 = load <2 x double>* %v1.i29, align 1
    282   %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
        ; Load C[0..3] and C[4..7], then split each into its low and high halves.
    283   %6 = bitcast double* %C to <4 x double>*
    284   %7 = load <4 x double>* %6, align 32
    285   %add.ptr5 = getelementptr inbounds double* %C, i64 4
    286   %8 = bitcast double* %add.ptr5 to <4 x double>*
    287   %9 = load <4 x double>* %8, align 32
    288   %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1>
    289   %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
    290   %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1>
    291   %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
        ; All stores come after all loads: the old C halves go to A, the
        ; assembled A vectors go to C.
    292   store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
    293   store <2 x double> %10, <2 x double>* %v1.i, align 16
    294   store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
    295   store <2 x double> %11, <2 x double>* %v1.i29, align 16
    296   store <4 x double> %2, <4 x double>* %6, align 32
    297   store <4 x double> %5, <4 x double>* %8, align 32
    298   ret void
    299 }
    300 declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
    301 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
    302 
    303 ; This test case just should not fail.
        ; Inserts into lane 2 of a <3 x double> constant whose lane 2 is undef,
        ; once with 0.0 and once with undef, and stores each result to an undef
        ; pointer. No CHECK directives are attached to this function.
    304 define void @test20() {
    305   %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2
    306   store <3 x double> %a0, <3 x double>* undef, align 1
    307   %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2
    308   store <3 x double> %a1, <3 x double>* undef, align 1
    309   ret void
    310 }
    311 
    312 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
        ; Mask <0, 2>: lane 0 keeps element 0 of %i and lane 1 takes element 0 of
        ; the second operand, which is the constant 0. The checks expect a vmovq
        ; with no xor before it.
        ; The label carries a trailing colon, like the other CHECK-LABEL lines in
        ; this file, so it matches the function's label line.
    313 ; CHECK-LABEL: test_insert_64_zext:
    314 ; CHECK-NOT: xor
    315 ; CHECK: vmovq
    316   %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
    317   ret <2 x i64> %1
    318 }
    319 
    320 ;; Ensure we don't use insertps from non v4x32 vectors.
    321 ;; On SSE4.1 it works because bigger vectors use more than 1 register.
    322 ;; On AVX they get passed in a single register.
    323 ;; FIXME: We could probably optimize this case, if we're only using the
    324 ;; first 4 indices.
        ;; The function builds the <4 x i32> value <x[0], 0, 0, x[0]> from an
        ;; <8 x i32> input using extractelement/insertelement. The checks require
        ;; that no insertps appears between the function label and the ret.
    325 define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
    326 ; CHECK-LABEL: insert_from_diff_size:
    327 ; CHECK-NOT: insertps
    328 ; CHECK: ret
    329   %vecext = extractelement <8 x i32> %x, i32 0
    330   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    331   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    332   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
    333   %a.0 = extractelement <8 x i32> %x, i32 0
    334   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
    335   ret <4 x i32> %vecinit3
    336 }
    337