Home | History | Annotate | Download | only in X86
      1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
      2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
      3 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
      4 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
      5 
      6 
      7 ; Verify the cost of vector shift left instructions.
      8 
      9 ; We always emit a single pmullw in the case of v8i16 vector shifts by
     10 ; a non-uniform constant.
     11 
     12 define <8 x i16> @test1(<8 x i16> %a) {
          ; Shift each i16 lane of %a left by its own constant amount. The
          ; amounts (1, 1, 2, 3, 7, 0, 9, 11) are not all equal, so this is a
          ; shift by a non-uniform constant vector.
     13   %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
     14   ret <8 x i16> %shl
     15 }
     16 ; CHECK: 'Cost Model Analysis' for function 'test1':
     17 ; CHECK: Found an estimated cost of 1 for instruction:   %shl
     18 
     19 
     20 define <8 x i16> @test2(<8 x i16> %a) {
          ; v8i16 shift by a constant vector that mixes zero lanes, two undef
          ; lanes, and an 'i16 -1' lane.
          ; NOTE(review): 'i16 -1' is not a valid in-range shift amount for a
          ; 16-bit element (per the LangRef, shl by >= the bit width yields
          ; poison). Presumably only the shape of the constant vector matters
          ; to the cost query - confirm before changing the operand.
     21   %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
     22   ret <8 x i16> %shl
     23 }
     24 ; CHECK: 'Cost Model Analysis' for function 'test2':
     25 ; CHECK: Found an estimated cost of 1 for instruction:   %shl
     26 
     27 
     28 ; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
     29 ; Make sure that the estimated cost is always 1 except for the case where
     30 ; we only have SSE2 support. With SSE2, we are forced to custom-lower the
     31 ; v4i32 mul as the sequence: 2x shuffle, 2x pmuludq, 2x shuffle.
     32 
     33 define <4 x i32> @test3(<4 x i32> %a) {
          ; v4i32 shift by a non-uniform constant vector <1, -1, 2, -3>.
          ; NOTE(review): the negative lanes are out-of-range shift amounts
          ; for i32 (per the LangRef, shl by >= the bit width yields poison).
          ; Presumably only the constant-vector shape matters to the cost
          ; query - confirm before changing the operand.
     34   %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
     35   ret <4 x i32> %shl
     36 }
     37 ; CHECK: 'Cost Model Analysis' for function 'test3':
     38 ; SSE2: Found an estimated cost of 6 for instruction:   %shl
     39 ; SSE41: Found an estimated cost of 1 for instruction:   %shl
     40 ; AVX: Found an estimated cost of 1 for instruction:   %shl
     41 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
     42 
     43 
     44 define <4 x i32> @test4(<4 x i32> %a) {
          ; v4i32 shift by the constant vector <0, 0, 1, 1>: the two low lanes
          ; are left unchanged (shift by 0) and the two high lanes are shifted
          ; left by 1, so the amounts are still non-uniform.
     45   %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
     46   ret <4 x i32> %shl
     47 }
     48 ; CHECK: 'Cost Model Analysis' for function 'test4':
     49 ; SSE2: Found an estimated cost of 6 for instruction:   %shl
     50 ; SSE41: Found an estimated cost of 1 for instruction:   %shl
     51 ; AVX: Found an estimated cost of 1 for instruction:   %shl
     52 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
     53 
     54 
     55 ; On AVX2 we are able to lower the following shift into a single
     56 ; vpsllvq. Therefore, the expected cost is only 1.
     57 ; In all other cases, this shift is scalarized as the target does not support
     58 ; vpsllv instructions.
     59 
     60 define <2 x i64> @test5(<2 x i64> %a) {
          ; v2i64 shift with a different constant amount per lane (2 and 3).
     61   %shl = shl <2 x i64> %a, <i64 2, i64 3>
     62   ret <2 x i64> %shl
     63 }
     64 ; CHECK: 'Cost Model Analysis' for function 'test5':
     65 ; SSE2: Found an estimated cost of 20 for instruction:   %shl
     66 ; SSE41: Found an estimated cost of 20 for instruction:   %shl
     67 ; AVX: Found an estimated cost of 20 for instruction:   %shl
     68 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
     69 
     70 
     71 ; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
     72 ; vector multiply instructions.  With AVX (but not AVX2), the vector multiply
     73 ; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
     74 ;
     75 ; With AVX2, instruction vpmullw works with 256bit quantities and
     76 ; therefore there is no need to split the resulting vector multiply into
     77 ; a sequence of two multiplies.
     78 ;
     79 ; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
     80 ; the cost computed in the case of 'test1'. That is because the backend
     81 ; simply emits 2 pmullw with no extract/insert.
     82 
     83 
     84 define <16 x i16> @test6(<16 x i16> %a) {
          ; v16i16 shift by a non-uniform constant vector: the 8-element
          ; pattern <1, 1, 2, 3, 7, 0, 9, 11> used in 'test1', repeated twice.
     85   %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
     86   ret <16 x i16> %shl
     87 }
     88 ; CHECK: 'Cost Model Analysis' for function 'test6':
     89 ; SSE2: Found an estimated cost of 2 for instruction:   %shl
     90 ; SSE41: Found an estimated cost of 2 for instruction:   %shl
     91 ; AVX: Found an estimated cost of 4 for instruction:   %shl
     92 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
     93 
     94 
     95 ; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
     96 ; the cost computed in the case of 'test3'. That is because the multiply
     97 ; is type-legalized into two v4i32 vector multiplies.
     98 
     99 define <8 x i32> @test7(<8 x i32> %a) {
          ; v8i32 shift by a non-uniform constant vector: the 4-element
          ; pattern <1, 1, 2, 3> repeated twice.
    100   %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
    101   ret <8 x i32> %shl
    102 }
    103 ; CHECK: 'Cost Model Analysis' for function 'test7':
    104 ; SSE2: Found an estimated cost of 12 for instruction:   %shl
    105 ; SSE41: Found an estimated cost of 2 for instruction:   %shl
    106 ; AVX: Found an estimated cost of 4 for instruction:   %shl
    107 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
    108 
    109 
    110 ; On AVX2 we are able to lower the following shift into a single
    111 ; vpsllvq. Therefore, the expected cost is only 1.
    112 ; In all other cases, this shift is scalarized as the target does not support
    113 ; vpsllv instructions.
    114 
    115 define <4 x i64> @test8(<4 x i64> %a) {
          ; v4i64 shift with a distinct constant amount in every lane
          ; (1, 2, 3, 4).
    116   %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
    117   ret <4 x i64> %shl
    118 }
    119 ; CHECK: 'Cost Model Analysis' for function 'test8':
    120 ; SSE2: Found an estimated cost of 40 for instruction:   %shl
    121 ; SSE41: Found an estimated cost of 40 for instruction:   %shl
    122 ; AVX: Found an estimated cost of 40 for instruction:   %shl
    123 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
    124 
    125 
    126 ; Same as 'test6', with the difference that the cost is double.
    127 
    128 define <32 x i16> @test9(<32 x i16> %a) {
          ; v32i16 shift by a non-uniform constant vector: the 8-element
          ; pattern <1, 1, 2, 3, 7, 0, 9, 11> repeated four times.
    129   %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
    130   ret <32 x i16> %shl
    131 }
    132 ; CHECK: 'Cost Model Analysis' for function 'test9':
    133 ; SSE2: Found an estimated cost of 4 for instruction:   %shl
    134 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
    135 ; AVX: Found an estimated cost of 8 for instruction:   %shl
    136 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
    137 
    138 
    139 ; Same as 'test7', except that now the cost is double.
    140 
    141 define <16 x i32> @test10(<16 x i32> %a) {
          ; v16i32 shift by a non-uniform constant vector: the 4-element
          ; pattern <1, 1, 2, 3> repeated four times.
    142   %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
    143   ret <16 x i32> %shl
    144 }
    145 ; CHECK: 'Cost Model Analysis' for function 'test10':
    146 ; SSE2: Found an estimated cost of 24 for instruction:   %shl
    147 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
    148 ; AVX: Found an estimated cost of 8 for instruction:   %shl
    149 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
    150 
    151 
    152 ; On AVX2 we are able to lower the following shift into a sequence of
    153 ; two vpsllvq instructions. Therefore, the expected cost is only 2.
    154 ; In all other cases, this shift is scalarized as we don't have vpsllv
    155 ; instructions.
    156 
    157 define <8 x i64> @test11(<8 x i64> %a) {
          ; v8i64 shift by a non-uniform constant vector: the 4-element
          ; pattern <1, 1, 2, 3> repeated twice.
    158   %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
    159   ret <8 x i64> %shl
    160 }
    161 ; CHECK: 'Cost Model Analysis' for function 'test11':
    162 ; SSE2: Found an estimated cost of 80 for instruction:   %shl
    163 ; SSE41: Found an estimated cost of 80 for instruction:   %shl
    164 ; AVX: Found an estimated cost of 80 for instruction:   %shl
    165 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
    166 
    167 
    168