; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2


; Verify the cost of vector shift left instructions.
;
; Each function header is matched with a LABEL directive so that the cost
; lines that follow it can only match output printed for that function.

; We always emit a single pmullw in the case of v8i16 vector shifts by
; non-uniform constant.

define <8 x i16> @test1(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test1':
; CHECK: Found an estimated cost of 1 for instruction:   %shl


; Same as 'test1', but the constant also contains undef lanes.
; NOTE(review): the lane with shift amount -1 is out of range for i16; it
; presumably only serves to keep the constant non-uniform - confirm.

define <8 x i16> @test2(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test2':
; CHECK: Found an estimated cost of 1 for instruction:   %shl


; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
; Make sure that the estimated cost is always 1 except for the case where
; we only have SSE2 support. With SSE2, we are forced to custom lower the
; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.
; NOTE(review): the lanes with shift amounts -1 and -3 are out of range for
; i32; they presumably only serve to keep the constant non-uniform - confirm.
define <4 x i32> @test3(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test3':
; SSE2: Found an estimated cost of 6 for instruction:   %shl
; SSE41: Found an estimated cost of 1 for instruction:   %shl
; AVX: Found an estimated cost of 1 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl


define <4 x i32> @test4(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test4':
; SSE2: Found an estimated cost of 6 for instruction:   %shl
; SSE41: Found an estimated cost of 1 for instruction:   %shl
; AVX: Found an estimated cost of 1 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl


; On AVX2 we are able to lower the following shift into a single
; vpsllvq. Therefore, the expected cost is only 1.
; In all other cases, this shift is scalarized as the target does not support
; vpsllv instructions.

define <2 x i64> @test5(<2 x i64> %a) {
  %shl = shl <2 x i64> %a, <i64 2, i64 3>
  ret <2 x i64> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test5':
; SSE2: Found an estimated cost of 20 for instruction:   %shl
; SSE41: Found an estimated cost of 20 for instruction:   %shl
; AVX: Found an estimated cost of 20 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl


; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
; vector multiply instructions. With AVX (but not AVX2), the vector multiply
; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
;
; With AVX2, instruction vpmullw works with 256bit quantities and
; therefore there is no need to split the resulting vector multiply into
; a sequence of two multiplies.
;
; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
; the cost computed in the case of 'test1'. That is because the backend
; simply emits 2 pmullw with no extract/insert.


define <16 x i16> @test6(<16 x i16> %a) {
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test6':
; SSE2: Found an estimated cost of 2 for instruction:   %shl
; SSE41: Found an estimated cost of 2 for instruction:   %shl
; AVX: Found an estimated cost of 4 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl


; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
; the cost computed in the case of 'test3'. That is because the multiply
; is type-legalized into two v4i32 vector multiplies.

define <8 x i32> @test7(<8 x i32> %a) {
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test7':
; SSE2: Found an estimated cost of 12 for instruction:   %shl
; SSE41: Found an estimated cost of 2 for instruction:   %shl
; AVX: Found an estimated cost of 4 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl


; On AVX2 we are able to lower the following shift into a single
; vpsllvq. Therefore, the expected cost is only 1.
; In all other cases, this shift is scalarized as the target does not support
; vpsllv instructions.
define <4 x i64> @test8(<4 x i64> %a) {
  %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test8':
; SSE2: Found an estimated cost of 40 for instruction:   %shl
; SSE41: Found an estimated cost of 40 for instruction:   %shl
; AVX: Found an estimated cost of 40 for instruction:   %shl
; AVX2: Found an estimated cost of 1 for instruction:   %shl


; Same as 'test6', with the difference that the cost is double.

define <32 x i16> @test9(<32 x i16> %a) {
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test9':
; SSE2: Found an estimated cost of 4 for instruction:   %shl
; SSE41: Found an estimated cost of 4 for instruction:   %shl
; AVX: Found an estimated cost of 8 for instruction:   %shl
; AVX2: Found an estimated cost of 2 for instruction:   %shl


; Same as 'test7', except that now the cost is double.

define <16 x i32> @test10(<16 x i32> %a) {
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test10':
; SSE2: Found an estimated cost of 24 for instruction:   %shl
; SSE41: Found an estimated cost of 4 for instruction:   %shl
; AVX: Found an estimated cost of 8 for instruction:   %shl
; AVX2: Found an estimated cost of 2 for instruction:   %shl


; On AVX2 we are able to lower the following shift into a sequence of
; two vpsllvq instructions. Therefore, the expected cost is only 2.
; In all other cases, this shift is scalarized as we don't have vpsllv
; instructions.

define <8 x i64> @test11(<8 x i64> %a) {
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
; CHECK-LABEL: 'Cost Model Analysis' for function 'test11':
; SSE2: Found an estimated cost of 80 for instruction:   %shl
; SSE41: Found an estimated cost of 80 for instruction:   %shl
; AVX: Found an estimated cost of 80 for instruction:   %shl
; AVX2: Found an estimated cost of 2 for instruction:   %shl