; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2

; Cost-model test for vector reductions on x86-64.
;
; Every function below reduces a vector to a scalar with a ladder of
; shufflevector + (f)add steps and then reads lane 0 with extractelement.
; The FileCheck directives pin the cost that the cost-model analysis reports
; for that final extractelement when -costmodel-reduxcost=true is given.
;
; Two shuffle shapes are exercised:
;   * "split" (the no_pairwise_* and reduction_cost_* functions): the upper
;     half of the live lanes is shuffled down and added to the lower half.
;   * "pairwise" (the pairwise_* functions): even-numbered lanes and
;     odd-numbered lanes are shuffled into two vectors that are then added.
;
; The default prefix runs with -mcpu=core2; the SSE3, AVX and AVX2 prefixes
; run with corei7, corei7-avx and core-avx2 respectively.
;
; NOTE(review): only the default prefix uses label directives. The checks for
; the other three prefixes are unanchored, so each one may match any later
; extractelement line in the output. Consider adding per-function labels
; after re-confirming the expected costs.

define fastcc float @reduction_cost_float(<4 x float> %rdx) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; Check that we recognize the tree starting at the extractelement as a
; reduction.
; CHECK-LABEL: reduction_cost_float
; CHECK: cost of 9 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Same split pattern on <8 x i32>, which needs three shuffle/add steps.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
   <8 x i32> <i32 4, i32 5, i32 6, i32 7,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
   <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
   <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3

; CHECK-LABEL: reduction_cost_int
; CHECK: cost of 17 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
  ret i32 %r
}

; Pairwise pattern: even lanes + odd lanes at every level. The extracted
; scalar is additionally added to %f1.
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; As pairwise_hadd, but the operands of the first fadd are swapped
; (odd lanes first, then even lanes).
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_assoc
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; As pairwise_hadd, but the last level uses %bin.rdx.0 directly as the
; left-hand operand instead of a lane-0 shuffle of it.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_skip_first
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; ---------------------------------------------------------------------------
; Split (non-pairwise) reductions, checked under the SSE3 / AVX / AVX2
; prefixes. 256-bit vector types are only checked under AVX and AVX2.
; ---------------------------------------------------------------------------

define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx, i32 0
  ret double %r
}

define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx, i32 0
  ret i64 %r
}

define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx8, i32 0
  ret i16 %r
}

define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; ---------------------------------------------------------------------------
; Pairwise reductions (even lanes + odd lanes at every level), checked under
; the SSE3 / AVX / AVX2 prefixes. 256-bit vector types are only checked under
; AVX and AVX2.
; ---------------------------------------------------------------------------

define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX: cost of 7 {{.*}} extractelement
; AVX2: cost of 7 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx9, i32 0
  ret float %r
}

define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

; SSE3: cost of 5 {{.*}} extractelement
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx9, i32 0
  ret i16 %r
}

define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx9, i32 0
  ret i32 %r
}