Home | History | Annotate | Download | only in X86
      1 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
      2 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
      3 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
      4 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2
      5 
      6 define fastcc float @reduction_cost_float(<4 x float> %rdx) {
        ; Two-step fadd reduction of <4 x float>. Lanes 2,3 are shuffled onto
        ; lanes 0,1 and added to the input; lane 1 of that sum is then shuffled
        ; onto lane 0 and added. Lane 0 of the final vector is returned.
      7   %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
      8   %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
      9   %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     10   %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
     11 
     12 ; Check that we recognize the tree starting at the extractelement as a
     13 ; reduction.
        ; Only the default-prefix run (-mcpu=core2) checks this function; the
        ; expected cost is reported on the extractelement below.
     14 ; CHECK-LABEL: reduction_cost
     15 ; CHECK:  cost of 9 {{.*}} extractelement
     16 
     17   %r = extractelement <4 x float> %bin.rdx8, i32 0
     18   ret float %r
     19 }
     20 
     21 define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
        ; Three-step integer add reduction of <8 x i32>. Each step shuffles the
        ; upper part of the live lanes down (4-7, then 2-3, then 1) and adds it
        ; to the unshuffled vector. Lane 0 of the final vector is returned.
     22   %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
     23    <8 x i32> <i32 4    , i32     5, i32     6, i32     7,
     24               i32 undef, i32 undef, i32 undef, i32 undef>
     25   %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
     26   %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
     27    <8 x i32> <i32 2    , i32 3,     i32 undef, i32 undef,
     28               i32 undef, i32 undef, i32 undef, i32 undef>
     29   %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
     30   %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
     31    <8 x i32> <i32 1    , i32 undef, i32 undef, i32 undef,
     32               i32 undef, i32 undef, i32 undef, i32 undef>
     33   %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
     34 
        ; Only the default-prefix run (-mcpu=core2) checks this function.
     35 ; CHECK-LABEL: reduction_cost_int
     36 ; CHECK:  cost of 17 {{.*}} extractelement
     37 
     38   %r = extractelement <8 x i32> %bin.rdx.3, i32 0
     39   ret i32 %r
     40 }
     41 
     42 define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
        ; Two-level fadd reduction of <4 x float> that adds adjacent lanes.
        ; Level 0 gathers lanes (0,2) and (1,3) into two vectors and adds them;
        ; level 1 gathers lane 0 and lane 1 of that sum and adds them.
        ; The scalar result is then added to %f1.
     43   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
     44         <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
     45   %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
     46         <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
     47   %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
     48   %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
     49         <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
     50   %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
     51         <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     52   %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
     53 
        ; Only the default-prefix run (-mcpu=core2) checks this function.
     54 ; CHECK-LABEL: pairwise_hadd
     55 ; CHECK: cost of 11 {{.*}} extractelement
     56 
     57   %r = extractelement <4 x float> %bin.rdx.1, i32 0
     58   %r2 = fadd float %r, %f1
     59   ret float %r2
     60 }
     61 
     62 define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
        ; Same shape as @pairwise_hadd, except that the first fadd takes its
        ; operands in the opposite order: the (1,3) shuffle is the left operand
        ; and the (0,2) shuffle is the right operand. The expected cost is the
        ; same as for @pairwise_hadd.
     63   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
     64         <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
     65   %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
     66         <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
        ; Operands intentionally swapped relative to @pairwise_hadd.
     67   %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
     68   %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
     69         <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
     70   %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
     71         <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     72   %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
     73 
        ; Only the default-prefix run (-mcpu=core2) checks this function.
     74 ; CHECK-LABEL: pairwise_hadd_assoc
     75 ; CHECK: cost of 11 {{.*}} extractelement
     76 
     77   %r = extractelement <4 x float> %bin.rdx.1, i32 0
     78   %r2 = fadd float %r, %f1
     79   ret float %r2
     80 }
     81 
     82 define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
        ; Variant of @pairwise_hadd in which the last level has no shuffle that
        ; selects lane 0: the level-0 sum %bin.rdx.0 is used directly as the left
        ; operand of the final fadd. The expected cost is the same as for
        ; @pairwise_hadd.
     83   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
     84         <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
     85   %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
     86         <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
     87   %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
     88   %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
     89         <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
        ; Left operand is the unshuffled level-0 result (lane 0 already in place).
     90   %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
     91 
        ; Only the default-prefix run (-mcpu=core2) checks this function.
     92 ; CHECK-LABEL: pairwise_hadd_skip_first
     93 ; CHECK: cost of 11 {{.*}} extractelement
     94 
     95   %r = extractelement <4 x float> %bin.rdx.1, i32 0
     96   %r2 = fadd float %r, %f1
     97   ret float %r2
     98 }
     99 
    100 define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
        ; One-step fadd reduction of <2 x double>: lane 1 of %rdx is shuffled onto
        ; lane 0 and added to the unshuffled %rdx; lane 0 of the sum is returned.
        ; %f1 is not used by the body.
    101   %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
    102   %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
    103 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; SSE3-LABEL: 'no_pairwise_reduction2double'
        ; AVX-LABEL: 'no_pairwise_reduction2double'
        ; AVX2-LABEL: 'no_pairwise_reduction2double'
    104 ; SSE3:  cost of 2 {{.*}} extractelement
    105 ; AVX:  cost of 2 {{.*}} extractelement
    106 ; AVX2:  cost of 2 {{.*}} extractelement
    107 
    108   %r = extractelement <2 x double> %bin.rdx, i32 0
    109   ret double %r
    110 }
    111 
    112 define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
        ; Two-step fadd reduction of <4 x float>: lanes 2,3 are shuffled onto lanes
        ; 0,1 and added to the input, then lane 1 of that sum is shuffled onto
        ; lane 0 and added. Lane 0 is returned. %f1 is not used by the body.
    113   %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    114   %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
    115   %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    116   %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
    117 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; SSE3-LABEL: 'no_pairwise_reduction4float'
        ; AVX-LABEL: 'no_pairwise_reduction4float'
        ; AVX2-LABEL: 'no_pairwise_reduction4float'
    118 ; SSE3:  cost of 4 {{.*}} extractelement
    119 ; AVX:  cost of 3 {{.*}} extractelement
    120 ; AVX2:  cost of 3 {{.*}} extractelement
    121 
    122   %r = extractelement <4 x float> %bin.rdx8, i32 0
    123   ret float %r
    124 }
    125 
    126 define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
        ; Two-step fadd reduction of <4 x double>: lanes 2,3 are shuffled onto
        ; lanes 0,1 and added to the input, then lane 1 of that sum is shuffled
        ; onto lane 0 and added. Lane 0 is returned. %f1 is not used by the body.
        ; Only the two AVX-family prefixes carry an expectation for this type.
    127   %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    128   %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
    129   %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    130   %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
    131 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; AVX-LABEL: 'no_pairwise_reduction4double'
        ; AVX2-LABEL: 'no_pairwise_reduction4double'
    132 ; AVX:  cost of 3 {{.*}} extractelement
    133 ; AVX2:  cost of 3 {{.*}} extractelement
    134 
    135   %r = extractelement <4 x double> %bin.rdx8, i32 0
    136   ret double %r
    137 }
    138 
    139 define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
        ; Three-step fadd reduction of <8 x float>: lanes 4-7 are shuffled down and
        ; added, then lanes 2,3, then lane 1, each time adding the shuffle to the
        ; unshuffled vector. Lane 0 is returned. %f1 is not used by the body.
        ; Only the two AVX-family prefixes carry an expectation for this type.
    140   %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
    141   %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
    142   %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    143   %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
    144   %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    145   %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
    146 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; AVX-LABEL: 'no_pairwise_reduction8float'
        ; AVX2-LABEL: 'no_pairwise_reduction8float'
    147 ; AVX:  cost of 4 {{.*}} extractelement
    148 ; AVX2:  cost of 4 {{.*}} extractelement
    149 
    150   %r = extractelement <8 x float> %bin.rdx8, i32 0
    151   ret float %r
    152 }
    153 
    154 define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
        ; One-step integer add reduction of <2 x i64>: lane 1 of %rdx is shuffled
        ; onto lane 0 and added to the unshuffled %rdx; lane 0 of the sum is
        ; returned. %f1 is not used by the body.
    155   %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    156   %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
    157 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; SSE3-LABEL: 'no_pairwise_reduction2i64'
        ; AVX-LABEL: 'no_pairwise_reduction2i64'
        ; AVX2-LABEL: 'no_pairwise_reduction2i64'
    158 ; SSE3:  cost of 2 {{.*}} extractelement
    159 ; AVX:  cost of 1 {{.*}} extractelement
    160 ; AVX2:  cost of 1 {{.*}} extractelement
    161 
    162   %r = extractelement <2 x i64> %bin.rdx, i32 0
    163   ret i64 %r
    164 }
    165 
    166 define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
        ; Two-step integer add reduction of <4 x i32>: lanes 2,3 are shuffled onto
        ; lanes 0,1 and added to the input, then lane 1 of that sum is shuffled
        ; onto lane 0 and added. Lane 0 is returned. %f1 is not used by the body.
    167   %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    168   %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
    169   %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    170   %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
    171 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; SSE3-LABEL: 'no_pairwise_reduction4i32'
        ; AVX-LABEL: 'no_pairwise_reduction4i32'
        ; AVX2-LABEL: 'no_pairwise_reduction4i32'
    172 ; SSE3:  cost of 3 {{.*}} extractelement
    173 ; AVX:  cost of 3 {{.*}} extractelement
    174 ; AVX2:  cost of 3 {{.*}} extractelement
    175 
    176   %r = extractelement <4 x i32> %bin.rdx8, i32 0
    177   ret i32 %r
    178 }
    179 
    180 define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
        ; Two-step integer add reduction of <4 x i64>: lanes 2,3 are shuffled onto
        ; lanes 0,1 and added to the input, then lane 1 of that sum is shuffled
        ; onto lane 0 and added. Lane 0 is returned. %f1 is not used by the body.
        ; Only the two AVX-family prefixes carry an expectation for this type.
    181   %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    182   %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
    183   %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    184   %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
    185 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; AVX-LABEL: 'no_pairwise_reduction4i64'
        ; AVX2-LABEL: 'no_pairwise_reduction4i64'
    186 ; AVX:  cost of 3 {{.*}} extractelement
    187 ; AVX2:  cost of 3 {{.*}} extractelement
    188 
    189   %r = extractelement <4 x i64> %bin.rdx8, i32 0
    190   ret i64 %r
    191 }
    192 
    193 define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
        ; Three-step integer add reduction of <8 x i16>: lanes 4-7 are shuffled
        ; down and added, then lanes 2,3, then lane 1, each time adding the shuffle
        ; to the unshuffled vector. Lane 0 is returned. %f1 is not used by the body.
    194   %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
    195   %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
    196   %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    197   %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
    198   %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    199   %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
    200 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; SSE3-LABEL: 'no_pairwise_reduction8i16'
        ; AVX-LABEL: 'no_pairwise_reduction8i16'
        ; AVX2-LABEL: 'no_pairwise_reduction8i16'
    201 ; SSE3:  cost of 4 {{.*}} extractelement
    202 ; AVX:  cost of 4 {{.*}} extractelement
    203 ; AVX2:  cost of 4 {{.*}} extractelement
    204 
    205   %r = extractelement <8 x i16> %bin.rdx8, i32 0
    206   ret i16 %r
    207 }
    208 
    209 define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
        ; Three-step integer add reduction of <8 x i32>: lanes 4-7 are shuffled
        ; down and added, then lanes 2,3, then lane 1, each time adding the shuffle
        ; to the unshuffled vector. Lane 0 is returned. %f1 is not used by the body.
        ; Only the two AVX-family prefixes carry an expectation for this type.
    210   %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
    211   %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
    212   %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    213   %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
    214   %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    215   %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
    216 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header, and keeps it from matching inside a longer name.
        ; AVX-LABEL: 'no_pairwise_reduction8i32'
        ; AVX2-LABEL: 'no_pairwise_reduction8i32'
    217 ; AVX:  cost of 5 {{.*}} extractelement
    218 ; AVX2:  cost of 5 {{.*}} extractelement
    219 
    220   %r = extractelement <8 x i32> %bin.rdx8, i32 0
    221   ret i32 %r
    222 }
    223 
    224 define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
        ; One-level adjacent-lane fadd reduction of <2 x double>: lane 0 and lane 1
        ; of %rdx are each shuffled into lane 0 of their own vector and the two
        ; vectors are added. Lane 0 is returned. %f1 is not used by the body.
    225   %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
    226   %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
    227   %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
    228 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; SSE3-LABEL: 'pairwise_reduction2double'
        ; AVX-LABEL: 'pairwise_reduction2double'
        ; AVX2-LABEL: 'pairwise_reduction2double'
    229 ; SSE3:  cost of 2 {{.*}} extractelement
    230 ; AVX:  cost of 2 {{.*}} extractelement
    231 ; AVX2:  cost of 2 {{.*}} extractelement
    232 
    233   %r = extractelement <2 x double> %bin.rdx8, i32 0
    234   ret double %r
    235 }
    236 
    237 define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
        ; Two-level adjacent-lane fadd reduction of <4 x float>: lanes (0,2) and
        ; (1,3) are gathered into two vectors and added, then lane 0 and lane 1 of
        ; that sum are gathered and added. Lane 0 is returned. %f1 is not used by
        ; the body.
    238   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    239   %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    240   %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
    241   %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    242   %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    243   %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
    244 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; SSE3-LABEL: 'pairwise_reduction4float'
        ; AVX-LABEL: 'pairwise_reduction4float'
        ; AVX2-LABEL: 'pairwise_reduction4float'
    245 ; SSE3:  cost of 4 {{.*}} extractelement
    246 ; AVX:  cost of 4 {{.*}} extractelement
    247 ; AVX2:  cost of 4 {{.*}} extractelement
    248 
    249   %r = extractelement <4 x float> %bin.rdx8, i32 0
    250   ret float %r
    251 }
    252 
    253 define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
        ; Two-level adjacent-lane fadd reduction of <4 x double>: lanes (0,2) and
        ; (1,3) are gathered into two vectors and added, then lane 0 and lane 1 of
        ; that sum are gathered and added. Lane 0 is returned. %f1 is not used by
        ; the body. Only the two AVX-family prefixes carry an expectation here.
    254   %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    255   %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    256   %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
    257   %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    258   %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    259   %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
    260 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; AVX-LABEL: 'pairwise_reduction4double'
        ; AVX2-LABEL: 'pairwise_reduction4double'
    261 ; AVX:  cost of 5 {{.*}} extractelement
    262 ; AVX2:  cost of 5 {{.*}} extractelement
    263 
    264   %r = extractelement <4 x double> %bin.rdx8, i32 0
    265   ret double %r
    266 }
    267 
    268 define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
        ; Three-level adjacent-lane fadd reduction of <8 x float>. Each level
        ; gathers the even and the odd live lanes into two vectors and adds them:
        ; (0,2,4,6)+(1,3,5,7), then (0,2)+(1,3), then (0)+(1). Lane 0 is returned.
        ; %f1 is not used by the body. Only the two AVX-family prefixes carry an
        ; expectation here.
    269   %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
    270   %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
    271   %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
    272   %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    273   %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    274   %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
    275   %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    276   %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    277   %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
    278 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; AVX-LABEL: 'pairwise_reduction8float'
        ; AVX2-LABEL: 'pairwise_reduction8float'
    279 ; AVX:  cost of 7 {{.*}} extractelement
    280 ; AVX2:  cost of 7 {{.*}} extractelement
    281 
    282   %r = extractelement <8 x float> %bin.rdx9, i32 0
    283   ret float %r
    284 }
    285 
    286 define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
        ; One-level adjacent-lane integer add reduction of <2 x i64>: lane 0 and
        ; lane 1 of %rdx are each shuffled into lane 0 of their own vector and the
        ; two vectors are added. Lane 0 is returned. %f1 is not used by the body.
    287   %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
    288   %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    289   %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
    290 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; SSE3-LABEL: 'pairwise_reduction2i64'
        ; AVX-LABEL: 'pairwise_reduction2i64'
        ; AVX2-LABEL: 'pairwise_reduction2i64'
    291 ; SSE3:  cost of 2 {{.*}} extractelement
    292 ; AVX:  cost of 1 {{.*}} extractelement
    293 ; AVX2:  cost of 1 {{.*}} extractelement
    294 
    295   %r = extractelement <2 x i64> %bin.rdx8, i32 0
    296   ret i64 %r
    297 }
    298 
    299 define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
        ; Two-level adjacent-lane integer add reduction of <4 x i32>: lanes (0,2)
        ; and (1,3) are gathered into two vectors and added, then lane 0 and lane 1
        ; of that sum are gathered and added. Lane 0 is returned. %f1 is not used
        ; by the body.
    300   %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    301   %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    302   %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
    303   %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    304   %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    305   %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
    306 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; SSE3-LABEL: 'pairwise_reduction4i32'
        ; AVX-LABEL: 'pairwise_reduction4i32'
        ; AVX2-LABEL: 'pairwise_reduction4i32'
    307 ; SSE3:  cost of 3 {{.*}} extractelement
    308 ; AVX:  cost of 3 {{.*}} extractelement
    309 ; AVX2:  cost of 3 {{.*}} extractelement
    310 
    311   %r = extractelement <4 x i32> %bin.rdx8, i32 0
    312   ret i32 %r
    313 }
    314 
    315 define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
        ; Two-level adjacent-lane integer add reduction of <4 x i64>: lanes (0,2)
        ; and (1,3) are gathered into two vectors and added, then lane 0 and lane 1
        ; of that sum are gathered and added. Lane 0 is returned. %f1 is not used
        ; by the body. Only the two AVX-family prefixes carry an expectation here.
    316   %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    317   %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    318   %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
    319   %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    320   %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    321   %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
    322 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; AVX-LABEL: 'pairwise_reduction4i64'
        ; AVX2-LABEL: 'pairwise_reduction4i64'
    323 ; AVX:  cost of 5 {{.*}} extractelement
    324 ; AVX2:  cost of 5 {{.*}} extractelement
    325 
    326   %r = extractelement <4 x i64> %bin.rdx8, i32 0
    327   ret i64 %r
    328 }
    329 
    330 define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
        ; Three-level adjacent-lane integer add reduction of <8 x i16>. Each level
        ; gathers the even and the odd live lanes into two vectors and adds them:
        ; (0,2,4,6)+(1,3,5,7), then (0,2)+(1,3), then (0)+(1). Lane 0 is returned.
        ; %f1 is not used by the body.
    331   %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
    332   %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
    333   %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
    334   %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    335   %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    336   %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
    337   %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    338   %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    339   %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
    340 
        ; Anchor every prefix to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; SSE3-LABEL: 'pairwise_reduction8i16'
        ; AVX-LABEL: 'pairwise_reduction8i16'
        ; AVX2-LABEL: 'pairwise_reduction8i16'
    341 ; SSE3:  cost of 5 {{.*}} extractelement
    342 ; AVX:  cost of 5 {{.*}} extractelement
    343 ; AVX2:  cost of 5 {{.*}} extractelement
    344 
    345   %r = extractelement <8 x i16> %bin.rdx9, i32 0
    346   ret i16 %r
    347 }
    348 
    349 define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
        ; Three-level adjacent-lane integer add reduction of <8 x i32>. Each level
        ; gathers the even and the odd live lanes into two vectors and adds them:
        ; (0,2,4,6)+(1,3,5,7), then (0,2)+(1,3), then (0)+(1). Lane 0 is returned.
        ; %f1 is not used by the body. Only the two AVX-family prefixes carry an
        ; expectation here.
    350   %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
    351   %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
    352   %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
    353   %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    354   %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    355   %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
    356   %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    357   %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    358   %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
    359 
        ; Anchor both prefixes to this function's section of the analysis output so
        ; the cost below cannot be satisfied by another function's extractelement.
        ; The quoted name assumes opt prints the function name in quotes in each
        ; section header; the leading quote keeps the pattern from matching
        ; inside the no_pairwise_* function of the same type.
        ; AVX-LABEL: 'pairwise_reduction8i32'
        ; AVX2-LABEL: 'pairwise_reduction8i32'
    360 ; AVX:  cost of 5 {{.*}} extractelement
    361 ; AVX2:  cost of 5 {{.*}} extractelement
    362 
    363   %r = extractelement <8 x i32> %bin.rdx9, i32 0
    364   ret i32 %r
    365 }
    366