Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
      2 ; RUN: llc -march=amdgcn -mcpu=fiji  -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
      3 
      4 ; GCN-LABEL: {{^}}reduction_half4:
      5 ; GFX9:      v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
      6 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
      7 
      8 ; VI:      v_add_f16_sdwa
      9 ; VI-NEXT: v_add_f16_e32
     10 ; VI-NEXT: v_add_f16_e32
        ; Shuffle-based fast-math fadd reduction of a <4 x half> vector to a scalar.
        ; The gfx900 run expects one packed add followed by one SDWA add; the fiji
        ; run expects one SDWA add followed by two e32 adds.
     11 define half @reduction_half4(<4 x half> %vec4) {
     12 entry:
          ; Step 1: move lanes 2-3 down to lanes 0-1 and add them to the input.
     13   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
     14   %bin.rdx = fadd fast <4 x half> %vec4, %rdx.shuf
          ; Step 2: move lane 1 down to lane 0 and add again.
     15   %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     16   %bin.rdx2 = fadd fast <4 x half> %bin.rdx, %rdx.shuf1
          ; Only lane 0 is read; the remaining lanes come from undef shuffle indices.
     17   %res = extractelement <4 x half> %bin.rdx2, i32 0
     18   ret half %res
     19 }
     20 
     21 ; GCN-LABEL: {{^}}reduction_v4i16:
     22 ; GFX9:      v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     23 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
     24 
     25 ; VI:      v_add_u16_sdwa
     26 ; VI-NEXT: v_add_u16_e32
     27 ; VI-NEXT: v_add_u16_e32
        ; Shuffle-based integer add reduction of a <4 x i16> vector to a scalar.
        ; The gfx900 run expects one packed add followed by one SDWA add; the fiji
        ; run expects one SDWA add followed by two e32 adds.
     28 define i16 @reduction_v4i16(<4 x i16> %vec4) {
     29 entry:
          ; Step 1: move lanes 2-3 down to lanes 0-1 and add them to the input.
     30   %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
     31   %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
          ; Step 2: move lane 1 down to lane 0 and add again.
     32   %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     33   %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
          ; Only lane 0 is read as the result.
     34   %res = extractelement <4 x i16> %bin.rdx2, i32 0
     35   ret i16 %res
     36 }
     37 
     38 ; GCN-LABEL: {{^}}reduction_half8:
     39 ; GFX9:      v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     40 ; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     41 ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
     42 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
     43 
     44 ; VI:      v_add_f16_sdwa
     45 ; VI-NEXT: v_add_f16_sdwa
     46 ; VI-NEXT: v_add_f16_e32
     47 ; VI-NEXT: v_add_f16_e32
     48 ; VI-NEXT: v_add_f16_e32
     49 ; VI-NEXT: v_add_f16_e32
     50 ; VI-NEXT: v_add_f16_e32
     51 
        ; Shuffle-based fast-math fadd reduction of an <8 x half> vector to a scalar.
        ; The gfx900 run expects three packed adds followed by one SDWA add; the fiji
        ; run expects two SDWA adds followed by five e32 adds.
     52 define half @reduction_half8(<8 x half> %vec8) {
     53 entry:
          ; Step 1: fold lanes 4-7 onto lanes 0-3.
     54   %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
     55   %bin.rdx = fadd fast <8 x half> %vec8, %rdx.shuf
          ; Step 2: fold lanes 2-3 onto lanes 0-1.
     56   %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     57   %bin.rdx2 = fadd fast <8 x half> %bin.rdx, %rdx.shuf1
          ; Step 3: fold lane 1 onto lane 0.
     58   %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     59   %bin.rdx4 = fadd fast <8 x half> %bin.rdx2, %rdx.shuf3
          ; Only lane 0 is read as the result.
     60   %res = extractelement <8 x half> %bin.rdx4, i32 0
     61   ret half %res
     62 }
     63 
     64 ; GCN-LABEL: {{^}}reduction_v8i16:
     65 ; GFX9:      v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     66 ; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     67 ; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
     68 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
     69 
     70 ; VI:      v_add_u16_sdwa
     71 ; VI-NEXT: v_add_u16_sdwa
     72 ; VI-NEXT: v_add_u16_e32
     73 ; VI-NEXT: v_add_u16_e32
     74 ; VI-NEXT: v_add_u16_e32
     75 ; VI-NEXT: v_add_u16_e32
     76 ; VI-NEXT: v_add_u16_e32
     77 
        ; Shuffle-based integer add reduction of an <8 x i16> vector to a scalar.
        ; The gfx900 run expects three packed adds followed by one SDWA add; the fiji
        ; run expects two SDWA adds followed by five e32 adds.
        ; The destination registers of the packed adds are captured in this check
        ; group itself, so the expectations do not depend on registers matched in
        ; any earlier function of the file.
     78 define i16 @reduction_v8i16(<8 x i16> %vec8) {
     79 entry:
          ; Step 1: fold lanes 4-7 onto lanes 0-3.
     80   %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
     81   %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
          ; Step 2: fold lanes 2-3 onto lanes 0-1.
     82   %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     83   %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
          ; Step 3: fold lane 1 onto lane 0.
     84   %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     85   %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
          ; Only lane 0 is read as the result.
     86   %res = extractelement <8 x i16> %bin.rdx4, i32 0
     87   ret i16 %res
     88 }
     89 
     90 ; GCN-LABEL: {{^}}reduction_half16:
     91 ; GFX9:      v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     92 ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     93 ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     94 ; GFX9:      v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     95 ; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     96 ; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
     97 ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
     98 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
     99 
    100 ; VI:      v_add_f16_sdwa
    101 ; VI-NEXT: v_add_f16_sdwa
    102 ; VI-NEXT: v_add_f16_sdwa
    103 ; VI-NEXT: v_add_f16_sdwa
    104 ; VI-NEXT: v_add_f16_e32
    105 ; VI-NEXT: v_add_f16_e32
    106 ; VI-NEXT: v_add_f16_e32
    107 ; VI-NEXT: v_add_f16_e32
    108 ; VI-NEXT: v_add_f16_e32
    109 ; VI-NEXT: v_add_f16_e32
    110 ; VI-NEXT: v_add_f16_e32
    111 ; VI-NEXT: v_add_f16_e32
    112 ; VI-NEXT: v_add_f16_e32
    113 ; VI-NEXT: v_add_f16_e32
    114 ; VI-NEXT: v_add_f16_e32
    115 
        ; Shuffle-based fast-math fadd reduction of a <16 x half> vector to a scalar.
        ; The gfx900 run expects seven packed adds followed by one SDWA add; the fiji
        ; run expects four SDWA adds followed by eleven e32 adds.
        ; The destination registers of the last three packed adds are captured in
        ; this check group itself, so the expectations do not depend on registers
        ; matched in any earlier function of the file.
    116 define half @reduction_half16(<16 x half> %vec16) {
    117 entry:
          ; Step 1: fold lanes 8-15 onto lanes 0-7.
    118   %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    119   %bin.rdx = fadd fast <16 x half> %vec16, %rdx.shuf
          ; Step 2: fold lanes 4-7 onto lanes 0-3.
    120   %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    121   %bin.rdx2 = fadd fast <16 x half> %bin.rdx, %rdx.shuf1
          ; Step 3: fold lanes 2-3 onto lanes 0-1.
    122   %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    123   %bin.rdx4 = fadd fast <16 x half> %bin.rdx2, %rdx.shuf3
          ; Step 4: fold lane 1 onto lane 0.
    124   %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    125   %bin.rdx6 = fadd fast <16 x half> %bin.rdx4, %rdx.shuf5
          ; Only lane 0 is read as the result.
    126   %res = extractelement <16 x half> %bin.rdx6, i32 0
    127   ret half %res
    128 }
    129 
    130 ; GCN-LABEL: {{^}}reduction_min_v4i16:
    131 ; GFX9:      v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    132 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    133 
    134 ; VI:      v_min_u16_sdwa
    135 ; VI-NEXT: v_min_u16_e32
    136 ; VI-NEXT: v_min_u16_e32
        ; Shuffle-based unsigned-min reduction (icmp ult + select) of a <4 x i16>
        ; vector to a scalar. The gfx900 run expects one packed min followed by one
        ; SDWA min; the fiji run expects one SDWA min followed by two e32 mins.
        ; NOTE(review): the other unsigned-min tests are named reduction_umin_*;
        ; this name omits the 'u' - confirm whether that is intentional.
    137 define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
    138 entry:
          ; Step 1: compare lanes 0-1 against lanes 2-3 and keep the smaller value.
    139   %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    140   %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf
    141   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
          ; Step 2: compare lane 0 against lane 1 and keep the smaller value.
    142   %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    143   %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1
    144   %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
          ; Only lane 0 is read as the result.
    145   %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
    146   ret i16 %res
    147 }
    148 
    149 ; GCN-LABEL: {{^}}reduction_umin_v8i16:
    150 ; GFX9:      v_pk_min_u16 [[MIN1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    151 ; GFX9-NEXT: v_pk_min_u16 [[MIN2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    152 ; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
    153 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    154 
    155 ; VI:      v_min_u16_sdwa
    156 ; VI-NEXT: v_min_u16_sdwa
    157 ; VI-NEXT: v_min_u16_e32
    158 ; VI-NEXT: v_min_u16_e32
    159 ; VI-NEXT: v_min_u16_e32
    160 ; VI-NEXT: v_min_u16_e32
    161 ; VI-NEXT: v_min_u16_e32
        ; Shuffle-based unsigned-min reduction (icmp ult + select) of an <8 x i16>
        ; vector to a scalar. The gfx900 run expects three packed mins followed by
        ; one SDWA min; the fiji run expects two SDWA mins followed by five e32 mins.
    162 define i16 @reduction_umin_v8i16(<8 x i16> %vec8) {
    163 entry:
          ; Step 1: compare lanes 0-3 against lanes 4-7 and keep the smaller values.
    164   %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    165   %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf
    166   %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf
          ; Step 2: compare lanes 0-1 against lanes 2-3.
    167   %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    168   %rdx.minmax.cmp2 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf1
    169   %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1
          ; Step 3: compare lane 0 against lane 1.
    170   %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    171   %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4
    172   %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4
          ; Only lane 0 is read as the result.
    173   %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0
    174   ret i16 %res
    175 }
    176 
    177 ; Checks that, without SLP vectorization, more instructions are emitted than for reduction_umin_v8i16.
    178 ; GCN-LABEL: {{^}}reduction_umin_v8i16_woslp:
    179 ; GFX9:      v_lshrrev_b32_e32
    180 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    181 ; GFX9-NEXT: v_lshrrev_b32_e32
    182 ; GFX9-NEXT: v_min3_u16
    183 ; GFX9-NEXT: v_lshrrev_b32_e32
    184 ; GFX9-NEXT: v_min3_u16
    185 ; GFX9-NEXT: v_min3_u16
        ; Scalar form of the unsigned-min reduction over <8 x i16>: every lane is
        ; extracted and folded into a running minimum with an icmp ult + select
        ; chain. Only the gfx900 run has expectations for this function; they are a
        ; sequence of 32-bit right shifts, one SDWA min and three-operand mins.
    186 define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) {
    187 entry:
          ; Pull out all eight lanes as scalars.
    188   %elt0 = extractelement <8 x i16> %vec8, i64 0
    189   %elt1 = extractelement <8 x i16> %vec8, i64 1
    190   %elt2 = extractelement <8 x i16> %vec8, i64 2
    191   %elt3 = extractelement <8 x i16> %vec8, i64 3
    192   %elt4 = extractelement <8 x i16> %vec8, i64 4
    193   %elt5 = extractelement <8 x i16> %vec8, i64 5
    194   %elt6 = extractelement <8 x i16> %vec8, i64 6
    195   %elt7 = extractelement <8 x i16> %vec8, i64 7
    196 
          ; Linear chain: each step keeps the smaller of the next lane and the
          ; running minimum.
    197   %cmp0 = icmp ult i16 %elt1, %elt0
    198   %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
    199   %cmp1 = icmp ult i16 %elt2, %min1
    200   %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
    201   %cmp2 = icmp ult i16 %elt3, %min2
    202   %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
    203 
    204   %cmp3 = icmp ult i16 %elt4, %min3
    205   %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
    206   %cmp4 = icmp ult i16 %elt5, %min4
    207   %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
    208 
    209   %cmp5 = icmp ult i16 %elt6, %min5
    210   %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
    211   %cmp6 = icmp ult i16 %elt7, %min6
    212   %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
    213 
    214   ret i16 %min7
    215 }
    216 
    217 ; GCN-LABEL: {{^}}reduction_smin_v16i16:
    218 ; GFX9:        v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    219 ; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    220 ; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    221 ; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    222 ; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    223 ; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    224 ; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    225 ; GFX9-NEXT:   v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    226 
    227 ; VI:      v_min_i16_sdwa
    228 ; VI-NEXT: v_min_i16_sdwa
    229 ; VI-NEXT: v_min_i16_sdwa
    230 ; VI-NEXT: v_min_i16_sdwa
    231 ; VI-NEXT: v_min_i16_e32
    232 ; VI-NEXT: v_min_i16_e32
    233 ; VI-NEXT: v_min_i16_e32
    234 ; VI-NEXT: v_min_i16_e32
    235 ; VI-NEXT: v_min_i16_e32
    236 ; VI-NEXT: v_min_i16_e32
    237 ; VI-NEXT: v_min_i16_e32
    238 ; VI-NEXT: v_min_i16_e32
    239 ; VI-NEXT: v_min_i16_e32
    240 ; VI-NEXT: v_min_i16_e32
    241 ; VI-NEXT: v_min_i16_e32
        ; Shuffle-based signed-min reduction (icmp slt + select) of a <16 x i16>
        ; vector to a scalar. The gfx900 run expects seven packed mins followed by
        ; one SDWA min; the fiji run expects four SDWA mins followed by eleven e32
        ; mins.
    242 define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
    243 entry:
          ; Step 1: compare lanes 0-7 against lanes 8-15 and keep the smaller values.
    244   %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    245   %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf
    246   %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf
          ; Step 2: compare lanes 0-3 against lanes 4-7.
    247   %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    248   %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1
    249   %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1
          ; Step 3: compare lanes 0-1 against lanes 2-3.
    250   %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    251   %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4
    252   %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4
          ; Step 4: compare lane 0 against lane 1.
    253   %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    254   %rdx.minmax.cmp8 = icmp slt <16 x i16> %rdx.minmax.select6, %rdx.shuf7
    255   %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7
          ; Only lane 0 is read as the result.
    256   %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0
    257   ret i16 %res
    258 }
    259 
    260 ; Checks that, without SLP vectorization, more instructions are emitted than for reduction_smin_v16i16.
    261 ; GCN-LABEL: {{^}}reduction_smin_v16i16_woslp:
    262 ; GFX9:      v_lshrrev_b32_e32
    263 ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    264 ; GFX9-NEXT: v_lshrrev_b32_e32
    265 ; GFX9-NEXT: v_min3_i16
    266 ; GFX9-NEXT: v_lshrrev_b32_e32
    267 ; GFX9-NEXT: v_min3_i16
    268 ; GFX9-NEXT: v_lshrrev_b32_e32
    269 ; GFX9-NEXT: v_min3_i16
    270 ; GFX9-NEXT: v_lshrrev_b32_e32
    271 ; GFX9-NEXT: v_min3_i16
    272 ; GFX9-NEXT: v_lshrrev_b32_e32
    273 ; GFX9-NEXT: v_min3_i16
    274 ; GFX9-NEXT: v_lshrrev_b32_e32
    275 ; GFX9-NEXT: v_min3_i16
    276 ; GFX9-NEXT: v_min3_i16
        ; Scalar form of the signed-min reduction over <16 x i16>: every lane is
        ; extracted and folded into a running minimum with an icmp slt + select
        ; chain. Only the gfx900 run has expectations for this function; they are a
        ; sequence of 32-bit right shifts, one SDWA min and three-operand mins.
    277 define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) {
    278 entry:
          ; Pull out all sixteen lanes as scalars.
    279   %elt0 = extractelement <16 x i16> %vec16, i64 0
    280   %elt1 = extractelement <16 x i16> %vec16, i64 1
    281   %elt2 = extractelement <16 x i16> %vec16, i64 2
    282   %elt3 = extractelement <16 x i16> %vec16, i64 3
    283   %elt4 = extractelement <16 x i16> %vec16, i64 4
    284   %elt5 = extractelement <16 x i16> %vec16, i64 5
    285   %elt6 = extractelement <16 x i16> %vec16, i64 6
    286   %elt7 = extractelement <16 x i16> %vec16, i64 7
    287 
    288   %elt8 = extractelement <16 x i16> %vec16, i64 8
    289   %elt9 = extractelement <16 x i16> %vec16, i64 9
    290   %elt10 = extractelement <16 x i16> %vec16, i64 10
    291   %elt11 = extractelement <16 x i16> %vec16, i64 11
    292   %elt12 = extractelement <16 x i16> %vec16, i64 12
    293   %elt13 = extractelement <16 x i16> %vec16, i64 13
    294   %elt14 = extractelement <16 x i16> %vec16, i64 14
    295   %elt15 = extractelement <16 x i16> %vec16, i64 15
    296 
          ; Linear chain: each step keeps the smaller (signed) of the next lane and
          ; the running minimum.
    297   %cmp0 = icmp slt i16 %elt1, %elt0
    298   %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
    299   %cmp1 = icmp slt i16 %elt2, %min1
    300   %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
    301   %cmp2 = icmp slt i16 %elt3, %min2
    302   %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
    303 
    304   %cmp3 = icmp slt i16 %elt4, %min3
    305   %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
    306   %cmp4 = icmp slt i16 %elt5, %min4
    307   %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
    308 
    309   %cmp5 = icmp slt i16 %elt6, %min5
    310   %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
    311   %cmp6 = icmp slt i16 %elt7, %min6
    312   %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
    313 
    314   %cmp7 = icmp slt i16 %elt8, %min7
    315   %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
    316   %cmp8 = icmp slt i16 %elt9, %min8
    317   %min9 = select i1 %cmp8, i16 %elt9, i16 %min8
    318 
    319   %cmp9 = icmp slt i16 %elt10, %min9
    320   %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
    321   %cmp10 = icmp slt i16 %elt11, %min10
    322   %min11 = select i1 %cmp10, i16 %elt11, i16 %min10
    323 
    324   %cmp11 = icmp slt i16 %elt12, %min11
    325   %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
    326   %cmp12 = icmp slt i16 %elt13, %min12
    327   %min13 = select i1 %cmp12, i16 %elt13, i16 %min12
    328 
    329   %cmp13 = icmp slt i16 %elt14, %min13
    330   %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
    331   %cmp14 = icmp slt i16 %elt15, %min14
    332   %min15 = select i1 %cmp14, i16 %elt15, i16 %min14
    333 
    334 
    335   ret i16 %min15
    336 }
    337 
    338 ; GCN-LABEL: {{^}}reduction_umax_v4i16:
    339 ; GFX9:      v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    340 ; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    341 
    342 ; VI:      v_max_u16_sdwa
    343 ; VI-NEXT: v_max_u16_e32
    344 ; VI-NEXT: v_max_u16_e32
        ; Shuffle-based unsigned-max reduction (icmp ugt + select) of a <4 x i16>
        ; vector to a scalar. The gfx900 run expects one packed max followed by one
        ; SDWA max; the fiji run expects one SDWA max followed by two e32 maxes.
    345 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
    346 entry:
          ; Step 1: compare lanes 0-1 against lanes 2-3 and keep the larger value.
    347   %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    348   %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf
    349   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
          ; Step 2: compare lane 0 against lane 1 and keep the larger value.
    350   %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    351   %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1
    352   %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
          ; Only lane 0 is read as the result.
    353   %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
    354   ret i16 %res
    355 }
    356 
    357 ; GCN-LABEL: {{^}}reduction_smax_v4i16:
    358 ; GFX9:      v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    359 ; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    360 
    361 ; VI:      v_max_i16_sdwa
    362 ; VI-NEXT: v_max_i16_e32
    363 ; VI-NEXT: v_max_i16_e32
        ; Shuffle-based signed-max reduction (icmp sgt + select) of a <4 x i16>
        ; vector to a scalar. The gfx900 run expects one packed max followed by one
        ; SDWA max; the fiji run expects one SDWA max followed by two e32 maxes.
        ; NOTE(review): this is the only function here that references attribute
        ; group #0, and no definition of that group is visible in this listing -
        ; confirm it is defined elsewhere in the file or is meant to be there.
    364 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
    365 entry:
          ; Step 1: compare lanes 0-1 against lanes 2-3 and keep the larger value.
    366   %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    367   %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf
    368   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
          ; Step 2: compare lane 0 against lane 1 and keep the larger value.
    369   %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    370   %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1
    371   %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
          ; Only lane 0 is read as the result.
    372   %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
    373   ret i16 %res
    374 }
    375 
    376 ; GCN-LABEL: {{^}}reduction_fmax_v4half:
    377 ; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    378 ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    379 
    380 ; VI:      v_max_f16_sdwa
    381 ; VI-NEXT: v_max_f16_e32
    382 ; VI-NEXT: v_max_f16_e32
        ; Shuffle-based floating-point max reduction (fcmp fast ogt + select) of a
        ; <4 x half> vector to a scalar. The gfx900 run expects one packed max
        ; followed by one SDWA max; the fiji run expects one SDWA max followed by
        ; two e32 maxes.
    383 define half @reduction_fmax_v4half(<4 x half> %vec4) {
    384 entry:
          ; Step 1: compare lanes 0-1 against lanes 2-3 and keep the larger value.
    385   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    386   %rdx.minmax.cmp = fcmp fast ogt <4 x half> %vec4, %rdx.shuf
    387   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
          ; Step 2: compare lane 0 against lane 1 and keep the larger value.
    388   %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    389   %rdx.minmax.cmp2 = fcmp fast ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
    390   %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
          ; Only lane 0 is read as the result.
    391   %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
    392   ret half %res
    393 }
    394 
    395 ; GCN-LABEL: {{^}}reduction_fmin_v4half:
    396 ; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
    397 ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
    398 
    399 ; VI:      v_min_f16_sdwa
    400 ; VI-NEXT: v_min_f16_e32
    401 ; VI-NEXT: v_min_f16_e32
        ; Shuffle-based floating-point min reduction (fcmp fast olt + select) of a
        ; <4 x half> vector to a scalar. The gfx900 run expects one packed min
        ; followed by one SDWA min; the fiji run expects one SDWA min followed by
        ; two e32 mins.
    402 define half @reduction_fmin_v4half(<4 x half> %vec4) {
    403 entry:
          ; Step 1: compare lanes 0-1 against lanes 2-3 and keep the smaller value.
    404   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    405   %rdx.minmax.cmp = fcmp fast olt <4 x half> %vec4, %rdx.shuf
    406   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
          ; Step 2: compare lane 0 against lane 1 and keep the smaller value.
    407   %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    408   %rdx.minmax.cmp2 = fcmp fast olt <4 x half> %rdx.minmax.select, %rdx.shuf1
    409   %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
          ; Only lane 0 is read as the result.
    410   %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
    411   ret half %res
    412 }
    413