; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

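; Check f16 and <2 x half> fcmp codegen for each predicate. SI has no native
; f16 compare instructions, so the SI checks expect the operands to be
; extended with v_cvt_f32_f16 and compared as f32; VI is expected to use the
; native v_cmp_*_f16 instructions.
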
; GCN-LABEL: {{^}}fcmp_f16_lt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp olt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

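; The fabs of each operand is expected to be folded into the compare (VI) or
; into the f32 conversion (SI) as an |...| source modifier rather than being
; emitted as a separate instruction.
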
; GCN-LABEL: {{^}}fcmp_f16_lt_abs:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]

; SI:  v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
; SI:  v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|

; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|

; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lt_abs(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %a.abs = call half @llvm.fabs.f16(half %a.val)
  %b.abs = call half @llvm.fabs.f16(half %b.val)
  %r.val = fcmp olt half %a.abs, %b.abs
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_eq
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_eq(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp oeq half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_le
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_le(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ole half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_gt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_gt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ogt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_lg
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lg(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp one half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_ge
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_ge(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp oge half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_o
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_o(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ord half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_u
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_u(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp uno half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nge
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nge(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ult half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nlg
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nlg(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ueq half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_ngt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_ngt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ule half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nle
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nle(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ugt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_neq
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_neq(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp une half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nlt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nlt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp uge half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

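; The <2 x half> variants check that the compare is scalarized: two compares
; are expected per vector, as f32 on SI and as f16 on VI.
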
; GCN-LABEL: {{^}}fcmp_v2f16_lt:
; SI: v_cmp_lt_f32_e32 vcc,
; SI: v_cmp_lt_f32_e32 vcc,

; VI: v_cmp_lt_f16_e32 vcc,
; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp olt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_eq
; SI:  v_cmp_eq_f32_e32 vcc,
; SI:  v_cmp_eq_f32_e32 vcc,

; VI:  v_cmp_eq_f16_e32 vcc,
; VI:  v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp oeq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_le:
; SI:  v_cmp_le_f32_e32 vcc
; SI:  v_cmp_le_f32_e32 vcc
; VI:  v_cmp_le_f16_e32 vcc
; VI:  v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ole <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_gt:
; SI: v_cmp_gt_f32_e32 vcc,
; SI: v_cmp_gt_f32_e32 vcc,

; VI: v_cmp_gt_f16_e32 vcc,
; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ogt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_lg:
; SI: v_cmp_lg_f32_e32 vcc,
; SI: v_cmp_lg_f32_e32 vcc,

; VI: v_cmp_lg_f16_e32 vcc,
; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp one <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_ge:
; SI:  v_cmp_ge_f32_e32 vcc,
; SI:  v_cmp_ge_f32_e32 vcc,

; VI:  v_cmp_ge_f16_e32 vcc,
; VI:  v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp oge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_o:
; SI:  v_cmp_o_f32_e32 vcc,
; SI:  v_cmp_o_f32_e32 vcc,

; VI:  v_cmp_o_f16_e32 vcc,
; VI:  v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ord <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_u:
; SI:  v_cmp_u_f32_e32 vcc,
; SI:  v_cmp_u_f32_e32 vcc,

; VI:  v_cmp_u_f16_e32 vcc,
; VI:  v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp uno <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nge
; SI:  v_cmp_nge_f32_e32 vcc,
; SI:  v_cmp_nge_f32_e32 vcc,

; VI:  v_cmp_nge_f16_e32 vcc,
; VI:  v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ult <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nlg
; SI:  v_cmp_nlg_f32_e32 vcc
; SI:  v_cmp_nlg_f32_e32 vcc

; VI:  v_cmp_nlg_f16_e32 vcc
; VI:  v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ueq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_ngt
; SI:  v_cmp_ngt_f32_e32 vcc,
; SI:  v_cmp_ngt_f32_e32 vcc,

; VI:  v_cmp_ngt_f16_e32 vcc,
; VI:  v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ule <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nle
; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ugt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_neq
; SI:  v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI:  v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}

; VI:  v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI:  v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp une <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nlt
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG:  v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]

; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG:  v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
; VI-DAG:  v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]

; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_v2f16_nlt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp uge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

declare half @llvm.fabs.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }