; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

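; Legalization of f16 'select': SI has no 16-bit VALU operations, so operands
; are promoted with v_cvt_f32_f16, compared and selected in f32, and the
; result truncated back with v_cvt_f16_f32. VI supports f16 natively and
; compares/selects the 16-bit values directly.
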
; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

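; 0xH3800 is 0.5 in IEEE half precision. The constant folds into the compare
; as an inline immediate on both subtargets (as f32 0.5 on SI, f16 0.5 on VI).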
; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG:  v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

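; With the constant on the RHS of the olt, the compare is commuted to 'gt'
; so 0.5 can still be the first (inline-constant) operand.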
; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI-DAG:  v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI:  v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

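; A constant in the true operand: the compare is inverted to 'nlt' so the
; constant can sit in cndmask's first source. On SI, f32 0.5 is an inline
; constant; on VI the raw f16 pattern 0x3800 is not an inline constant for
; the 32-bit cndmask, so it is materialized first with v_mov_b32.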
; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

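; A constant in the false operand needs no compare inversion: it already
; lands in cndmask's first source.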
; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_d(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

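; Neither subtarget has a packed f16 select, so the <2 x half> select is
; scalarized into two compare/cndmask pairs.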
; GCN-LABEL: {{^}}select_v2f16:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

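; Vector version of select_f16_imm_a, with per-lane constants 0.5 (0xH3800)
; and 0.625 (0xH3900); the compares may be commuted per lane.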
; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_gt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

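; Vector version of select_f16_imm_b, applied per lane; some compares are
; commuted to accommodate the constant operand.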
; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_gt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

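; Vector version of select_f16_imm_c: both lanes use the inverted 'nlt'
; compare so the constants feed cndmask's first source.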
; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_nlt_f32_e32
; SI-DAG: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; SI-DAG: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

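; Vector version of select_f16_imm_d: plain 'lt' compares, with the
; constants in the false operands.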
; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI:  v_cmp_lt_f32_e32
; SI: v_cndmask_b32
; SI:  v_cmp_lt_f32_e32
; SI: v_cndmask_b32

; VI:  v_cmp_lt_f16_e32
; VI: v_cndmask_b32
; VI:  v_cmp_lt_f16_e32
; VI: v_cndmask_b32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_d(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}