Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
      2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
      3 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
      4 
      5 ; Test expansion of scalar selects on vectors.
      6 ; Evergreen not enabled since it seems to be having problems with doubles.
      7 
      8 ; GCN-LABEL: {{^}}v_select_v2i8:
      9 ; SI: v_cndmask_b32
     10 ; SI-NOT: cndmask
     11 
     12 ; GFX9: v_cndmask_b32
     13 ; GFX9-NOT: cndmask
     14 
     15 ; This is worse when i16 is legal and packed is not because
     16 ; SelectionDAGBuilder for some reason changes the select type.
     17 ; VI: v_cndmask_b32
     18 ; VI: v_cndmask_b32
     19 define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Kernel under test: chooses one whole <2 x i8> vector using a single scalar
          ; i1 condition (not a per-lane mask) and writes the chosen vector to %out.
          ; Both candidates are loaded through addrspace(1) pointers with 2-byte alignment.
     20   %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
     21   %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
          ; The condition is derived from kernel argument %c: true when %c == 0.
     22   %cmp = icmp eq i32 %c, 0
          ; Yields %a when %cmp is true, otherwise %b.
     23   %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
     24   store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
     25   ret void
     26 }
     27 
     28 ; GCN-LABEL: {{^}}v_select_v4i8:
     29 ; GCN: v_cndmask_b32_e32
     30 ; GCN-NOT: cndmask
     31 define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Scalar-condition select between two <4 x i8> vectors (32 bits each) that are
          ; loaded through addrspace(1) pointers; the result is stored to %out.
     32   %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
     33   %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
          ; %cmp is true when kernel argument %c is zero.
     34   %cmp = icmp eq i32 %c, 0
          ; One i1 picks the entire vector: %a on true, %b on false.
     35   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
     36   store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
     37   ret void
     38 }
     39 
     40 ; GCN-LABEL: {{^}}v_select_v8i8:
     41 ; GCN: v_cndmask_b32_e32
     42 ; GCN: v_cndmask_b32_e32
     43 ; GCN-NOT: cndmask
     44 define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Scalar-condition select between two <8 x i8> vectors (64 bits each) loaded
          ; through addrspace(1) pointers; the selected vector is stored to %out.
     45   %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
     46   %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
          ; Condition: kernel argument %c compared for equality with 0.
     47   %cmp = icmp eq i32 %c, 0
          ; %a when the condition holds, %b otherwise.
     48   %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
          ; The store only claims 4-byte alignment although the vector is 8 bytes wide.
     49   store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
     50   ret void
     51 }
     52 
     53 ; GCN-LABEL: {{^}}v_select_v16i8:
     54 ; GCN: v_cndmask_b32_e32
     55 ; GCN: v_cndmask_b32_e32
     56 ; GCN: v_cndmask_b32_e32
     57 ; GCN: v_cndmask_b32_e32
     58 ; GCN-NOT: cndmask
     59 define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Scalar-condition select between two <16 x i8> vectors (128 bits each) loaded
          ; through addrspace(1) pointers; the selected vector is stored to %out.
     60   %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
     61   %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
          ; True when kernel argument %c is zero.
     62   %cmp = icmp eq i32 %c, 0
          ; The single i1 chooses all sixteen lanes at once: %a on true, %b on false.
     63   %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
          ; The store only claims 4-byte alignment although the vector is 16 bytes wide.
     64   store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
     65   ret void
     66 }
     67 
     68 ; GCN-LABEL: {{^}}select_v4i8:
     69 ; GCN: v_cndmask_b32
     70 ; GCN-NOT: cndmask
     71 define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
          ; Variant of the <4 x i8> select in which both candidate vectors are passed by
          ; value as kernel arguments (no loads) and the condition operand %c is an i8.
          ; %cmp is true when %c == 0.
     72   %cmp = icmp eq i8 %c, 0
          ; Scalar-condition select: %a on true, %b on false.
     73   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
     74   store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
     75   ret void
     76 }
     77 
     78 ; GCN-LABEL: {{^}}select_v2i16:
     79 ; GFX89: s_load_dword
     80 ; GFX89: s_load_dword
     81 ; GFX89: s_load_dword
     82 ; GFX89: v_cndmask_b32
     83 ; GFX89-NOT: v_cndmask_b32
     84 
     85 ; SI: v_cndmask_b32_e32
     86 ; SI-NOT: v_cndmask_b32_e
     87 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
          ; Scalar-condition select between two <2 x i16> vectors that are passed by
          ; value as kernel arguments; the chosen vector is stored to %out.
          ; %cmp is true when kernel argument %c is zero.
     88   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false; the i1 applies to both lanes.
     89   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
     90   store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
     91   ret void
     92 }
     93 
     94 ; GCN-LABEL: {{^}}v_select_v2i16:
     95 ; GCN: buffer_load_dword v
     96 ; GCN: buffer_load_dword v
     97 ; GCN: v_cndmask_b32
     98 ; GCN-NOT: cndmask
     99 define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Same <2 x i16> select as @select_v2i16, except that both candidate vectors
          ; are loaded through addrspace(1) pointers instead of being passed by value.
    100   %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
    101   %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
          ; Condition: kernel argument %c equals 0.
    102   %cmp = icmp eq i32 %c, 0
          ; Whole-vector choice driven by one i1.
    103   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
    104   store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
    105   ret void
    106 }
    107 
    108 ; GCN-LABEL: {{^}}v_select_v3i16:
    109 ; SI: v_cndmask_b32_e32
    110 ; SI: cndmask
    111 ; SI-NOT: cndmask
    112 
    113 ; GFX89: v_cndmask_b32_e32
    114 ; GFX89: cndmask
    115 ; GFX89-NOT: cndmask
    116 define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Scalar-condition select on an odd-length vector: two <3 x i16> values
          ; (48 bits of payload each) are loaded through addrspace(1) pointers and the
          ; chosen one is stored to %out.
    117   %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
    118   %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
          ; True when kernel argument %c is zero.
    119   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false.
    120   %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
    121   store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
    122   ret void
    123 }
    124 
    125 ; GCN-LABEL: {{^}}v_select_v4i16:
    126 ; GCN: v_cndmask_b32_e32
    127 ; GCN: v_cndmask_b32_e32
    128 ; GCN-NOT: cndmask
    129 define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Scalar-condition select between two <4 x i16> vectors (64 bits each) loaded
          ; through addrspace(1) pointers; the chosen vector is stored to %out.
    130   %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
    131   %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
          ; Condition: kernel argument %c equals 0.
    132   %cmp = icmp eq i32 %c, 0
          ; One i1 selects all four lanes together.
    133   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
          ; The store only claims 4-byte alignment although the vector is 8 bytes wide.
    134   store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
    135   ret void
    136 }
    137 
    138 ; GCN-LABEL: {{^}}v_select_v8i16:
    139 ; GCN: v_cndmask_b32_e32
    140 ; GCN: v_cndmask_b32_e32
    141 ; GCN: v_cndmask_b32_e32
    142 ; GCN: v_cndmask_b32_e32
    143 ; GCN-NOT: cndmask
    144 define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Scalar-condition select between two <8 x i16> vectors (128 bits each) loaded
          ; through addrspace(1) pointers; the chosen vector is stored to %out.
    145   %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
    146   %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
          ; True when kernel argument %c is zero.
    147   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false, applied to the vector as a whole.
    148   %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
          ; The store only claims 4-byte alignment although the vector is 16 bytes wide.
    149   store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
    150   ret void
    151 }
    152 
    153 ; FIXME: Expansion with bitwise operations may be better if doing a
    154 ; vector select with SGPR inputs.
    155 
    156 ; GCN-LABEL: {{^}}s_select_v2i32:
    157 ; GCN: v_cndmask_b32_e32
    158 ; GCN: v_cndmask_b32_e32
    159 ; GCN: buffer_store_dwordx2
    160 define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
          ; Scalar-condition select between two <2 x i32> vectors that are passed by
          ; value as kernel arguments; the chosen vector is stored to %out.
          ; %cmp is true when kernel argument %c is zero.
    161   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false.
    162   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
    163   store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
    164   ret void
    165 }
    166 
    167 ; GCN-LABEL: {{^}}s_select_v4i32:
    168 ; GCN: v_cndmask_b32_e32
    169 ; GCN: v_cndmask_b32_e32
    170 ; GCN: v_cndmask_b32_e32
    171 ; GCN: v_cndmask_b32_e32
    172 ; GCN: buffer_store_dwordx4
    173 define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
          ; Scalar-condition select between two <4 x i32> vectors that are passed by
          ; value as kernel arguments; the chosen vector is stored to %out.
          ; Condition: kernel argument %c equals 0.
    174   %cmp = icmp eq i32 %c, 0
          ; One i1 chooses all four 32-bit lanes together.
    175   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
    176   store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
    177   ret void
    178 }
    179 
    180 ; GCN-LABEL: {{^}}v_select_v4i32:
    181 ; GCN: buffer_load_dwordx4
    182 ; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
    183 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    184 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    185 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    186 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    187 ; GCN: buffer_store_dwordx4
    188 define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
          ; Select between a loaded <4 x i32> and the all-zero vector: the value read
          ; from %in is passed through when %cond < 32 (unsigned), otherwise zeros are
          ; stored to %out.
    189 bb:
          ; Unsigned less-than against the constant 32.
    190   %tmp2 = icmp ult i32 %cond, 32
    191   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
          ; The false operand is a constant zero vector rather than a second input.
    192   %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
    193   store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
    194   ret void
    195 }
    196 
    197 ; GCN-LABEL: {{^}}select_v8i32:
    198 ; GCN: v_cndmask_b32_e32
    199 ; GCN: v_cndmask_b32_e32
    200 ; GCN: v_cndmask_b32_e32
    201 ; GCN: v_cndmask_b32_e32
    202 ; GCN: v_cndmask_b32_e32
    203 ; GCN: v_cndmask_b32_e32
    204 ; GCN: v_cndmask_b32_e32
    205 ; GCN: v_cndmask_b32_e32
    206 define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
          ; Scalar-condition select between two <8 x i32> vectors (256 bits each) that
          ; are passed by value as kernel arguments; the result is stored to %out.
          ; %cmp is true when kernel argument %c is zero.
    207   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false, for the vector as a whole.
    208   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
          ; The store claims 16-byte alignment for the 32-byte vector.
    209   store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
    210   ret void
    211 }
    212 
    213 ; GCN-LABEL: {{^}}s_select_v2f32:
    214 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
    215 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
    216 
    217 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
    218 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
    219 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
    220 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
    221 
    222 ; GCN-DAG: v_cndmask_b32_e32
    223 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
    224 ; GCN-DAG: v_cndmask_b32_e32
    225 ; GCN: buffer_store_dwordx2
    226 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
          ; Floating-point counterpart of @s_select_v2i32: two <2 x float> vectors are
          ; passed by value as kernel arguments and one of them is stored to %out.
          ; Condition: kernel argument %c equals 0.
    227   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false.
    228   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
          ; NOTE(review): align 16 is larger than the 8-byte vector being stored, and
          ; @s_select_v2i32 uses align 8 for the same width — confirm this is intended.
    229   store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
    230   ret void
    231 }
    232 
    233 ; GCN-LABEL: {{^}}s_select_v4f32:
    234 ; GCN: s_load_dwordx4
    235 ; GCN: s_load_dwordx4
    236 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
    237 
    238 ; GCN: v_cndmask_b32_e32
    239 ; GCN: v_cndmask_b32_e32
    240 ; GCN: v_cndmask_b32_e32
    241 ; GCN: v_cndmask_b32_e32
    242 
    243 ; GCN: buffer_store_dwordx4
    244 define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
          ; Scalar-condition select between two <4 x float> vectors that are passed by
          ; value as kernel arguments; the chosen vector is stored to %out.
          ; True when kernel argument %c is zero.
    245   %cmp = icmp eq i32 %c, 0
          ; One i1 chooses all four float lanes together.
    246   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
    247   store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
    248   ret void
    249 }
    250 
    251 ; GCN-LABEL: {{^}}v_select_v4f32:
    252 ; GCN: buffer_load_dwordx4
    253 ; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
    254 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    255 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    256 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    257 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
    258 ; GCN: buffer_store_dwordx4
    259 define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
          ; Floating-point counterpart of @v_select_v4i32: the <4 x float> read from %in
          ; is passed through when %cond < 32 (unsigned); otherwise an all-zero vector
          ; is stored to %out.
    260 bb:
          ; Unsigned less-than against the constant 32.
    261   %tmp2 = icmp ult i32 %cond, 32
    262   %val = load <4 x float>, <4 x float> addrspace(1)* %in
          ; The false operand is the constant zero vector.
    263   %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
    264   store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
    265   ret void
    266 }
    267 
    268 ; GCN-LABEL: {{^}}select_v8f32:
    269 ; GCN: v_cndmask_b32_e32
    270 ; GCN: v_cndmask_b32_e32
    271 ; GCN: v_cndmask_b32_e32
    272 ; GCN: v_cndmask_b32_e32
    273 ; GCN: v_cndmask_b32_e32
    274 ; GCN: v_cndmask_b32_e32
    275 ; GCN: v_cndmask_b32_e32
    276 ; GCN: v_cndmask_b32_e32
    277 define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
          ; Scalar-condition select between two <8 x float> vectors (256 bits each) that
          ; are passed by value as kernel arguments; the result is stored to %out.
          ; Condition: kernel argument %c equals 0.
    278   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false, for the vector as a whole.
    279   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
          ; The store claims 16-byte alignment for the 32-byte vector.
    280   store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
    281   ret void
    282 }
    283 
    284 ; GCN-LABEL: {{^}}select_v2f64:
    285 ; GCN: v_cndmask_b32_e32
    286 ; GCN: v_cndmask_b32_e32
    287 ; GCN: v_cndmask_b32_e32
    288 ; GCN: v_cndmask_b32_e32
    289 define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
          ; Scalar-condition select between two <2 x double> vectors (128 bits each)
          ; that are passed by value as kernel arguments; the result is stored to %out.
          ; %cmp is true when kernel argument %c is zero.
    290   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false.
    291   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
    292   store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
    293   ret void
    294 }
    295 
    296 ; GCN-LABEL: {{^}}select_v4f64:
    297 ; GCN: v_cndmask_b32_e32
    298 ; GCN: v_cndmask_b32_e32
    299 ; GCN: v_cndmask_b32_e32
    300 ; GCN: v_cndmask_b32_e32
    301 ; GCN: v_cndmask_b32_e32
    302 ; GCN: v_cndmask_b32_e32
    303 ; GCN: v_cndmask_b32_e32
    304 ; GCN: v_cndmask_b32_e32
    305 define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
          ; Scalar-condition select between two <4 x double> vectors (256 bits each)
          ; that are passed by value as kernel arguments; the result is stored to %out.
          ; Condition: kernel argument %c equals 0.
    306   %cmp = icmp eq i32 %c, 0
          ; One i1 chooses all four double lanes together.
    307   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
          ; The store claims 16-byte alignment for the 32-byte vector.
    308   store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
    309   ret void
    310 }
    311 
    312 ; GCN-LABEL: {{^}}select_v8f64:
    313 ; GCN: v_cndmask_b32_e32
    314 ; GCN: v_cndmask_b32_e32
    315 ; GCN: v_cndmask_b32_e32
    316 ; GCN: v_cndmask_b32_e32
    317 ; GCN: v_cndmask_b32_e32
    318 ; GCN: v_cndmask_b32_e32
    319 ; GCN: v_cndmask_b32_e32
    320 ; GCN: v_cndmask_b32_e32
    321 ; GCN: v_cndmask_b32_e32
    322 ; GCN: v_cndmask_b32_e32
    323 ; GCN: v_cndmask_b32_e32
    324 ; GCN: v_cndmask_b32_e32
    325 ; GCN: v_cndmask_b32_e32
    326 ; GCN: v_cndmask_b32_e32
    327 ; GCN: v_cndmask_b32_e32
    328 ; GCN: v_cndmask_b32_e32
    329 define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
          ; Widest case in this file: scalar-condition select between two <8 x double>
          ; vectors (512 bits each) that are passed by value as kernel arguments; the
          ; result is stored to %out.
          ; True when kernel argument %c is zero.
    330   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false, for the vector as a whole.
    331   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
          ; The store claims 16-byte alignment for the 64-byte vector.
    332   store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
    333   ret void
    334 }
    335 
    336 ; GCN-LABEL: {{^}}v_select_v2f16:
    337 ; GCN: v_cndmask_b32
    338 ; GCN-NOT: cndmask
    339 define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Half-precision counterpart of @v_select_v2i16: two <2 x half> vectors
          ; (32 bits each) are loaded through addrspace(1) pointers and the chosen one
          ; is stored to %out.
    340   %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
    341   %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
          ; Condition: kernel argument %c equals 0.
    342   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false.
    343   %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
    344   store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
    345   ret void
    346 }
    347 
    348 ; GCN-LABEL: {{^}}v_select_v3f16:
    349 ; GCN: v_cndmask_b32_e32
    350 ; GCN: v_cndmask_b32_e32
    351 ; GCN-NOT: cndmask
    352 define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Half-precision counterpart of @v_select_v3i16: scalar-condition select on an
          ; odd-length <3 x half> vector whose candidates are loaded through
          ; addrspace(1) pointers; the chosen vector is stored to %out.
    353   %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
    354   %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
          ; True when kernel argument %c is zero.
    355   %cmp = icmp eq i32 %c, 0
          ; %a on true, %b on false.
    356   %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
    357   store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
    358   ret void
    359 }
    360 
    361 ; GCN-LABEL: {{^}}v_select_v4f16:
    362 ; GCN: v_cndmask_b32_e32
    363 ; GCN: v_cndmask_b32_e32
    364 ; GCN-NOT: cndmask
    365 define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
          ; Half-precision counterpart of @v_select_v4i16: two <4 x half> vectors
          ; (64 bits each) are loaded through addrspace(1) pointers and the chosen one
          ; is stored to %out.
    366   %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
    367   %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
          ; Condition: kernel argument %c equals 0.
    368   %cmp = icmp eq i32 %c, 0
          ; One i1 chooses all four half lanes together.
    369   %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
          ; The store only claims 4-byte alignment although the vector is 8 bytes wide.
    370   store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
    371   ret void
    372 }
    373 
    374 ; Function Attrs: nounwind readnone
    375 declare i32 @llvm.amdgcn.workitem.id.x() #1
        ; NOTE(review): no kernel visible in this file calls this intrinsic, so the
        ; declaration looks like a leftover — confirm before removing it.
    376 
    377 attributes #0 = { nounwind }
        ; Group #0 is the one attached to every kernel definition visible in this file.
    378 attributes #1 = { nounwind readnone }
        ; Group #1 is referenced only by the @llvm.amdgcn.workitem.id.x declaration.
    379