Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
      2 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
      3 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
      4 
      5 
      6 ; FUNC-LABEL: {{^}}or_v2i32:
        ; Element-wise OR of two <2 x i32> vectors loaded through addrspace(1)
        ; pointers. The checks below expect two OR instructions (one per
        ; vector element) under both the EG and the SI check prefixes.
      7 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
      8 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
      9 
     10 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
     11 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
     12 define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
        ; %b_ptr addresses the <2 x i32> element that follows the one at %in.
     13   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
     14   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
     15   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
     16   %result = or <2 x i32> %a, %b
     17   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
     18   ret void
     19 }
     20 
     21 ; FUNC-LABEL: {{^}}or_v4i32:
        ; Element-wise OR of two <4 x i32> vectors loaded through addrspace(1)
        ; pointers. The checks below expect four OR instructions (one per
        ; vector element) under both the EG and the SI check prefixes.
     22 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
     23 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
     24 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
     25 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
     26 
     27 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
     28 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
     29 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
     30 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
     31 define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
        ; %b_ptr addresses the <4 x i32> element that follows the one at %in.
     32   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
     33   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
     34   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
     35   %result = or <4 x i32> %a, %b
     36   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
     37   ret void
     38 }
     39 
     40 ; FUNC-LABEL: {{^}}scalar_or_i32:
        ; OR of two i32 kernel arguments. The SI check expects the scalar
        ; instruction s_or_b32; there is no EG check for this function.
     41 ; SI: s_or_b32
     42 define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
     43   %or = or i32 %a, %b
     44   store i32 %or, i32 addrspace(1)* %out
     45   ret void
     46 }
     47 
     48 ; FUNC-LABEL: {{^}}vector_or_i32:
        ; OR of an i32 loaded from memory with an i32 kernel argument. The SI
        ; check expects the vector instruction v_or_b32_e32. The pattern only
        ; pins the first digit of the destination register number.
     49 ; SI: v_or_b32_e32 v{{[0-9]}}
     50 define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
     51   %loada = load i32, i32 addrspace(1)* %a
     52   %or = or i32 %loada, %b
     53   store i32 %or, i32 addrspace(1)* %out
     54   ret void
     55 }
     56 
     57 ; FUNC-LABEL: {{^}}scalar_or_literal_i32:
        ; OR of an i32 kernel argument with the constant 99999 (0x1869f). The
        ; SI check expects the constant to appear as a literal operand of a
        ; single s_or_b32.
     58 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
     59 define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
     60   %or = or i32 %a, 99999
     61   store i32 %or, i32 addrspace(1)* %out, align 4
     62   ret void
     63 }
     64 
     65 ; FUNC-LABEL: {{^}}scalar_or_literal_i64:
        ; OR of an i64 kernel argument with 4261135838621753, which is
        ; 0x000f237b00003039. The checks expect the 64-bit OR to be split into
        ; two s_or_b32, one per 32-bit half, each with its half of the constant
        ; as a literal operand (high half 0xf237b, low half 0x3039).
        ; The load offset alternatives 0x13 and 0x4c both correspond to 76 if
        ; 0x13 counts dwords and 0x4c counts bytes - presumably one form per
        ; -mcpu RUN line; TODO confirm.
     66 ; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
     67 ; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
     68 ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
     69 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
     70 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
     71 define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
     72   %or = or i64 %a, 4261135838621753
     73   store i64 %or, i64 addrspace(1)* %out
     74   ret void
     75 }
     76 
     77 ; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64:
        ; The same 64-bit constant (4261135838621753 = 0x000f237b00003039) is
        ; used by both an `or` and an `add`. The checks expect the constant to
        ; be materialized into a scalar register pair (K_LO via s_movk_i32,
        ; K_HI via s_mov_b32), consumed once by a single s_or_b64 and then
        ; reused by the s_add_u32 / s_addc_u32 pair.
     78 ; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
     79 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
     80 ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
     81 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
     82 
     83 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
     84 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
     85 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
     86   %or = or i64 %a, 4261135838621753
     87   store i64 %or, i64 addrspace(1)* %out
     88 
        ; Second use of the same constant; the volatile store keeps %foo live.
     89   %foo = add i64 %b, 4261135838621753
     90   store volatile i64 %foo, i64 addrspace(1)* undef
     91   ret void
     92 }
     93 
     94 ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64:
        ; OR of an i64 kernel argument with 63. The checks expect exactly one
        ; s_or_b32, applied in place to the low half with 63 as an immediate
        ; operand. The interleaved negative checks forbid any other or_b32, and
        ; the stored high half is the loaded VAL_HI register moved unchanged.
     95 ; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
     96 ; SI-NOT: or_b32
     97 ; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
     98 ; SI-NOT: or_b32
     99 ; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
    100 ; SI-NOT: or_b32
    101 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
    102 ; SI-NOT: or_b32
    103 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
    104 define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
    105   %or = or i64 %a, 63
    106   store i64 %or, i64 addrspace(1)* %out
    107   ret void
    108 }
    109 
    110 ; FUNC-LABEL: {{^}}scalar_or_inline_imm_multi_use_i64:
        ; The constant 63 is used by both an `or` and an `add` on i64 kernel
        ; arguments. The checks expect a single s_or_b32 with 63 as an
        ; immediate operand and no other or_b32 before or after it within
        ; this function.
    111 ; SI-NOT: or_b32
    112 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63
    113 ; SI-NOT: or_b32
    114 define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
    115   %or = or i64 %a, 63
    116   store i64 %or, i64 addrspace(1)* %out
        ; Second use of the same constant; the volatile store keeps %foo live.
    117   %foo = add i64 %b, 63
    118   store volatile i64 %foo, i64 addrspace(1)* undef
    119   ret void
    120 }
    121 
    122 ; FUNC-LABEL: {{^}}scalar_or_neg_inline_imm_i64:
        ; OR of an i64 kernel argument with -8. As a 64-bit value, -8 has all
        ; bits set in its high 32 bits, so the high half of the result is
        ; always -1. The checks expect a single-dword load, one s_or_b32 with
        ; -8 on that value, and a v_mov_b32 of the constant -1 for the high
        ; half of the stored pair.
    123 ; SI-DAG: s_load_dword [[VAL:s[0-9]+]]
    124 ; SI-DAG: s_or_b32 [[VAL]], [[VAL]], -8
    125 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
    126 ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
    127 ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
    128 define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
    129   %or = or i64 %a, -8
    130   store i64 %or, i64 addrspace(1)* %out
    131   ret void
    132 }
    133 
    134 ; FUNC-LABEL: {{^}}vector_or_literal_i32:
        ; OR of a loaded i32 with the constant 65535 (0xffff). The SI check
        ; expects the constant as the first source operand of v_or_b32_e32.
        ; NOTE(review): the %b argument is not used by the body.
    135 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
    136 define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
    137   %loada = load i32, i32 addrspace(1)* %a, align 4
    138   %or = or i32 %loada, 65535
    139   store i32 %or, i32 addrspace(1)* %out, align 4
    140   ret void
    141 }
    142 
    143 ; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
        ; OR of a loaded i32 with the constant 4. The SI check expects the
        ; constant as the first source operand of v_or_b32_e32, printed in
        ; decimal.
        ; NOTE(review): the %b argument is not used by the body.
    144 ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
    145 define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
    146   %loada = load i32, i32 addrspace(1)* %a, align 4
    147   %or = or i32 %loada, 4
    148   store i32 %or, i32 addrspace(1)* %out, align 4
    149   ret void
    150 }
    151 
    152 ; FUNC-LABEL: {{^}}scalar_or_i64:
        ; OR of two i64 kernel arguments. The EG checks expect two OR_INT
        ; instructions, in either order, whose sources are KC0 operands
        ; (KC0[2].W with KC0[3].Y, and KC0[3].X with KC0[3].Z). The SI check
        ; expects a single 64-bit s_or_b64.
    153 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
    154 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
    155 
    156 ; SI: s_or_b64
    157 define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
    158   %or = or i64 %a, %b
    159   store i64 %or, i64 addrspace(1)* %out
    160   ret void
    161 }
    162 
    163 ; FUNC-LABEL: {{^}}vector_or_i64:
        ; OR of two i64 values that are both loaded from memory. The SI checks
        ; expect two v_or_b32_e32 instructions; the patterns only pin the first
        ; digit of each destination register number.
    164 ; SI: v_or_b32_e32 v{{[0-9]}}
    165 ; SI: v_or_b32_e32 v{{[0-9]}}
    166 define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
    167   %loada = load i64, i64 addrspace(1)* %a, align 8
    168   %loadb = load i64, i64 addrspace(1)* %b, align 8
    169   %or = or i64 %loada, %loadb
    170   store i64 %or, i64 addrspace(1)* %out
    171   ret void
    172 }
    173 
    174 ; FUNC-LABEL: {{^}}scalar_vector_or_i64:
        ; OR of an i64 loaded from memory with an i64 kernel argument. The SI
        ; checks expect two v_or_b32_e32 instructions; the patterns only pin
        ; the first digit of each destination register number.
    175 ; SI: v_or_b32_e32 v{{[0-9]}}
    176 ; SI: v_or_b32_e32 v{{[0-9]}}
    177 define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
    178   %loada = load i64, i64 addrspace(1)* %a
    179   %or = or i64 %loada, %b
    180   store i64 %or, i64 addrspace(1)* %out
    181   ret void
    182 }
    183 
    184 ; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
        ; OR of a loaded i64 with 22470723082367, which is 0x0000146fdf77987f.
        ; The checks expect one v_or_b32_e32 per 32-bit half of the loaded
        ; register pair, each with its half of the constant as a literal
        ; operand (low half 0xdf77987f, high half 0x146f).
        ; NOTE(review): the %b argument is not used by the body.
    185 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
    186 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
    187 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
    188 ; SI: s_endpgm
    189 define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
    190   %loada = load i64, i64 addrspace(1)* %a, align 8
    191   %or = or i64 %loada, 22470723082367
    192   store i64 %or, i64 addrspace(1)* %out
    193   ret void
    194 }
    195 
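    196 ; FIXME: The or 0 should really be removed. NOTE(review): the checks for vector_or_i64_imm below already forbid a v_or_b32_e32 with a 0 operand, so this FIXME may be stale - confirm before deleting it.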
    197 ; FUNC-LABEL: {{^}}vector_or_i64_imm:
        ; OR of a loaded i64 with 8. Only the low 32 bits of the constant are
        ; non-zero, so the checks expect a single v_or_b32_e32 on the low half.
        ; The negative check forbids a v_or_b32_e32 with a 0 operand, and the
        ; store uses the loaded high register HI_VREG unchanged.
        ; NOTE(review): the %b argument is not used by the body.
    198 ; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
    199 ; SI: v_or_b32_e32 v[[LO_RESULT:[0-9]+]], 8, v[[LO_VREG]]
    200 ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0
    201 ; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}}
    202 ; SI: s_endpgm
    203 define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
    204   %loada = load i64, i64 addrspace(1)* %a, align 8
    205   %or = or i64 %loada, 8
    206   store i64 %or, i64 addrspace(1)* %out
    207   ret void
    208 }
    209 
    210 ; FUNC-LABEL: {{^}}vector_or_i64_neg_inline_imm:
        ; OR of a loaded i64 with -8. The high 32 bits of -8 are all ones, so
        ; the high half of the result is always -1. The checks expect a
        ; single-dword load, one v_or_b32_e32 with -8 on the loaded register,
        ; and a v_mov_b32 of the constant -1 for the high half of the stored
        ; pair.
        ; NOTE(review): the %b argument is not used by the body.
    211 ; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]]
    212 ; SI-DAG: v_or_b32_e32 v[[RES_LO:[0-9]+]], -8, v[[LO_VREG]]
    213 ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}}
    214 ; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}}
    215 ; SI: s_endpgm
    216 define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
    217   %loada = load i64, i64 addrspace(1)* %a, align 8
    218   %or = or i64 %loada, -8
    219   store i64 %or, i64 addrspace(1)* %out
    220   ret void
    221 }
    222 
    223 ; FUNC-LABEL: {{^}}vector_or_i64_neg_literal:
        ; OR of a loaded i64 with -200. The low 32 bits of -200 are 0xffffff38
        ; and the high 32 bits are all ones. The checks expect a single-dword
        ; load, a v_mov_b32 of the constant -1, and one v_or_b32_e32 with the
        ; literal 0xffffff38 on the loaded register, followed by a 64-bit
        ; store.
        ; NOTE(review): the %b argument is not used by the body.
    224 ; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]]
    225 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, -1{{$}}
    226 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]]
    227 ; SI: buffer_store_dwordx2
    228 ; SI: s_endpgm
    229 define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
    230   %loada = load i64, i64 addrspace(1)* %a, align 8
    231   %or = or i64 %loada, -200
    232   store i64 %or, i64 addrspace(1)* %out
    233   ret void
    234 }
    235 
    236 ; FUNC-LABEL: {{^}}trunc_i64_or_to_i32:
        ; ORs two i64 kernel arguments and truncates the result to i32 before
        ; storing it. The checks expect two single-dword scalar loads, one
        ; 32-bit s_or_b32 whose sources are SREG1 then SREG0, and a single
        ; dword store of the result.
    237 ; SI: s_load_dword s[[SREG0:[0-9]+]]
    238 ; SI: s_load_dword s[[SREG1:[0-9]+]]
    239 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
    240 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
    241 ; SI: buffer_store_dword [[VRESULT]],
    242 define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
        ; NOTE(review): despite its name, %add holds the result of an `or`.
    243   %add = or i64 %b, %a
    244   %trunc = trunc i64 %add to i32
    245   store i32 %trunc, i32 addrspace(1)* %out, align 8
    246   ret void
    247 }
    248 
    249 ; FUNC-LABEL: {{^}}or_i1:
        ; ORs two i1 values produced by `fcmp oge x, 0.0` on two loaded floats,
        ; zero-extends the result to i32 and stores it. The EG check expects an
        ; OR_INT whose sources are PS and PV; the SI check expects an s_or_b64
        ; with vcc as its last source operand.
    250 ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
    251 
    252 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc
    253 define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
    254   %a = load float, float addrspace(1)* %in0
    255   %b = load float, float addrspace(1)* %in1
    256   %acmp = fcmp oge float %a, 0.000000e+00
    257   %bcmp = fcmp oge float %b, 0.000000e+00
    258   %or = or i1 %acmp, %bcmp
    259   %result = zext i1 %or to i32
    260   store i32 %result, i32 addrspace(1)* %out
    261   ret void
    262 }
    263 
    264 ; FUNC-LABEL: {{^}}s_or_i1:
        ; ORs two i1 values produced by `icmp eq` on pairs of i32 kernel
        ; arguments and stores the i1 result. The SI check expects an s_or_b64
        ; with vcc as its first source operand; there is no EG check for this
        ; function.
    265 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
    266 define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
    267   %cmp0 = icmp eq i32 %a, %b
    268   %cmp1 = icmp eq i32 %c, %d
    269   %or = or i1 %cmp0, %cmp1
    270   store i1 %or, i1 addrspace(1)* %out
    271   ret void
    272 }
    273