; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=EG,FUNC %s

; mul24 and mad24 are affected

; FUNC-LABEL: {{^}}test_mul_v2i32:
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = mul <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_mul_v4i32:
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}

define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = mul <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
; GCN: s_load_dword
; GCN: s_load_dword
; GCN: s_mul_i32
; GCN: buffer_store_dword
define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
  %mul = mul i64 %b, %a
  %trunc = trunc i64 %mul to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
; GCN: s_load_dword
; GCN: s_load_dword
; GCN: v_mul_lo_i32
; GCN: buffer_store_dword
define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %mul = mul i64 %b, %a
  %trunc = trunc i64 %mul to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 8
  ret void
}

; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
; 32 bits of both arguments are sign bits.
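; Illustrative sketch (not a FileCheck pattern): because sext(%in) and 80 both
; fit in a signed 32-bit value, the low 64 bits of the product reduce to
;   lo32 = low 32 bits of %in * 80           (MULLO_INT / s_mul_i32)
;   hi32 = signed high 32 bits of %in * 80   (MULHI_INT / v_mul_hi_i32)
; with no additional shifts or adds.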
; FUNC-LABEL: {{^}}mul64_sext_c:
; EG-DAG: MULLO_INT
; EG-DAG: MULHI_INT
; GCN-DAG: s_mul_i32
; GCN-DAG: v_mul_hi_i32
define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
entry:
  %0 = sext i32 %in to i64
  %1 = mul i64 %0, 80
  store i64 %1, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_mul64_sext_c:
; EG-DAG: MULLO_INT
; EG-DAG: MULHI_INT
; GCN-DAG: v_mul_lo_i32
; GCN-DAG: v_mul_hi_i32
; GCN: s_endpgm
define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %ext = sext i32 %val to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
; GCN-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
; GCN-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
; GCN: s_endpgm
define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %ext = sext i32 %val to i64
  %mul = mul i64 %ext, 9
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_mul_i32:
; GCN: s_load_dword [[SRC0:s[0-9]+]],
; GCN: s_load_dword [[SRC1:s[0-9]+]],
; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
  %mul = mul i32 %a, %b
  store i32 %mul, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v_mul_i32:
; GCN: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = mul i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; A standard 64-bit multiply.  The expansion should be around 6 instructions.
; It would be difficult to match the expansion correctly without writing
; a really complicated list of FileCheck expressions.  I don't want
; to confuse people who may 'break' this test with a correct optimization,
; so this test just uses FUNC-LABEL to make sure the compiler does not
; crash with a 'failed to select' error.
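; For reference only (nothing below matches this), splitting each operand into
; 32-bit halves gives the usual expansion of the low 64 bits:
;   lo32 = mul_lo(a.lo, b.lo)
;   hi32 = mul_hi_u32(a.lo, b.lo) + mul_lo(a.lo, b.hi) + mul_lo(a.hi, b.lo)
; i.e. roughly the half-dozen multiplies and adds mentioned above.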

; FUNC-LABEL: {{^}}s_mul_i64:
define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %mul = mul i64 %a, %b
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_mul_i64:
; GCN: v_mul_lo_i32
define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %mul = mul i64 %a, %b
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}mul32_in_branch:
; GCN: s_mul_i32
define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
entry:
  %0 = icmp eq i32 %a, 0
  br i1 %0, label %if, label %else

if:
  %1 = load i32, i32 addrspace(1)* %in
  br label %endif

else:
  %2 = mul i32 %a, %b
  br label %endif

endif:
  %3 = phi i32 [%1, %if], [%2, %else]
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}mul64_in_branch:
; GCN-DAG: s_mul_i32
; GCN-DAG: v_mul_hi_u32
; GCN: s_endpgm
define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
  %0 = icmp eq i64 %a, 0
  br i1 %0, label %if, label %else

if:
  %1 = load i64, i64 addrspace(1)* %in
  br label %endif

else:
  %2 = mul i64 %a, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, i64 addrspace(1)* %out
  ret void
}

; FIXME: Load dwordx4
; FUNC-LABEL: {{^}}s_mul_i128:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4

; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: s_mul_i32
; SI: v_mul_hi_u32
; SI: s_mul_i32
; SI: s_mul_i32

; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: s_mul_i32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32

; VI: v_mul_hi_u32
; VI: s_mul_i32
; VI: s_mul_i32
; VI: v_mul_hi_u32
; VI: v_mul_hi_u32
; VI: s_mul_i32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: s_mul_i32
; VI: v_mad_u64_u32
; VI: s_mul_i32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
  %mul = mul i128 %a, %b
  store i128 %mul, i128 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_mul_i128:
; GCN: {{buffer|flat}}_load_dwordx4
; GCN: {{buffer|flat}}_load_dwordx4

; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_add_i32_e32

; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32

; VI-DAG: v_mul_lo_i32
; VI-DAG: v_mul_hi_u32
; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
; VI: v_mad_u64_u32

; GCN: {{buffer|flat}}_store_dwordx4
define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
  %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
  %gep.out = getelementptr inbounds i128, i128 addrspace(1)* %out, i32 %tid
  %a = load i128, i128 addrspace(1)* %gep.a
  %b = load i128, i128 addrspace(1)* %gep.b
  %mul = mul i128 %a, %b
  store i128 %mul, i128 addrspace(1)* %gep.out
  ret void
}

declare i32 @llvm.r600.read.tidig.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }