Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
      2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
      3 
      4 @lds = addrspace(3) global [512 x float] undef, align 4 ; 512-element f32 array in addrspace(3); read by the f32 tests that index @lds directly
      5 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 ; 512-element f64 array in addrspace(3); read by simple_read2st64_f64_0_1
      6 
      7 ; Two f32 loads from @lds that are 64 elements apart; the checks expect a single ds_read2st64_b32 with offset1:1.
      8 ; GCN-LABEL: @simple_read2st64_f32_0_1
      9 ; CI: s_mov_b32 m0
     10 ; GFX9-NOT: m0
     11 ; (m0 is expected to be written in the bonaire run and is not expected before the read in the gfx900 run.)
     12 ; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
     13 ; GCN: s_waitcnt lgkmcnt(0)
     14 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
     15 ; CI: buffer_store_dword [[RESULT]]
     16 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     17 define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
     18   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
     19   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
     20   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
     21   %add.x = add nsw i32 %x.i, 64 ; second load is 64 elements past the first
     22   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
     23   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
     24   %sum = fadd float %val0, %val1 ; both loaded values are consumed so neither load is dead
     25   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
     26   store float %sum, float addrspace(1)* %out.gep, align 4
     27   ret void
     28 }
     29 ; Two f32 loads through the %lds argument at element offsets 64 and 128; the checks expect ds_read2st64_b32 with offset0:1 offset1:2.
     30 ; GCN-LABEL: @simple_read2st64_f32_1_2
     31 ; CI: s_mov_b32 m0
     32 ; GFX9-NOT: m0
     33 
     34 ; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
     35 ; GCN: s_waitcnt lgkmcnt(0)
     36 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
     37 ; CI: buffer_store_dword [[RESULT]]
     38 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     39 define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
     40   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
     41   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
     42   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
     43   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
     44   %add.x.1 = add nsw i32 %x.i, 128 ; 2 * 64 elements
     45   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
     46   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
     47   %sum = fadd float %val0, %val1
     48   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
     49   store float %sum, float addrspace(1)* %out.gep, align 4
     50   ret void
     51 }
     52 ; Two f32 loads at element offsets 64 and 16320 (255 * 64); the checks expect ds_read2st64_b32 with offset0:1 offset1:255.
     53 ; GCN-LABEL: @simple_read2st64_f32_max_offset
     54 ; CI: s_mov_b32 m0
     55 ; GFX9-NOT: m0
     56 
     57 ; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
     58 ; GCN: s_waitcnt lgkmcnt(0)
     59 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
     60 ; CI: buffer_store_dword [[RESULT]]
     61 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     62 define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
     63   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
     64   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
     65   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
     66   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
     67   %add.x.1 = add nsw i32 %x.i, 16320 ; 255 * 64 elements, matching the expected offset1:255
     68   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
     69   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
     70   %sum = fadd float %val0, %val1
     71   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
     72   store float %sum, float addrspace(1)* %out.gep, align 4
     73   ret void
     74 }
     75 ; Two f32 loads at element offsets 64 and 16384 (256 * 64); the checks expect no ds_read2st64_b32 and two separate ds_read_b32 instead.
     76 ; GCN-LABEL: @simple_read2st64_f32_over_max_offset
     77 ; CI: s_mov_b32 m0
     78 ; GFX9-NOT: m0
     79 ; (Expected: one read at byte offset 256 = 64 * 4, the other addressed through a v_add of 0x10000 = 16384 * 4 with no immediate offset.)
     80 ; GCN-NOT: ds_read2st64_b32
     81 ; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
     82 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
     83 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
     84 ; GCN: s_endpgm
     85 define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
     86   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
     87   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
     88   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
     89   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
     90   %add.x.1 = add nsw i32 %x.i, 16384 ; 256 * 64 elements, one stride beyond the 255 used by the max-offset test
     91   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
     92   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
     93   %sum = fadd float %val0, %val1
     94   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
     95   store float %sum, float addrspace(1)* %out.gep, align 4
     96   ret void
     97 }
     98 ; Two f32 loads from @lds at element offsets 0 and 63; 63 is not a multiple of 64, and the checks expect no ds_read2st64_b32.
     99 ; GCN-LABEL: @odd_invalid_read2st64_f32_0
    100 ; CI: s_mov_b32 m0
    101 ; GFX9-NOT: m0
    102 
    103 ; GCN-NOT: ds_read2st64_b32
    104 ; GCN: s_endpgm
    105 define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
    106   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    107   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
    108   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
    109   %add.x = add nsw i32 %x.i, 63 ; one element short of a 64-element stride
    110   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
    111   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
    112   %sum = fadd float %val0, %val1
    113   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
    114   store float %sum, float addrspace(1)* %out.gep, align 4
    115   ret void
    116 }
    117 ; Two f32 loads from @lds at element offsets 64 and 127; 127 is not a multiple of 64, and the checks expect no ds_read2st64_b32.
    118 ; GCN-LABEL: @odd_invalid_read2st64_f32_1
    119 ; CI: s_mov_b32 m0
    120 ; GFX9-NOT: m0
    121 
    122 ; GCN-NOT: ds_read2st64_b32
    123 ; GCN: s_endpgm
    124 define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
    125   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    126   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
    127   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
    128   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
    129   %add.x.1 = add nsw i32 %x.i, 127 ; one element short of 2 * 64
    130   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
    131   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
    132   %sum = fadd float %val0, %val1
    133   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
    134   store float %sum, float addrspace(1)* %out.gep, align 4
    135   ret void
    136 }
    137 ; Two 8-byte-aligned f64 loads from @lds.f64 that are 64 elements apart; the checks expect a single ds_read2st64_b64 with offset1:1.
    138 ; GCN-LABEL: @simple_read2st64_f64_0_1
    139 ; CI: s_mov_b32 m0
    140 ; GFX9-NOT: m0
    141 
    142 ; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
    143 ; GCN: s_waitcnt lgkmcnt(0)
    144 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
    145 ; CI: buffer_store_dwordx2 [[RESULT]]
    146 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    147 define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
    148   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    149   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
    150   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
    151   %add.x = add nsw i32 %x.i, 64 ; second load is 64 elements past the first
    152   %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
    153   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
    154   %sum = fadd double %val0, %val1
    155   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    156   store double %sum, double addrspace(1)* %out.gep, align 8
    157   ret void
    158 }
    159 ; Two f64 loads through the %lds argument at element offsets 64 and 128; the checks expect ds_read2st64_b64 with offset0:1 offset1:2.
    160 ; GCN-LABEL: @simple_read2st64_f64_1_2
    161 ; CI: s_mov_b32 m0
    162 ; GFX9-NOT: m0
    163 
    164 ; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
    165 ; GCN: s_waitcnt lgkmcnt(0)
    166 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
    167 
    168 ; CI: buffer_store_dwordx2 [[RESULT]]
    169 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    170 define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
    171   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    172   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
    173   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
    174   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
    175   %add.x.1 = add nsw i32 %x.i, 128 ; 2 * 64 elements
    176   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
    177   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
    178   %sum = fadd double %val0, %val1
    179   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    180   store double %sum, double addrspace(1)* %out.gep, align 8
    181   ret void
    182 }
    183 ; Two f64 loads 64 elements apart, but declared with only 4-byte alignment.
    184 ; Alignment only: the element stride is a valid 64, so the alignment is the only thing that differs from the mergeable f64 cases.
    185 ; The checks expect each double to be read as a dword pair with ds_read2_b32; dword offset 128 corresponds to 64 doubles.
    186 ; GCN-LABEL: @misaligned_read2st64_f64
    187 ; CI: s_mov_b32 m0
    188 ; GFX9-NOT: m0
    189 
    190 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
    191 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
    192 ; GCN: s_endpgm
    193 define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
    194   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    195   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
    196   %val0 = load double, double addrspace(3)* %arrayidx0, align 4 ; under-aligned on purpose: 4 bytes for an 8-byte type
    197   %add.x = add nsw i32 %x.i, 64 ; 64 doubles = 128 dwords past the first load
    198   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
    199   %val1 = load double, double addrspace(3)* %arrayidx1, align 4 ; under-aligned on purpose
    200   %sum = fadd double %val0, %val1
    201   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    202   store double %sum, double addrspace(1)* %out.gep, align 4
    203   ret void
    204 }
    205 ; Two f64 loads at element offsets 256 (4 * 64) and 8128 (127 * 64); the checks expect ds_read2st64_b64 with offset0:4 offset1:127.
    206 ; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
    207 ; GCN-LABEL: @simple_read2st64_f64_max_offset
    208 ; CI: s_mov_b32 m0
    209 ; GFX9-NOT: m0
    210 
    211 ; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
    212 ; GCN: s_waitcnt lgkmcnt(0)
    213 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
    214 
    215 ; CI: buffer_store_dwordx2 [[RESULT]]
    216 ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    217 define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
    218   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    219   %add.x.0 = add nsw i32 %x.i, 256 ; 4 * 64 elements
    220   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
    221   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
    222   %add.x.1 = add nsw i32 %x.i, 8128 ; 127 * 64 elements; 127 * 64 * 8 = 65024 bytes, below 0xffff
    223   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
    224   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
    225   %sum = fadd double %val0, %val1
    226   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    227   store double %sum, double addrspace(1)* %out.gep, align 8
    228   ret void
    229 }
    230 ; Two f64 loads at element offsets 64 and 8192 (128 * 64); the checks expect no ds_read2st64_b64 and two separate ds_read_b64 instead.
    231 ; GCN-LABEL: @simple_read2st64_f64_over_max_offset
    232 ; CI: s_mov_b32 m0
    233 ; GFX9-NOT: m0
    234 ; (Expected: one read at byte offset 512 = 64 * 8, the other addressed through a v_add of 0x10000 = 8192 * 8.)
    235 ; GCN-NOT: ds_read2st64_b64
    236 ; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
    237 ; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
    238 ; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
    239 ; GCN: s_endpgm
    240 define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
    241   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    242   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
    243   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
    244   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
    245   %add.x.1 = add nsw i32 %x.i, 8192 ; 128 * 64 elements, one stride beyond the 127 used by the f64 max-offset test
    246   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
    247   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
    248   %sum = fadd double %val0, %val1
    249   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    250   store double %sum, double addrspace(1)* %out.gep, align 8
    251   ret void
    252 }
    253 ; Two f64 loads at element offsets 64 and 8129; 8129 is not a multiple of 64, and the checks expect no ds_read2st64_b64.
    254 ; GCN-LABEL: @invalid_read2st64_f64_odd_offset
    255 ; CI: s_mov_b32 m0
    256 ; GFX9-NOT: m0
    257 
    258 ; GCN-NOT: ds_read2st64_b64
    259 ; GCN: s_endpgm
    260 define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
    261   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    262   %add.x.0 = add nsw i32 %x.i, 64 ; 1 * 64 elements
    263   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
    264   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
    265   %add.x.1 = add nsw i32 %x.i, 8129 ; 127 * 64 + 1: one element past a 64-element multiple
    266   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
    267   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
    268   %sum = fadd double %val0, %val1
    269   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    270   store double %sum, double addrspace(1)* %out.gep, align 8
    271   ret void
    272 }
    273 ; Two f64 loads 8 elements (64 bytes) apart; the checks expect a plain ds_read2_b64 with offset1:8 and no st64 form.
    274 ; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
    275 ; stride in elements, not bytes, is a multiple of 64.
    276 ; (The negative check below names ds_read2st64_b64, the st64 mnemonic used by the other f64 tests in this file.)
    277 ; GCN-LABEL: @byte_size_only_divisible_64_read2_f64
    278 ; CI: s_mov_b32 m0
    279 ; GFX9-NOT: m0
    280 
    281 ; GCN-NOT: ds_read2st64_b64
    282 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
    283 ; GCN: s_endpgm
    284 define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
    285   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
    286   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
    287   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
    288   %add.x = add nsw i32 %x.i, 8 ; 8 elements = 64 bytes: the byte distance is a multiple of 64, the element distance is not
    289   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
    290   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
    291   %sum = fadd double %val0, %val1
    292   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
    293   store double %sum, double addrspace(1)* %out.gep, align 4
    294   ret void
    295 }
    296 
    297 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; called by every kernel in this file to form the load and store indices
    298 declare i32 @llvm.amdgcn.workitem.id.y() #1 ; NOTE(review): no call to this declaration is visible in this file
    299 
    300 attributes #0 = { nounwind } ; attribute group attached to each kernel definition
    301 attributes #1 = { nounwind readnone } ; attribute group attached to the workitem.id declarations and their call sites
    302