; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
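; Checks that pairs of LDS loads whose indices differ by a multiple of 64
; elements are merged by the load/store optimizer (enabled with
; -mattr=+load-store-opt) into ds_read2st64_b32/ds_read2st64_b64, and that
; unencodable cases fall back to ds_read2_* or plain ds_read_* instructions.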

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; SI-LABEL: @simple_read2st64_f32_0_1
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; SI-LABEL: @simple_read2st64_f32_1_2
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; SI-LABEL: @simple_read2st64_f32_max_offset
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16320
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

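; 16384 elements * 4 bytes = 0x10000 bytes. The required offset1 of
; 16384 / 64 = 256 does not fit the 8-bit offset field (max 255), and 0x10000
; is too large for the 16-bit offset of a plain ds_read_b32, so the second
; address has to be computed with a separate v_add.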
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16384
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

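; 63 is not a multiple of 64, so the pair cannot be expressed in the
; 64-element units used by ds_read2st64_b32.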
; SI-LABEL: @odd_invalid_read2st64_f32_0
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 63
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; SI-LABEL: @odd_invalid_read2st64_f32_1
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 127
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; SI-LABEL: @simple_read2st64_f64_0_1
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; SI-LABEL: @simple_read2st64_f64_1_2
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; The loads are only 4-byte aligned, so the 8-byte st64 form cannot be used;
; each double is instead read as a pair of dwords with ds_read2_b32.

; SI-LABEL: @misaligned_read2st64_f64
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
; SI: s_endpgm
define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 4
  ret void
}

; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
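; Each offset unit covers 64 elements * 8 bytes = 512 bytes, so the largest
; encodable offset is 127 (127 * 512 = 65024 <= 0xffff), checked as
; offset1:127 below.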
; SI-LABEL: @simple_read2st64_f64_max_offset
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 256
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8128
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

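; 8192 elements * 8 bytes = 0x10000 bytes. The required offset1 of
; 8192 / 64 = 128 is one past the largest encodable value (127), so the
; second load needs a separately computed base address.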
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8192
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

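; 8129 = 127 * 64 + 1 is not a multiple of 64, so the second offset cannot be
; expressed in the 64-element units required by ds_read2st64_b64.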
; SI-LABEL: @invalid_read2st64_f64_odd_offset
; SI-NOT: ds_read2st64_b64
; SI: s_endpgm
define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8129
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
; stride in elements, not bytes, is a multiple of 64.
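; Here 8 * 8 = 64 bytes is divisible by 64, but 8 elements is not a multiple
; of 64 elements, so only a regular ds_read2_b64 with offset1:8 is formed.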

; SI-LABEL: @byte_size_only_divisible_64_read2_f64
; SI-NOT: ds_read2st64_b64
; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
; SI: s_endpgm
define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 4
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.y() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }