Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
      2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
      3 
      4 ; GCN-LABEL: ds_read32_combine_stride_400:
      5 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
      6 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
      7 
      8 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
      9 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
     10 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
     11 
     12 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
     13 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
     14 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
     15 
     16 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
     17 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
     18 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
     19 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
     20 define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
        ; Loads eight floats from the addrspace(3) pointer %arg at element indices
        ; 0, 100, 200, ..., 700 (100 floats = 400 bytes apart), in ascending
        ; address order, accumulates them with fadd, and stores the total to %arg1.
        ; The check lines above this function expect the eight loads to appear as
        ; four ds_read2_b32 instructions, each with offset1:100; on GFX9 the three
        ; extra base registers are BASE + 0x320, 0x640 and 0x960 (800, 1600 and
        ; 2400 bytes, i.e. elements 200, 400 and 600).
     21 bb:
          ; Element 0. The fadd with 0.0 starts the running sum that every later
          ; load is added into.
     22   %tmp = load float, float addrspace(3)* %arg, align 4
     23   %tmp2 = fadd float %tmp, 0.000000e+00
     24   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
     25   %tmp4 = load float, float addrspace(3)* %tmp3, align 4
     26   %tmp5 = fadd float %tmp2, %tmp4
     27   %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
     28   %tmp7 = load float, float addrspace(3)* %tmp6, align 4
     29   %tmp8 = fadd float %tmp5, %tmp7
     30   %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
     31   %tmp10 = load float, float addrspace(3)* %tmp9, align 4
     32   %tmp11 = fadd float %tmp8, %tmp10
     33   %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
     34   %tmp13 = load float, float addrspace(3)* %tmp12, align 4
     35   %tmp14 = fadd float %tmp11, %tmp13
     36   %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
     37   %tmp16 = load float, float addrspace(3)* %tmp15, align 4
     38   %tmp17 = fadd float %tmp14, %tmp16
     39   %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
     40   %tmp19 = load float, float addrspace(3)* %tmp18, align 4
     41   %tmp20 = fadd float %tmp17, %tmp19
     42   %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
     43   %tmp22 = load float, float addrspace(3)* %tmp21, align 4
     44   %tmp23 = fadd float %tmp20, %tmp22
          ; The stored sum makes all eight loaded values feed the kernel's output.
     45   store float %tmp23, float *%arg1, align 4
     46   ret void
     47 }
     48 
     49 ; GCN-LABEL: ds_read32_combine_stride_400_back:
     50 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
     51 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
     52 
     53 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
     54 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
     55 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
     56 
     57 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
     58 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
     59 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
     60 
     61 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
     62 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
     63 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
     64 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
     65 define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
        ; Reads the same eight float elements as the stride-400 read test (indices
        ; 0, 100, ..., 700 of %arg) but issues the loads in descending address
        ; order: 700 first, element 0 last. The values are summed with fadd and the
        ; total is stored to %arg1. The check lines above this function expect the
        ; same four ds_read2_b32 instructions with offset1:100.
     66 bb:
          ; Highest address first; the fadd with 0.0 starts the running sum.
     67   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
     68   %tmp2 = load float, float addrspace(3)* %tmp, align 4
     69   %tmp3 = fadd float %tmp2, 0.000000e+00
     70   %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
     71   %tmp5 = load float, float addrspace(3)* %tmp4, align 4
     72   %tmp6 = fadd float %tmp3, %tmp5
     73   %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
     74   %tmp8 = load float, float addrspace(3)* %tmp7, align 4
     75   %tmp9 = fadd float %tmp6, %tmp8
     76   %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
     77   %tmp11 = load float, float addrspace(3)* %tmp10, align 4
     78   %tmp12 = fadd float %tmp9, %tmp11
     79   %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
     80   %tmp14 = load float, float addrspace(3)* %tmp13, align 4
     81   %tmp15 = fadd float %tmp12, %tmp14
     82   %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
     83   %tmp17 = load float, float addrspace(3)* %tmp16, align 4
     84   %tmp18 = fadd float %tmp15, %tmp17
     85   %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
     86   %tmp20 = load float, float addrspace(3)* %tmp19, align 4
     87   %tmp21 = fadd float %tmp18, %tmp20
          ; Element 0 is read straight through %arg, so no getelementptr is needed.
     88   %tmp22 = load float, float addrspace(3)* %arg, align 4
     89   %tmp23 = fadd float %tmp21, %tmp22
     90   store float %tmp23, float *%arg1, align 4
     91   ret void
     92 }
     93 
     94 ; GCN-LABEL: ds_read32_combine_stride_8192:
     95 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
     96 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
     97 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32
     98 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
     99 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
    100 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
    101 define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
        ; Loads eight floats from the addrspace(3) pointer %arg at element indices
        ; 0, 2048, 4096, ..., 14336 (2048 floats = 8192 bytes apart), sums them
        ; with fadd, and stores the total to %arg1. The check lines above this
        ; function expect four ds_read2st64_b32 instructions that all address
        ; through [[BASE]], with offset pairs (none)/32, 64/96, 128/160 and
        ; 192/224. Each step of 32 in those offsets matches one 2048-element
        ; stride (2048 = 32 * 64).
    102 bb:
          ; Element 0; the fadd with 0.0 starts the running sum.
    103   %tmp = load float, float addrspace(3)* %arg, align 4
    104   %tmp2 = fadd float %tmp, 0.000000e+00
    105   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
    106   %tmp4 = load float, float addrspace(3)* %tmp3, align 4
    107   %tmp5 = fadd float %tmp2, %tmp4
    108   %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
    109   %tmp7 = load float, float addrspace(3)* %tmp6, align 4
    110   %tmp8 = fadd float %tmp5, %tmp7
    111   %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
    112   %tmp10 = load float, float addrspace(3)* %tmp9, align 4
    113   %tmp11 = fadd float %tmp8, %tmp10
    114   %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
    115   %tmp13 = load float, float addrspace(3)* %tmp12, align 4
    116   %tmp14 = fadd float %tmp11, %tmp13
    117   %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
    118   %tmp16 = load float, float addrspace(3)* %tmp15, align 4
    119   %tmp17 = fadd float %tmp14, %tmp16
    120   %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
    121   %tmp19 = load float, float addrspace(3)* %tmp18, align 4
    122   %tmp20 = fadd float %tmp17, %tmp19
    123   %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
    124   %tmp22 = load float, float addrspace(3)* %tmp21, align 4
    125   %tmp23 = fadd float %tmp20, %tmp22
    126   store float %tmp23, float *%arg1, align 4
    127   ret void
    128 }
    129 
    130 ; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
    131 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    132 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    133 
    134 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
    135 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    136 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    137 
    138 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
    139 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
    140 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
    141 
    142 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
    143 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
    144 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
    145 define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
        ; Loads six floats from the addrspace(3) pointer %arg at element indices
        ; 2, 2050, 4098, 6146, 8194 and 10242: a 2048-float (8192-byte) stride
        ; whose first element sits 2 floats (8 bytes) past %arg. The values are
        ; summed with fadd and the total is stored to %arg1. The check lines above
        ; this function expect three ds_read2st64_b32 instructions with offset1:32,
        ; each using its own base register; on GFX9 those bases are BASE + 8,
        ; BASE + 0x4008 and BASE + 0x8008, the byte offsets of elements 2, 4098
        ; and 8194.
    146 bb:
          ; First element is at index 2, not 0, so no load goes through %arg itself.
    147   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
    148   %tmp2 = load float, float addrspace(3)* %tmp, align 4
    149   %tmp3 = fadd float %tmp2, 0.000000e+00
    150   %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
    151   %tmp5 = load float, float addrspace(3)* %tmp4, align 4
    152   %tmp6 = fadd float %tmp3, %tmp5
    153   %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
    154   %tmp8 = load float, float addrspace(3)* %tmp7, align 4
    155   %tmp9 = fadd float %tmp6, %tmp8
    156   %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
    157   %tmp11 = load float, float addrspace(3)* %tmp10, align 4
    158   %tmp12 = fadd float %tmp9, %tmp11
    159   %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
    160   %tmp14 = load float, float addrspace(3)* %tmp13, align 4
    161   %tmp15 = fadd float %tmp12, %tmp14
    162   %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
    163   %tmp17 = load float, float addrspace(3)* %tmp16, align 4
    164   %tmp18 = fadd float %tmp15, %tmp17
    165   store float %tmp18, float *%arg1, align 4
    166   ret void
    167 }
    168 
    169 ; GCN-LABEL: ds_read64_combine_stride_400:
    170 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    171 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    172 
    173 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    174 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
    175 
    176 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
    177 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
    178 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
    179 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
    180 define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
        ; Loads eight doubles from the addrspace(3) pointer %arg at element indices
        ; 0, 50, 100, ..., 350 (50 doubles = 400 bytes apart), sums them with fadd,
        ; and stores the total to %arg1. The check lines above this function expect
        ; four ds_read2_b64 instructions: three through [[BASE]] with offset pairs
        ; (none)/50, 100/150 and 200/250, and one through a second base register
        ; with offset1:50. On GFX9 that second base is BASE + 0x960 (2400 bytes,
        ; i.e. element 300).
    181 bb:
          ; Element 0; the fadd with 0.0 starts the running sum.
    182   %tmp = load double, double addrspace(3)* %arg, align 8
    183   %tmp2 = fadd double %tmp, 0.000000e+00
    184   %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
    185   %tmp4 = load double, double addrspace(3)* %tmp3, align 8
    186   %tmp5 = fadd double %tmp2, %tmp4
    187   %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
    188   %tmp7 = load double, double addrspace(3)* %tmp6, align 8
    189   %tmp8 = fadd double %tmp5, %tmp7
    190   %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
    191   %tmp10 = load double, double addrspace(3)* %tmp9, align 8
    192   %tmp11 = fadd double %tmp8, %tmp10
    193   %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
    194   %tmp13 = load double, double addrspace(3)* %tmp12, align 8
    195   %tmp14 = fadd double %tmp11, %tmp13
    196   %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
    197   %tmp16 = load double, double addrspace(3)* %tmp15, align 8
    198   %tmp17 = fadd double %tmp14, %tmp16
          ; Elements 300 and 350 are the pair the checks expect to be addressed
          ; through the second base register.
    199   %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
    200   %tmp19 = load double, double addrspace(3)* %tmp18, align 8
    201   %tmp20 = fadd double %tmp17, %tmp19
    202   %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
    203   %tmp22 = load double, double addrspace(3)* %tmp21, align 8
    204   %tmp23 = fadd double %tmp20, %tmp22
    205   store double %tmp23, double *%arg1, align 8
    206   ret void
    207 }
    208 
    209 ; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
    210 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    211 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    212 
    213 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
    214 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    215 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    216 
    217 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
    218 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
    219 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
    220 
    221 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
    222 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
    223 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
    224 define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
        ; Loads six doubles from the addrspace(3) pointer %arg at element indices
        ; 1, 1025, 2049, 3073, 4097 and 5121: a 1024-double (8192-byte) stride
        ; whose first element sits one double (8 bytes) past %arg. The values are
        ; summed with fadd and the total is stored to %arg1. The check lines above
        ; this function expect three ds_read2st64_b64 instructions with offset1:16
        ; (1024 = 16 * 64), each using its own base register; on GFX9 those bases
        ; are BASE + 8, BASE + 0x4008 and BASE + 0x8008, the byte offsets of
        ; elements 1, 2049 and 4097.
    225 bb:
          ; First element is at index 1, not 0, so no load goes through %arg itself.
    226   %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
    227   %tmp2 = load double, double addrspace(3)* %tmp, align 8
    228   %tmp3 = fadd double %tmp2, 0.000000e+00
    229   %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
    230   %tmp5 = load double, double addrspace(3)* %tmp4, align 8
    231   %tmp6 = fadd double %tmp3, %tmp5
    232   %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
    233   %tmp8 = load double, double addrspace(3)* %tmp7, align 8
    234   %tmp9 = fadd double %tmp6, %tmp8
    235   %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
    236   %tmp11 = load double, double addrspace(3)* %tmp10, align 8
    237   %tmp12 = fadd double %tmp9, %tmp11
    238   %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
    239   %tmp14 = load double, double addrspace(3)* %tmp13, align 8
    240   %tmp15 = fadd double %tmp12, %tmp14
    241   %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
    242   %tmp17 = load double, double addrspace(3)* %tmp16, align 8
    243   %tmp18 = fadd double %tmp15, %tmp17
    244   store double %tmp18, double *%arg1, align 8
    245   ret void
    246 }
    247 
    248 ; GCN-LABEL: ds_write32_combine_stride_400:
    249 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    250 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    251 
    252 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    253 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    254 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    255 
    256 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
    257 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
    258 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
    259 
    260 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    261 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    262 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    263 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    264 define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
        ; Stores the float constant 1.0 through the addrspace(3) pointer %arg at
        ; element indices 0, 100, 200, ..., 700 (100 floats = 400 bytes apart), in
        ; ascending address order. The check lines above this function expect the
        ; eight stores to appear as four ds_write2_b32 instructions, each with
        ; offset1:100; on GFX9 the three extra base registers are BASE + 0x320,
        ; 0x640 and 0x960 (elements 200, 400 and 600).
    265 bb:
          ; Element 0 is written straight through %arg, so no getelementptr is needed.
    266   store float 1.000000e+00, float addrspace(3)* %arg, align 4
    267   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
    268   store float 1.000000e+00, float addrspace(3)* %tmp, align 4
    269   %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
    270   store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
    271   %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
    272   store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
    273   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
    274   store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
    275   %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
    276   store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
    277   %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
    278   store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
    279   %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
    280   store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
    281   ret void
    282 }
    283 
    284 ; GCN-LABEL: ds_write32_combine_stride_400_back:
    285 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    286 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    287 
    288 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    289 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    290 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    291 
    292 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
    293 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
    294 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
    295 
    296 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    297 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    298 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    299 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
    300 define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
        ; Stores the float constant 1.0 to the same eight elements as the
        ; stride-400 write test (indices 0, 100, ..., 700 of %arg) but issues the
        ; stores in descending address order: 700 first, element 0 last. The check
        ; lines above this function expect the same four ds_write2_b32
        ; instructions with offset1:100.
    301 bb:
    302   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
    303   store float 1.000000e+00, float addrspace(3)* %tmp, align 4
    304   %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
    305   store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
    306   %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
    307   store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
    308   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
    309   store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
    310   %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
    311   store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
    312   %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
    313   store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
    314   %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
    315   store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
          ; Lowest address last: element 0 is written straight through %arg.
    316   store float 1.000000e+00, float addrspace(3)* %arg, align 4
    317   ret void
    318 }
    319 
    320 ; GCN-LABEL: ds_write32_combine_stride_8192:
    321 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    322 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    323 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
    324 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
    325 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
    326 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
    327 define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
        ; Stores the float constant 1.0 through the addrspace(3) pointer %arg at
        ; element indices 0, 2048, 4096, ..., 14336 (2048 floats = 8192 bytes
        ; apart). The check lines above this function expect four
        ; ds_write2st64_b32 instructions that all address through [[BASE]], with
        ; offset pairs (none)/32, 64/96, 128/160 and 192/224. Each step of 32 in
        ; those offsets matches one 2048-element stride (2048 = 32 * 64).
    328 bb:
          ; Element 0 is written straight through %arg, so no getelementptr is needed.
    329   store float 1.000000e+00, float addrspace(3)* %arg, align 4
    330   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
    331   store float 1.000000e+00, float addrspace(3)* %tmp, align 4
    332   %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
    333   store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
    334   %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
    335   store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
    336   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
    337   store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
    338   %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
    339   store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
    340   %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
    341   store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
    342   %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
    343   store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
    344   ret void
    345 }
    346 
    347 ; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
    348 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    349 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    350 
    351 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
    352 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    353 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    354 
    355 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
    356 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
    357 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]
    358 
    359 ; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
    360 ; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
    361 ; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
    362 define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
        ; Stores the float constant 1.0 through the addrspace(3) pointer %arg at
        ; element indices 1, 2049, 4097, 6145, 8193 and 10241: a 2048-float
        ; (8192-byte) stride whose first element sits one float (4 bytes) past
        ; %arg. The check lines above this function expect three
        ; ds_write2st64_b32 instructions with offset1:32, each using its own base
        ; register; on GFX9 those bases are BASE + 4, BASE + 0x4004 and
        ; BASE + 0x8004, the byte offsets of elements 1, 4097 and 8193.
    363 bb:
          ; First element is at index 1, not 0, so no store goes through %arg itself.
    364   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
    365   store float 1.000000e+00, float addrspace(3)* %tmp, align 4
    366   %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
    367   store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
    368   %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
    369   store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
    370   %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
    371   store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
    372   %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
    373   store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
    374   %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
    375   store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
    376   ret void
    377 }
    378 
    379 ; GCN-LABEL: ds_write64_combine_stride_400:
    380 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    381 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    382 
    383 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    384 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
    385 
    386 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
    387 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
    388 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
    389 ; GCN-DAG: ds_write2_b64 [[B1]],   v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
    390 define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
        ; Stores the double constant 1.0 through the addrspace(3) pointer %arg at
        ; element indices 0, 50, 100, ..., 350 (50 doubles = 400 bytes apart). The
        ; check lines above this function expect four ds_write2_b64 instructions:
        ; three through [[BASE]] with offset pairs (none)/50, 100/150 and 200/250,
        ; and one through a second base register with offset1:50. On GFX9 that
        ; second base is BASE + 0x960 (2400 bytes, i.e. element 300).
    391 bb:
          ; Element 0 is written straight through %arg, so no getelementptr is needed.
    392   store double 1.000000e+00, double addrspace(3)* %arg, align 8
    393   %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
    394   store double 1.000000e+00, double addrspace(3)* %tmp, align 8
    395   %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
    396   store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
    397   %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
    398   store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
    399   %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
    400   store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
    401   %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
    402   store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
          ; Elements 300 and 350 are the pair the checks expect to be addressed
          ; through the second base register.
    403   %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
    404   store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
    405   %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
    406   store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
    407   ret void
    408 }
    409 
    410 ; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
    411 ; GCN:     s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
    412 ; GCN:     v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
    413 
    414 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
    415 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    416 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
    417 
    418 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
    419 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
    420 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
    421 
    422 ; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
    423 ; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
    424 ; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
    425 define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
        ; Stores the double constant 1.0 through the addrspace(3) pointer %arg at
        ; element indices 1, 1025, 2049, 3073, 4097 and 5121: a 1024-double
        ; (8192-byte) stride whose first element sits one double (8 bytes) past
        ; %arg. The check lines above this function expect three
        ; ds_write2st64_b64 instructions with offset1:16 (1024 = 16 * 64), each
        ; using its own base register; on GFX9 those bases are BASE + 8,
        ; BASE + 0x4008 and BASE + 0x8008, the byte offsets of elements 1, 2049
        ; and 4097.
    426 bb:
          ; First element is at index 1, not 0, so no store goes through %arg itself.
    427   %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
    428   store double 1.000000e+00, double addrspace(3)* %tmp, align 8
    429   %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
    430   store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
    431   %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
    432   store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
    433   %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
    434   store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
    435   %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
    436   store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
    437   %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
    438   store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
    439   ret void
    440 }
    441