Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
      3 
      ; memcpy intrinsic declarations shared by the kernels in this file:
      ;   addrspace(3) -> addrspace(3) with an i32 length,
      ;   addrspace(1) -> addrspace(1) with an i64 length,
      ;   addrspace(4) -> addrspace(1) with an i64 length.
      4 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i1) nounwind
      5 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
      ; NOTE(review): the name suffix below says p2i8 for the source operand, but the
      ; declared source pointer is addrspace(4); confirm the mangled name is intended.
      6 declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(4)* nocapture, i64, i1) nounwind
      7 
      8 
      9 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
        ; 32-byte copy between two addrspace(3) pointers (LDS, per the test name) with
        ; no align attribute on either memcpy operand.
        ; The checks below expect 32 byte-wide DS reads and 32 byte-wide DS writes,
        ; i.e. one of each per copied byte. They are DAG checks, so the relative
        ; order of the matched instructions is not constrained.
     10 ; SI-DAG: ds_read_u8
     11 ; SI-DAG: ds_read_u8
     12 ; SI-DAG: ds_read_u8
     13 ; SI-DAG: ds_read_u8
     14 ; SI-DAG: ds_read_u8
     15 ; SI-DAG: ds_read_u8
     16 ; SI-DAG: ds_read_u8
     17 ; SI-DAG: ds_read_u8
     18 
     19 ; SI-DAG: ds_read_u8
     20 ; SI-DAG: ds_read_u8
     21 ; SI-DAG: ds_read_u8
     22 ; SI-DAG: ds_read_u8
     23 ; SI-DAG: ds_read_u8
     24 ; SI-DAG: ds_read_u8
     25 ; SI-DAG: ds_read_u8
     26 ; SI-DAG: ds_read_u8
     27 
     28 ; SI-DAG: ds_read_u8
     29 ; SI-DAG: ds_read_u8
     30 ; SI-DAG: ds_read_u8
     31 ; SI-DAG: ds_read_u8
     32 ; SI-DAG: ds_read_u8
     33 ; SI-DAG: ds_read_u8
     34 ; SI-DAG: ds_read_u8
     35 ; SI-DAG: ds_read_u8
     36 
     37 ; SI-DAG: ds_read_u8
     38 ; SI-DAG: ds_read_u8
     39 ; SI-DAG: ds_read_u8
     40 ; SI-DAG: ds_read_u8
     41 ; SI-DAG: ds_read_u8
     42 ; SI-DAG: ds_read_u8
     43 ; SI-DAG: ds_read_u8
     44 ; SI-DAG: ds_read_u8
     45 
     46 ; SI-DAG: ds_write_b8
     47 ; SI-DAG: ds_write_b8
     48 ; SI-DAG: ds_write_b8
     49 ; SI-DAG: ds_write_b8
     50 ; SI-DAG: ds_write_b8
     51 ; SI-DAG: ds_write_b8
     52 ; SI-DAG: ds_write_b8
     53 ; SI-DAG: ds_write_b8
     54 
     55 ; SI-DAG: ds_write_b8
     56 ; SI-DAG: ds_write_b8
     57 ; SI-DAG: ds_write_b8
     58 ; SI-DAG: ds_write_b8
     59 ; SI-DAG: ds_write_b8
     60 ; SI-DAG: ds_write_b8
     61 ; SI-DAG: ds_write_b8
     62 ; SI-DAG: ds_write_b8
     63 
     64 ; SI-DAG: ds_write_b8
     65 ; SI-DAG: ds_write_b8
     66 ; SI-DAG: ds_write_b8
     67 ; SI-DAG: ds_write_b8
     68 ; SI-DAG: ds_write_b8
     69 ; SI-DAG: ds_write_b8
     70 ; SI-DAG: ds_write_b8
     71 ; SI-DAG: ds_write_b8
     72 
     73 ; SI-DAG: ds_write_b8
     74 ; SI-DAG: ds_write_b8
     75 ; SI-DAG: ds_write_b8
     76 ; SI-DAG: ds_write_b8
     77 ; SI-DAG: ds_write_b8
     78 ; SI-DAG: ds_write_b8
     79 ; SI-DAG: ds_write_b8
     80 ; SI-DAG: ds_write_b8
     81 
     82 ; SI: s_endpgm
     83 define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
     84   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
     85   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
        ; Non-volatile (i1 false) copy of 32 bytes from %in to %out.
     86   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i1 false) nounwind
     87   ret void
     88 }
     89 
     90 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
        ; 32-byte copy between two addrspace(3) pointers, both marked align 2 at the
        ; memcpy call.
        ; The checks below expect 16 16-bit DS reads and 16 16-bit DS writes (DAG
        ; checks, so order among them is not constrained).
     91 ; SI-DAG: ds_read_u16
     92 ; SI-DAG: ds_read_u16
     93 ; SI-DAG: ds_read_u16
     94 ; SI-DAG: ds_read_u16
     95 ; SI-DAG: ds_read_u16
     96 ; SI-DAG: ds_read_u16
     97 ; SI-DAG: ds_read_u16
     98 ; SI-DAG: ds_read_u16
     99 
    100 ; SI-DAG: ds_read_u16
    101 ; SI-DAG: ds_read_u16
    102 ; SI-DAG: ds_read_u16
    103 ; SI-DAG: ds_read_u16
    104 ; SI-DAG: ds_read_u16
    105 ; SI-DAG: ds_read_u16
    106 ; SI-DAG: ds_read_u16
    107 ; SI-DAG: ds_read_u16
    108 
    109 ; SI-DAG: ds_write_b16
    110 ; SI-DAG: ds_write_b16
    111 ; SI-DAG: ds_write_b16
    112 ; SI-DAG: ds_write_b16
    113 ; SI-DAG: ds_write_b16
    114 ; SI-DAG: ds_write_b16
    115 ; SI-DAG: ds_write_b16
    116 ; SI-DAG: ds_write_b16
    117 
    118 ; SI-DAG: ds_write_b16
    119 ; SI-DAG: ds_write_b16
    120 ; SI-DAG: ds_write_b16
    121 ; SI-DAG: ds_write_b16
    122 ; SI-DAG: ds_write_b16
    123 ; SI-DAG: ds_write_b16
    124 ; SI-DAG: ds_write_b16
    125 ; SI-DAG: ds_write_b16
    126 
    127 ; SI: s_endpgm
    128 define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    129   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
    130   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
        ; Non-volatile copy of 32 bytes; the align 2 attributes are what this test varies.
    131   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 2 %bcout, i8 addrspace(3)* align 2 %bcin, i32 32, i1 false) nounwind
    132   ret void
    133 }
    134 
    135 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
        ; 32-byte copy between two addrspace(3) pointers, both marked align 4 at the
        ; memcpy call.
        ; The checks below are ordered (not DAG): four ds_read2_b32 must be matched
        ; first, then four ds_write2_b32, then the end of the kernel.
    136 ; SI: ds_read2_b32
    137 ; SI: ds_read2_b32
    138 ; SI: ds_read2_b32
    139 ; SI: ds_read2_b32
    140 
    141 ; SI: ds_write2_b32
    142 ; SI: ds_write2_b32
    143 ; SI: ds_write2_b32
    144 ; SI: ds_write2_b32
    145 
    146 ; SI: s_endpgm
    147 define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    148   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
    149   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
        ; Non-volatile copy of 32 bytes; the align 4 attributes are what this test varies.
    150   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 4 %bcout, i8 addrspace(3)* align 4 %bcin, i32 32, i1 false) nounwind
    151   ret void
    152 }
    153 
    154 ; 32-byte LDS-to-LDS copy with both memcpy operands marked align 8.
    155 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
        ; The checks below are ordered: two paired 64-bit DS reads must be matched
        ; first, then two paired 64-bit DS writes, then the end of the kernel.
    156 
    157 ; SI: ds_read2_b64
    158 ; SI: ds_read2_b64
    159 
    160 ; SI: ds_write2_b64
    161 ; SI: ds_write2_b64
    162 
    163 ; SI: s_endpgm
    164 define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    165   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
    166   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
        ; Non-volatile copy of 32 bytes; the align 8 attributes are what this test varies.
    167   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 8 %bcout, i8 addrspace(3)* align 8 %bcin, i32 32, i1 false) nounwind
    168   ret void
    169 }
    170 
    171 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
        ; 32-byte copy between two addrspace(1) pointers (global, per the test name)
        ; with no align attribute on either memcpy operand.
        ; The checks below expect 32 byte loads and 32 byte stores, written as
        ; interleaved load/store pairs. They are DAG checks, so the relative order of
        ; the matched instructions is not constrained.
    172 ; SI-DAG: buffer_load_ubyte
    173 ; SI-DAG: buffer_store_byte
    174 ; SI-DAG: buffer_load_ubyte
    175 ; SI-DAG: buffer_store_byte
    176 ; SI-DAG: buffer_load_ubyte
    177 ; SI-DAG: buffer_store_byte
    178 ; SI-DAG: buffer_load_ubyte
    179 ; SI-DAG: buffer_store_byte
    180 ; SI-DAG: buffer_load_ubyte
    181 ; SI-DAG: buffer_store_byte
    182 ; SI-DAG: buffer_load_ubyte
    183 ; SI-DAG: buffer_store_byte
    184 ; SI-DAG: buffer_load_ubyte
    185 ; SI-DAG: buffer_store_byte
    186 ; SI-DAG: buffer_load_ubyte
    187 ; SI-DAG: buffer_store_byte
    188 
    189 ; SI-DAG: buffer_load_ubyte
    190 ; SI-DAG: buffer_store_byte
    191 ; SI-DAG: buffer_load_ubyte
    192 ; SI-DAG: buffer_store_byte
    193 ; SI-DAG: buffer_load_ubyte
    194 ; SI-DAG: buffer_store_byte
    195 ; SI-DAG: buffer_load_ubyte
    196 ; SI-DAG: buffer_store_byte
    197 ; SI-DAG: buffer_load_ubyte
    198 ; SI-DAG: buffer_store_byte
    199 ; SI-DAG: buffer_load_ubyte
    200 ; SI-DAG: buffer_store_byte
    201 ; SI-DAG: buffer_load_ubyte
    202 ; SI-DAG: buffer_store_byte
    203 ; SI-DAG: buffer_load_ubyte
    204 ; SI-DAG: buffer_store_byte
    205 
    206 ; SI-DAG: buffer_load_ubyte
    207 ; SI-DAG: buffer_store_byte
    208 ; SI-DAG: buffer_load_ubyte
    209 ; SI-DAG: buffer_store_byte
    210 ; SI-DAG: buffer_load_ubyte
    211 ; SI-DAG: buffer_store_byte
    212 ; SI-DAG: buffer_load_ubyte
    213 ; SI-DAG: buffer_store_byte
    214 ; SI-DAG: buffer_load_ubyte
    215 ; SI-DAG: buffer_store_byte
    216 ; SI-DAG: buffer_load_ubyte
    217 ; SI-DAG: buffer_store_byte
    218 ; SI-DAG: buffer_load_ubyte
    219 ; SI-DAG: buffer_store_byte
    220 ; SI-DAG: buffer_load_ubyte
    221 ; SI-DAG: buffer_store_byte
    222 
    223 ; SI-DAG: buffer_load_ubyte
    224 ; SI-DAG: buffer_store_byte
    225 ; SI-DAG: buffer_load_ubyte
    226 ; SI-DAG: buffer_store_byte
    227 ; SI-DAG: buffer_load_ubyte
    228 ; SI-DAG: buffer_store_byte
    229 ; SI-DAG: buffer_load_ubyte
    230 ; SI-DAG: buffer_store_byte
    231 ; SI-DAG: buffer_load_ubyte
    232 ; SI-DAG: buffer_store_byte
    233 ; SI-DAG: buffer_load_ubyte
    234 ; SI-DAG: buffer_store_byte
    235 ; SI-DAG: buffer_load_ubyte
    236 ; SI-DAG: buffer_store_byte
    237 ; SI-DAG: buffer_load_ubyte
    238 ; SI-DAG: buffer_store_byte
    239 
    240 ; SI: s_endpgm
    241 define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    242   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
    243   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
        ; Non-volatile (i1 false) copy of 32 bytes from %in to %out; length is i64 here.
    244   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i1 false) nounwind
    245   ret void
    246 }
    247 
    248 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
        ; 32-byte copy between two addrspace(1) pointers, both marked align 2 at the
        ; memcpy call.
        ; The checks below expect 16 16-bit loads and 16 16-bit stores (DAG checks,
        ; so order among them is not constrained).
    249 ; SI-DAG: buffer_load_ushort
    250 ; SI-DAG: buffer_load_ushort
    251 ; SI-DAG: buffer_load_ushort
    252 ; SI-DAG: buffer_load_ushort
    253 ; SI-DAG: buffer_load_ushort
    254 ; SI-DAG: buffer_load_ushort
    255 ; SI-DAG: buffer_load_ushort
    256 ; SI-DAG: buffer_load_ushort
    257 ; SI-DAG: buffer_load_ushort
    258 ; SI-DAG: buffer_load_ushort
    259 ; SI-DAG: buffer_load_ushort
    260 ; SI-DAG: buffer_load_ushort
    261 ; SI-DAG: buffer_load_ushort
    262 ; SI-DAG: buffer_load_ushort
    263 ; SI-DAG: buffer_load_ushort
    264 ; SI-DAG: buffer_load_ushort
    265 
    266 ; SI-DAG: buffer_store_short
    267 ; SI-DAG: buffer_store_short
    268 ; SI-DAG: buffer_store_short
    269 ; SI-DAG: buffer_store_short
    270 ; SI-DAG: buffer_store_short
    271 ; SI-DAG: buffer_store_short
    272 ; SI-DAG: buffer_store_short
    273 ; SI-DAG: buffer_store_short
    274 ; SI-DAG: buffer_store_short
    275 ; SI-DAG: buffer_store_short
    276 ; SI-DAG: buffer_store_short
    277 ; SI-DAG: buffer_store_short
    278 ; SI-DAG: buffer_store_short
    279 ; SI-DAG: buffer_store_short
    280 ; SI-DAG: buffer_store_short
    281 ; SI-DAG: buffer_store_short
    282 
    283 ; SI: s_endpgm
    284 define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    285   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
    286   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
        ; Non-volatile copy of 32 bytes; the align 2 attributes are what this test varies.
    287   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %bcout, i8 addrspace(1)* align 2 %bcin, i64 32, i1 false) nounwind
    288   ret void
    289 }
    290 
    291 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
        ; 32-byte copy between two addrspace(1) pointers, both marked align 4 at the
        ; memcpy call.
        ; The checks below are ordered: two dwordx4 loads must be matched first, then
        ; two dwordx4 stores, then the end of the kernel.
    292 ; SI: buffer_load_dwordx4
    293 ; SI: buffer_load_dwordx4
    294 ; SI: buffer_store_dwordx4
    295 ; SI: buffer_store_dwordx4
    296 ; SI: s_endpgm
    297 define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    298   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
    299   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
        ; Non-volatile copy of 32 bytes; the align 4 attributes are what this test varies.
    300   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %bcout, i8 addrspace(1)* align 4 %bcin, i64 32, i1 false) nounwind
    301   ret void
    302 }
    303 
    304 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
        ; 32-byte copy between two addrspace(1) pointers, both marked align 8 at the
        ; memcpy call.
        ; The checks below are ordered: two dwordx4 loads must be matched first, then
        ; two dwordx4 stores, then the end of the kernel.
    305 ; SI: buffer_load_dwordx4
    306 ; SI: buffer_load_dwordx4
    307 ; SI: buffer_store_dwordx4
    308 ; SI: buffer_store_dwordx4
    309 ; SI: s_endpgm
    310 define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    311   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
    312   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
        ; Non-volatile copy of 32 bytes; the align 8 attributes are what this test varies.
    313   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 8 %bcout, i8 addrspace(1)* align 8 %bcin, i64 32, i1 false) nounwind
    314   ret void
    315 }
    316 
    317 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
        ; 32-byte copy between two addrspace(1) pointers, both marked align 16 at the
        ; memcpy call.
        ; The checks below are ordered: two dwordx4 loads must be matched first, then
        ; two dwordx4 stores, then the end of the kernel.
    318 ; SI: buffer_load_dwordx4
    319 ; SI: buffer_load_dwordx4
    320 ; SI: buffer_store_dwordx4
    321 ; SI: buffer_store_dwordx4
    322 ; SI: s_endpgm
    323 define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
        ; Reinterpret both i64 pointers as i8 pointers, the operand type the intrinsic takes.
    324   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
    325   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
        ; Non-volatile copy of 32 bytes; the align 16 attributes are what this test varies.
    326   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 16 %bcout, i8 addrspace(1)* align 16 %bcin, i64 32, i1 false) nounwind
    327   ret void
    328 }
    329 
    330 ; Test shouldConvertConstantLoadToIntImm
        ; Two private addrspace(4) constants holding the same 16-byte string
        ; ("constant string" plus a NUL terminator); they differ only in their
        ; declared alignment (4 vs 1). Each is the memcpy source of one test below.
    331 @hello.align4 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 4
    332 @hello.align1 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 1
    333 
    334 ; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
        ; Copies from the 4-byte-aligned constant string @hello.align4 to the
        ; addrspace(1) pointer %out, with both memcpy operands marked align 4.
        ; The checks first expect a PC-relative address computation that references
        ; hello.align4+20 (s_getpc_b64 / s_add_u32 / s_addc_u32, in that order), then,
        ; in any order, two s_load_dwordx4, one s_load_dwordx2 and two
        ; buffer_store_dwordx4.
        ; NOTE(review): presumably the dwordx2 scalar load fetches the %out kernel
        ; argument rather than string data - confirm against the generated code.
    335 ; SI: s_getpc_b64
    336 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4+20
    337 ; SI: s_addc_u32
    338 ; SI-DAG: s_load_dwordx4
    339 ; SI-DAG: s_load_dwordx4
    340 ; SI-DAG: s_load_dwordx2
    341 ; SI-DAG: buffer_store_dwordx4
    342 ; SI-DAG: buffer_store_dwordx4
    343 define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
        ; View the [16 x i8] global as a plain i8 pointer for the intrinsic.
    344   %str = bitcast [16 x i8] addrspace(4)* @hello.align4 to i8 addrspace(4)*
        ; NOTE(review): the copy length is 32 bytes while the source global is
        ; [16 x i8]; confirm the over-long length is intentional for this test.
    345   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(4)* align 4 %str, i64 32, i1 false)
    346   ret void
    347 }
    348 
    349 ; FUNC-LABEL: {{^}}test_memcpy_const_string_align1:
        ; Copies from the 1-byte-aligned constant string @hello.align1 to the
        ; addrspace(1) pointer %out, with no align attribute on either memcpy operand.
        ; The checks require that no buffer_load appears before a v_mov_b32 of the
        ; immediate 0x69 (ASCII 'i', a character of the source string), and then
        ; expect 16 byte stores in order.
        ; NOTE(review): presumably the string bytes are folded into immediates
        ; instead of being loaded from memory - confirm against the generated code.
    350 ; SI-NOT: buffer_load
    351 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x69
    352 ; SI: buffer_store_byte
    353 ; SI: buffer_store_byte
    354 ; SI: buffer_store_byte
    355 ; SI: buffer_store_byte
    356 ; SI: buffer_store_byte
    357 ; SI: buffer_store_byte
    358 ; SI: buffer_store_byte
    359 ; SI: buffer_store_byte
    360 ; SI: buffer_store_byte
    361 ; SI: buffer_store_byte
    362 ; SI: buffer_store_byte
    363 ; SI: buffer_store_byte
    364 ; SI: buffer_store_byte
    365 ; SI: buffer_store_byte
    366 ; SI: buffer_store_byte
    367 ; SI: buffer_store_byte
    368 define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
        ; View the [16 x i8] global as a plain i8 pointer for the intrinsic.
    369   %str = bitcast [16 x i8] addrspace(4)* @hello.align1 to i8 addrspace(4)*
        ; NOTE(review): the copy length is 32 bytes while the source global is
        ; [16 x i8], and only 16 byte stores are checked above; confirm intentional.
    370   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(4)* %str, i64 32, i1 false)
    371   ret void
    372 }
    373