Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
      3 
      4 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind ; addrspace(3) -> addrspace(3) memcpy with an i32 length; per the legacy LangRef form the operands are dest, src, len, align, isvolatile
      5 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind ; addrspace(1) -> addrspace(1) memcpy with an i64 length; same operand order as the declaration above
      6 
      7 
      8 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
      9 ; SI-DAG: ds_read_u8
     10 ; SI-DAG: ds_read_u8
     11 ; SI-DAG: ds_read_u8
     12 ; SI-DAG: ds_read_u8
     13 ; SI-DAG: ds_read_u8
     14 ; SI-DAG: ds_read_u8
     15 ; SI-DAG: ds_read_u8
     16 ; SI-DAG: ds_read_u8
     17 
     18 ; SI-DAG: ds_read_u8
     19 ; SI-DAG: ds_read_u8
     20 ; SI-DAG: ds_read_u8
     21 ; SI-DAG: ds_read_u8
     22 ; SI-DAG: ds_read_u8
     23 ; SI-DAG: ds_read_u8
     24 ; SI-DAG: ds_read_u8
     25 ; SI-DAG: ds_read_u8
     26 
     27 ; SI-DAG: ds_read_u8
     28 ; SI-DAG: ds_read_u8
     29 ; SI-DAG: ds_read_u8
     30 ; SI-DAG: ds_read_u8
     31 ; SI-DAG: ds_read_u8
     32 ; SI-DAG: ds_read_u8
     33 ; SI-DAG: ds_read_u8
     34 ; SI-DAG: ds_read_u8
     35 
     36 ; SI-DAG: ds_read_u8
     37 ; SI-DAG: ds_read_u8
     38 ; SI-DAG: ds_read_u8
     39 ; SI-DAG: ds_read_u8
     40 ; SI-DAG: ds_read_u8
     41 ; SI-DAG: ds_read_u8
     42 ; SI-DAG: ds_read_u8
     43 ; SI-DAG: ds_read_u8
     44 
     45 ; SI-DAG: ds_write_b8
     46 ; SI-DAG: ds_write_b8
     47 ; SI-DAG: ds_write_b8
     48 ; SI-DAG: ds_write_b8
     49 ; SI-DAG: ds_write_b8
     50 ; SI-DAG: ds_write_b8
     51 ; SI-DAG: ds_write_b8
     52 ; SI-DAG: ds_write_b8
     53 
     54 ; SI-DAG: ds_write_b8
     55 ; SI-DAG: ds_write_b8
     56 ; SI-DAG: ds_write_b8
     57 ; SI-DAG: ds_write_b8
     58 ; SI-DAG: ds_write_b8
     59 ; SI-DAG: ds_write_b8
     60 ; SI-DAG: ds_write_b8
     61 ; SI-DAG: ds_write_b8
     62 
     63 ; SI-DAG: ds_write_b8
     64 ; SI-DAG: ds_write_b8
     65 ; SI-DAG: ds_write_b8
     66 ; SI-DAG: ds_write_b8
     67 ; SI-DAG: ds_write_b8
     68 ; SI-DAG: ds_write_b8
     69 ; SI-DAG: ds_write_b8
     70 ; SI-DAG: ds_write_b8
     71 
     72 ; SI-DAG: ds_write_b8
     73 ; SI-DAG: ds_write_b8
     74 ; SI-DAG: ds_write_b8
     75 ; SI-DAG: ds_write_b8
     76 ; SI-DAG: ds_write_b8
     77 ; SI-DAG: ds_write_b8
     78 ; SI-DAG: ds_write_b8
     79 ; SI-DAG: ds_write_b8
     80 
     81 ; SI: s_endpgm
     82 define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(3); memcpy align operand is 1
     83   %src.i8 = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* ; source pointer retyped to i8* as the intrinsic signature requires
     84   %dst.i8 = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* ; destination pointer retyped to i8*
     85   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %dst.i8, i8 addrspace(3)* %src.i8, i32 32, i32 1, i1 false) nounwind ; operands: dest, src, len = 32, align = 1, isvolatile = false
     86   ret void
     87 }
     88 
     89 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
     90 ; SI-DAG: ds_read_u16
     91 ; SI-DAG: ds_read_u16
     92 ; SI-DAG: ds_read_u16
     93 ; SI-DAG: ds_read_u16
     94 ; SI-DAG: ds_read_u16
     95 ; SI-DAG: ds_read_u16
     96 ; SI-DAG: ds_read_u16
     97 ; SI-DAG: ds_read_u16
     98 
     99 ; SI-DAG: ds_read_u16
    100 ; SI-DAG: ds_read_u16
    101 ; SI-DAG: ds_read_u16
    102 ; SI-DAG: ds_read_u16
    103 ; SI-DAG: ds_read_u16
    104 ; SI-DAG: ds_read_u16
    105 ; SI-DAG: ds_read_u16
    106 ; SI-DAG: ds_read_u16
    107 
    108 ; SI-DAG: ds_write_b16
    109 ; SI-DAG: ds_write_b16
    110 ; SI-DAG: ds_write_b16
    111 ; SI-DAG: ds_write_b16
    112 ; SI-DAG: ds_write_b16
    113 ; SI-DAG: ds_write_b16
    114 ; SI-DAG: ds_write_b16
    115 ; SI-DAG: ds_write_b16
    116 
    117 ; SI-DAG: ds_write_b16
    118 ; SI-DAG: ds_write_b16
    119 ; SI-DAG: ds_write_b16
    120 ; SI-DAG: ds_write_b16
    121 ; SI-DAG: ds_write_b16
    122 ; SI-DAG: ds_write_b16
    123 ; SI-DAG: ds_write_b16
    124 ; SI-DAG: ds_write_b16
    125 
    126 ; SI: s_endpgm
    127 define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(3); memcpy align operand is 2
    128   %src.i8 = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* ; source pointer retyped to i8* as the intrinsic signature requires
    129   %dst.i8 = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* ; destination pointer retyped to i8*
    130   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %dst.i8, i8 addrspace(3)* %src.i8, i32 32, i32 2, i1 false) nounwind ; operands: dest, src, len = 32, align = 2, isvolatile = false
    131   ret void
    132 }
    133 
    134 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
    135 ; SI: ds_read2_b32
    136 ; SI: ds_read2_b32
    137 ; SI: ds_read2_b32
    138 ; SI: ds_read2_b32
    139 
    140 ; SI: ds_write2_b32
    141 ; SI: ds_write2_b32
    142 ; SI: ds_write2_b32
    143 ; SI: ds_write2_b32
    144 
    145 ; SI: s_endpgm
    146 define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(3); memcpy align operand is 4
    147   %src.i8 = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* ; source pointer retyped to i8* as the intrinsic signature requires
    148   %dst.i8 = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* ; destination pointer retyped to i8*
    149   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %dst.i8, i8 addrspace(3)* %src.i8, i32 32, i32 4, i1 false) nounwind ; operands: dest, src, len = 32, align = 4, isvolatile = false
    150   ret void
    151 }
    152 
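    153 ; FIXME: Use 64-bit ops (NOTE(review): the checks below already expect ds_read2_b64/ds_write2_b64, so this FIXME looks stale; confirm before removing)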
    154 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
    155 
    156 ; SI: ds_read2_b64
    157 ; SI: ds_read2_b64
    158 
    159 ; SI: ds_write2_b64
    160 ; SI: ds_write2_b64
    161 
    162 ; SI: s_endpgm
    163 define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(3); memcpy align operand is 8
    164   %src.i8 = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* ; source pointer retyped to i8* as the intrinsic signature requires
    165   %dst.i8 = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* ; destination pointer retyped to i8*
    166   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %dst.i8, i8 addrspace(3)* %src.i8, i32 32, i32 8, i1 false) nounwind ; operands: dest, src, len = 32, align = 8, isvolatile = false
    167   ret void
    168 }
    169 
    170 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
    171 ; SI-DAG: buffer_load_ubyte
    172 ; SI-DAG: buffer_store_byte
    173 ; SI-DAG: buffer_load_ubyte
    174 ; SI-DAG: buffer_store_byte
    175 ; SI-DAG: buffer_load_ubyte
    176 ; SI-DAG: buffer_store_byte
    177 ; SI-DAG: buffer_load_ubyte
    178 ; SI-DAG: buffer_store_byte
    179 ; SI-DAG: buffer_load_ubyte
    180 ; SI-DAG: buffer_store_byte
    181 ; SI-DAG: buffer_load_ubyte
    182 ; SI-DAG: buffer_store_byte
    183 ; SI-DAG: buffer_load_ubyte
    184 ; SI-DAG: buffer_store_byte
    185 ; SI-DAG: buffer_load_ubyte
    186 ; SI-DAG: buffer_store_byte
    187 
    188 ; SI-DAG: buffer_load_ubyte
    189 ; SI-DAG: buffer_store_byte
    190 ; SI-DAG: buffer_load_ubyte
    191 ; SI-DAG: buffer_store_byte
    192 ; SI-DAG: buffer_load_ubyte
    193 ; SI-DAG: buffer_store_byte
    194 ; SI-DAG: buffer_load_ubyte
    195 ; SI-DAG: buffer_store_byte
    196 ; SI-DAG: buffer_load_ubyte
    197 ; SI-DAG: buffer_store_byte
    198 ; SI-DAG: buffer_load_ubyte
    199 ; SI-DAG: buffer_store_byte
    200 ; SI-DAG: buffer_load_ubyte
    201 ; SI-DAG: buffer_store_byte
    202 ; SI-DAG: buffer_load_ubyte
    203 ; SI-DAG: buffer_store_byte
    204 
    205 ; SI-DAG: buffer_load_ubyte
    206 ; SI-DAG: buffer_store_byte
    207 ; SI-DAG: buffer_load_ubyte
    208 ; SI-DAG: buffer_store_byte
    209 ; SI-DAG: buffer_load_ubyte
    210 ; SI-DAG: buffer_store_byte
    211 ; SI-DAG: buffer_load_ubyte
    212 ; SI-DAG: buffer_store_byte
    213 ; SI-DAG: buffer_load_ubyte
    214 ; SI-DAG: buffer_store_byte
    215 ; SI-DAG: buffer_load_ubyte
    216 ; SI-DAG: buffer_store_byte
    217 ; SI-DAG: buffer_load_ubyte
    218 ; SI-DAG: buffer_store_byte
    219 ; SI-DAG: buffer_load_ubyte
    220 ; SI-DAG: buffer_store_byte
    221 
    222 ; SI-DAG: buffer_load_ubyte
    223 ; SI-DAG: buffer_store_byte
    224 ; SI-DAG: buffer_load_ubyte
    225 ; SI-DAG: buffer_store_byte
    226 ; SI-DAG: buffer_load_ubyte
    227 ; SI-DAG: buffer_store_byte
    228 ; SI-DAG: buffer_load_ubyte
    229 ; SI-DAG: buffer_store_byte
    230 ; SI-DAG: buffer_load_ubyte
    231 ; SI-DAG: buffer_store_byte
    232 ; SI-DAG: buffer_load_ubyte
    233 ; SI-DAG: buffer_store_byte
    234 ; SI-DAG: buffer_load_ubyte
    235 ; SI-DAG: buffer_store_byte
    236 ; SI-DAG: buffer_load_ubyte
    237 ; SI-DAG: buffer_store_byte
    238 
    239 ; SI: s_endpgm
    240 define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(1); memcpy align operand is 1
    241   %src.i8 = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* ; source pointer retyped to i8* as the intrinsic signature requires
    242   %dst.i8 = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* ; destination pointer retyped to i8*
    243   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst.i8, i8 addrspace(1)* %src.i8, i64 32, i32 1, i1 false) nounwind ; operands: dest, src, len = 32 (i64), align = 1, isvolatile = false
    244   ret void
    245 }
    246 
    247 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
    248 ; SI-DAG: buffer_load_ushort
    249 ; SI-DAG: buffer_load_ushort
    250 ; SI-DAG: buffer_load_ushort
    251 ; SI-DAG: buffer_load_ushort
    252 ; SI-DAG: buffer_load_ushort
    253 ; SI-DAG: buffer_load_ushort
    254 ; SI-DAG: buffer_load_ushort
    255 ; SI-DAG: buffer_load_ushort
    256 ; SI-DAG: buffer_load_ushort
    257 ; SI-DAG: buffer_load_ushort
    258 ; SI-DAG: buffer_load_ushort
    259 ; SI-DAG: buffer_load_ushort
    260 ; SI-DAG: buffer_load_ushort
    261 ; SI-DAG: buffer_load_ushort
    262 ; SI-DAG: buffer_load_ushort
    263 ; SI-DAG: buffer_load_ushort
    264 
    265 ; SI-DAG: buffer_store_short
    266 ; SI-DAG: buffer_store_short
    267 ; SI-DAG: buffer_store_short
    268 ; SI-DAG: buffer_store_short
    269 ; SI-DAG: buffer_store_short
    270 ; SI-DAG: buffer_store_short
    271 ; SI-DAG: buffer_store_short
    272 ; SI-DAG: buffer_store_short
    273 ; SI-DAG: buffer_store_short
    274 ; SI-DAG: buffer_store_short
    275 ; SI-DAG: buffer_store_short
    276 ; SI-DAG: buffer_store_short
    277 ; SI-DAG: buffer_store_short
    278 ; SI-DAG: buffer_store_short
    279 ; SI-DAG: buffer_store_short
    280 ; SI-DAG: buffer_store_short
    281 
    282 ; SI: s_endpgm
    283 define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(1); memcpy align operand is 2
    284   %src.i8 = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* ; source pointer retyped to i8* as the intrinsic signature requires
    285   %dst.i8 = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* ; destination pointer retyped to i8*
    286   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst.i8, i8 addrspace(1)* %src.i8, i64 32, i32 2, i1 false) nounwind ; operands: dest, src, len = 32 (i64), align = 2, isvolatile = false
    287   ret void
    288 }
    289 
    290 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
    291 ; SI: buffer_load_dwordx4
    292 ; SI: buffer_load_dwordx4
    293 ; SI: buffer_store_dwordx4
    294 ; SI: buffer_store_dwordx4
    295 ; SI: s_endpgm
    296 define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(1); memcpy align operand is 4
    297   %src.i8 = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* ; source pointer retyped to i8* as the intrinsic signature requires
    298   %dst.i8 = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* ; destination pointer retyped to i8*
    299   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst.i8, i8 addrspace(1)* %src.i8, i64 32, i32 4, i1 false) nounwind ; operands: dest, src, len = 32 (i64), align = 4, isvolatile = false
    300   ret void
    301 }
    302 
    303 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
    304 ; SI: buffer_load_dwordx4
    305 ; SI: buffer_load_dwordx4
    306 ; SI: buffer_store_dwordx4
    307 ; SI: buffer_store_dwordx4
    308 ; SI: s_endpgm
    309 define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(1); memcpy align operand is 8
    310   %src.i8 = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* ; source pointer retyped to i8* as the intrinsic signature requires
    311   %dst.i8 = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* ; destination pointer retyped to i8*
    312   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst.i8, i8 addrspace(1)* %src.i8, i64 32, i32 8, i1 false) nounwind ; operands: dest, src, len = 32 (i64), align = 8, isvolatile = false
    313   ret void
    314 }
    315 
    316 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
    317 ; SI: buffer_load_dwordx4
    318 ; SI: buffer_load_dwordx4
    319 ; SI: buffer_store_dwordx4
    320 ; SI: buffer_store_dwordx4
    321 ; SI: s_endpgm
    322 define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; copies 32 bytes from %in to %out in addrspace(1); memcpy align operand is 16
    323   %src.i8 = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* ; source pointer retyped to i8* as the intrinsic signature requires
    324   %dst.i8 = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* ; destination pointer retyped to i8*
    325   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst.i8, i8 addrspace(1)* %src.i8, i64 32, i32 16, i1 false) nounwind ; operands: dest, src, len = 32 (i64), align = 16, isvolatile = false
    326   ret void
    327 }
    328