; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; Codegen checks for small, constant-size (32 byte) llvm.memcpy calls on
; amdgcn. Each test passes a different alignment argument to the intrinsic
; and checks which load/store instructions the copy is expanded into:
;   * addrspace(3) -> addrspace(3) copies (the "lds_to_lds" tests) are
;     checked against ds_read*/ds_write* instructions;
;   * addrspace(1) -> addrspace(1) copies (the "global_to_global" tests)
;     are checked against buffer_load*/buffer_store* instructions.
; Both llc invocations above share the same check prefixes, so the same
; expectations apply to the SI and tonga targets.

declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind


; 32-byte addrspace(3) copy with alignment 1: expects 32 single-byte reads
; and 32 single-byte writes, in any order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
  ret void
}

; 32-byte addrspace(3) copy with alignment 2: expects 16 16-bit reads and
; 16 16-bit writes, in any order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16

; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16

; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16

; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
  ret void
}

; 32-byte addrspace(3) copy with alignment 4: expects four ds_read2_b32
; followed by four ds_write2_b32, in that order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
; SI: ds_read2_b32
; SI: ds_read2_b32
; SI: ds_read2_b32
; SI: ds_read2_b32

; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
  ret void
}

; 32-byte addrspace(3) copy with alignment 8: expects the 64-bit paired
; forms, two ds_read2_b64 followed by two ds_write2_b64, in that order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:

; SI: ds_read2_b64
; SI: ds_read2_b64

; SI: ds_write2_b64
; SI: ds_write2_b64

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
  ret void
}

; 32-byte addrspace(1) copy with alignment 1: expects 32 byte loads and
; 32 byte stores, in any order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
  ret void
}

; 32-byte addrspace(1) copy with alignment 2: expects 16 short loads and
; 16 short stores, in any order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort

; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short

; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
  ret void
}

; 32-byte addrspace(1) copy with alignment 4: expects two dwordx4 loads
; followed by two dwordx4 stores, in that order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
  ret void
}

; 32-byte addrspace(1) copy with alignment 8: same expectation as the
; alignment-4 case (two dwordx4 loads, then two dwordx4 stores).
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
  ret void
}

; 32-byte addrspace(1) copy with alignment 16: same expectation as the
; alignment-4 case (two dwordx4 loads, then two dwordx4 stores).
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
  ret void
}