; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s

; Both RUN lines use the same check prefix, so every expectation in this file
; has to hold for both -mcpu values.

; align-1 i16 load/store through addrspace(3) pointers: expect two byte reads
; followed by two byte writes.
; SI-LABEL: {{^}}unaligned_load_store_i16_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
  %v = load i16, i16 addrspace(3)* %p, align 1
  store i16 %v, i16 addrspace(3)* %r, align 1
  ret void
}

; align-1 i16 load/store through addrspace(1) pointers: expect two byte loads
; followed by two byte stores.
; SI-LABEL: {{^}}unaligned_load_store_i16_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
  %v = load i16, i16 addrspace(1)* %p, align 1
  store i16 %v, i16 addrspace(1)* %r, align 1
  ret void
}

; align-1 i32 load/store through addrspace(3) pointers: expect four byte reads
; followed by four byte writes.
; SI-LABEL: {{^}}unaligned_load_store_i32_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}

; align-1 i32 load/store through addrspace(1) pointers: expect four byte loads
; followed by four byte stores.
; SI-LABEL: {{^}}unaligned_load_store_i32_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}

; align-1 i64 load/store through addrspace(3) pointers: expect eight byte reads
; followed by eight byte writes.
; SI-LABEL: {{^}}unaligned_load_store_i64_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
  %v = load i64, i64 addrspace(3)* %p, align 1
  store i64 %v, i64 addrspace(3)* %r, align 1
  ret void
}

; align-1 i64 load/store through addrspace(1) pointers: expect eight byte loads
; followed by eight byte stores.
; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}

; align-1 <4 x i32> load/store through addrspace(3) pointers: expect sixteen
; byte reads followed by sixteen byte writes (grouped by four below).
; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
  ret void
}

; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global:
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}

; align-4 i64 load through an addrspace(3) pointer: expect one ds_read2_b32.
; SI-LABEL: {{^}}load_lds_i64_align_4:
; SI: ds_read2_b32
; SI: s_endpgm
define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; Same as above, but the pointer is advanced by four i64 elements first; the
; two halves are expected at offset0:8 and offset1:9.
; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset:
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
; SI: s_endpgm
define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
; Tests the case where the lo offset is 8-bits, but the hi offset is 9-bits
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
; SI: s_endpgm
define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; align-1 i64 load through an addrspace(3) pointer, stored with align 8 through
; an addrspace(1) pointer: expect eight byte reads and one dwordx2 store.
; SI-LABEL: {{^}}load_lds_i64_align_1:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: buffer_store_dwordx2
; SI: s_endpgm

define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; align-4 i64 store through an addrspace(3) pointer: expect one ds_write2_b32.
; SI-LABEL: {{^}}store_lds_i64_align_4:
; SI: ds_write2_b32
; SI: s_endpgm
define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}

; Same as above, but the pointer is advanced by four i64 elements first; the
; two halves are expected at offset0:8 and offset1:9.
; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset:
; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; SI: s_endpgm
define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}

; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
; Tests the case where the lo offset is 8-bits, but the hi offset is 9-bits
; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: s_endpgm
define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  ; The store has to go through %ptri64 (i32 element 255 past %out), mirroring
  ; the load variant of this test; storing straight to %out would leave the
  ; address computation above dead and never reach the split-offset case.
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}