Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
      3 
      4 ; SI-LABEL: {{^}}unaligned_load_store_i16_local:
      5 ; SI: ds_read_u8
      6 ; SI: ds_read_u8
      7 ; SI: ds_write_b8
      8 ; SI: ds_write_b8
      9 ; SI: s_endpgm
     10 define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { ; i16 copy within addrspace(3), both accesses align 1
     11   %half = load i16, i16 addrspace(3)* %p, align 1 ; 1-byte alignment on a 2-byte value
     12   store i16 %half, i16 addrspace(3)* %r, align 1 ; store the loaded value with the same under-alignment
     13   ret void
     14 }
     15 
     16 ; SI-LABEL: {{^}}unaligned_load_store_i16_global:
     17 ; SI: buffer_load_ubyte
     18 ; SI: buffer_load_ubyte
     19 ; SI: buffer_store_byte
     20 ; SI: buffer_store_byte
     21 ; SI: s_endpgm
     22 define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { ; i16 copy within addrspace(1), both accesses align 1
     23   %half = load i16, i16 addrspace(1)* %p, align 1 ; 1-byte alignment on a 2-byte value
     24   store i16 %half, i16 addrspace(1)* %r, align 1 ; store the loaded value with the same under-alignment
     25   ret void
     26 }
     27 
     28 ; SI-LABEL: {{^}}unaligned_load_store_i32_local:
     29 ; SI: ds_read_u8
     30 ; SI: ds_read_u8
     31 ; SI: ds_read_u8
     32 ; SI: ds_read_u8
     33 ; SI: ds_write_b8
     34 ; SI: ds_write_b8
     35 ; SI: ds_write_b8
     36 ; SI: ds_write_b8
     37 ; SI: s_endpgm
     38 define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { ; i32 copy within addrspace(3), both accesses align 1
     39   %word = load i32, i32 addrspace(3)* %p, align 1 ; 1-byte alignment on a 4-byte value
     40   store i32 %word, i32 addrspace(3)* %r, align 1 ; store the loaded value with the same under-alignment
     41   ret void
     42 }
     43 
     44 ; SI-LABEL: {{^}}unaligned_load_store_i32_global:
     45 ; SI: buffer_load_ubyte
     46 ; SI: buffer_load_ubyte
     47 ; SI: buffer_load_ubyte
     48 ; SI: buffer_load_ubyte
     49 ; SI: buffer_store_byte
     50 ; SI: buffer_store_byte
     51 ; SI: buffer_store_byte
     52 ; SI: buffer_store_byte
     53 define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { ; i32 copy within addrspace(1), both accesses align 1
     54   %word = load i32, i32 addrspace(1)* %p, align 1 ; 1-byte alignment on a 4-byte value
     55   store i32 %word, i32 addrspace(1)* %r, align 1 ; store the loaded value with the same under-alignment
     56   ret void
     57 }
     58 
     59 ; SI-LABEL: {{^}}unaligned_load_store_i64_local:
     60 ; SI: ds_read_u8
     61 ; SI: ds_read_u8
     62 ; SI: ds_read_u8
     63 ; SI: ds_read_u8
     64 ; SI: ds_read_u8
     65 ; SI: ds_read_u8
     66 ; SI: ds_read_u8
     67 ; SI: ds_read_u8
     68 ; SI: ds_write_b8
     69 ; SI: ds_write_b8
     70 ; SI: ds_write_b8
     71 ; SI: ds_write_b8
     72 ; SI: ds_write_b8
     73 ; SI: ds_write_b8
     74 ; SI: ds_write_b8
     75 ; SI: ds_write_b8
     76 ; SI: s_endpgm
     77 define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { ; i64 copy within addrspace(3), both accesses align 1
     78   %quad = load i64, i64 addrspace(3)* %p, align 1 ; 1-byte alignment on an 8-byte value
     79   store i64 %quad, i64 addrspace(3)* %r, align 1 ; store the loaded value with the same under-alignment
     80   ret void
     81 }
     82 
     83 ; SI-LABEL: {{^}}unaligned_load_store_i64_global:
     84 ; SI: buffer_load_ubyte
     85 ; SI: buffer_load_ubyte
     86 ; SI: buffer_load_ubyte
     87 ; SI: buffer_load_ubyte
     88 ; SI: buffer_load_ubyte
     89 ; SI: buffer_load_ubyte
     90 ; SI: buffer_load_ubyte
     91 ; SI: buffer_load_ubyte
     92 ; SI: buffer_store_byte
     93 ; SI: buffer_store_byte
     94 ; SI: buffer_store_byte
     95 ; SI: buffer_store_byte
     96 ; SI: buffer_store_byte
     97 ; SI: buffer_store_byte
     98 ; SI: buffer_store_byte
     99 ; SI: buffer_store_byte
    100 define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { ; i64 copy within addrspace(1), both accesses align 1
    101   %quad = load i64, i64 addrspace(1)* %p, align 1 ; 1-byte alignment on an 8-byte value
    102   store i64 %quad, i64 addrspace(1)* %r, align 1 ; store the loaded value with the same under-alignment
    103   ret void
    104 }
    105 
    106 ; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
    107 ; SI: ds_read_u8
    108 ; SI: ds_read_u8
    109 ; SI: ds_read_u8
    110 ; SI: ds_read_u8
    111 
    112 ; SI: ds_read_u8
    113 ; SI: ds_read_u8
    114 ; SI: ds_read_u8
    115 ; SI: ds_read_u8
    116 
    117 ; SI: ds_read_u8
    118 ; SI: ds_read_u8
    119 ; SI: ds_read_u8
    120 ; SI: ds_read_u8
    121 
    122 ; SI: ds_read_u8
    123 ; SI: ds_read_u8
    124 ; SI: ds_read_u8
    125 ; SI: ds_read_u8
    126 
    127 ; SI: ds_write_b8
    128 ; SI: ds_write_b8
    129 ; SI: ds_write_b8
    130 ; SI: ds_write_b8
    131 
    132 ; SI: ds_write_b8
    133 ; SI: ds_write_b8
    134 ; SI: ds_write_b8
    135 ; SI: ds_write_b8
    136 
    137 ; SI: ds_write_b8
    138 ; SI: ds_write_b8
    139 ; SI: ds_write_b8
    140 ; SI: ds_write_b8
    141 
    142 ; SI: ds_write_b8
    143 ; SI: ds_write_b8
    144 ; SI: ds_write_b8
    145 ; SI: ds_write_b8
    146 ; SI: s_endpgm
    147 define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { ; <4 x i32> copy within addrspace(3), both accesses align 1
    148   %vec = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 ; 1-byte alignment on a 16-byte vector
    149   store <4 x i32> %vec, <4 x i32> addrspace(3)* %r, align 1 ; store the loaded vector with the same under-alignment
    150   ret void
    151 }
    152 
    153 ; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
    154 ; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global:
    155 ; FIXME-SI: buffer_load_ubyte
    156 ; FIXME-SI: buffer_load_ubyte
    157 ; FIXME-SI: buffer_load_ubyte
    158 ; FIXME-SI: buffer_load_ubyte
    159 ; FIXME-SI: buffer_load_ubyte
    160 ; FIXME-SI: buffer_load_ubyte
    161 ; FIXME-SI: buffer_load_ubyte
    162 ; FIXME-SI: buffer_load_ubyte
    163 ; FIXME-SI: buffer_load_ubyte
    164 ; FIXME-SI: buffer_load_ubyte
    165 ; FIXME-SI: buffer_load_ubyte
    166 ; FIXME-SI: buffer_load_ubyte
    167 ; FIXME-SI: buffer_load_ubyte
    168 ; FIXME-SI: buffer_load_ubyte
    169 ; FIXME-SI: buffer_load_ubyte
    170 ; FIXME-SI: buffer_load_ubyte
    171 define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind { ; <4 x i32> copy within addrspace(1), both accesses align 1; its checks are currently disabled (FIXME prefix)
    172   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 ; 1-byte alignment on a 16-byte vector
    173   store <4 x i32> %vec, <4 x i32> addrspace(1)* %r, align 1 ; store the loaded vector with the same under-alignment
    174   ret void
    175 }
    176 
    177 ; SI-LABEL: {{^}}load_lds_i64_align_4:
    178 ; SI: ds_read2_b32
    179 ; SI: s_endpgm
    180 define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { ; 4-byte-aligned i64 load from addrspace(3), result stored to addrspace(1)
    181   %loaded = load i64, i64 addrspace(3)* %in, align 4 ; only 4-byte aligned although the value is 8 bytes wide
    182   store i64 %loaded, i64 addrspace(1)* %out, align 8 ; naturally aligned store of the result
    183   ret void
    184 }
    185 
    186 ; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset:
    187 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
    188 ; SI: s_endpgm
    189 define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { ; 4-byte-aligned i64 load from addrspace(3) at a constant element offset
    190   %elt4 = getelementptr i64, i64 addrspace(3)* %in, i32 4 ; advance by four i64 elements, i.e. 32 bytes
    191   %loaded = load i64, i64 addrspace(3)* %elt4, align 4 ; only 4-byte aligned although the value is 8 bytes wide
    192   store i64 %loaded, i64 addrspace(1)* %out, align 8 ; naturally aligned store of the result
    193   ret void
    194 }
    195 
    196 ; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
    197 ; This tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
    198 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
    199 ; SI: s_endpgm
    200 define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { ; 4-byte-aligned i64 load whose address is 255 i32 elements past %in
    201   %as.i32 = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* ; reinterpret so the offset can be expressed in i32 units
    202   %i32.255 = getelementptr i32, i32 addrspace(3)* %as.i32, i32 255 ; 255 i32 elements = 1020 bytes
    203   %as.i64 = bitcast i32 addrspace(3)* %i32.255 to i64 addrspace(3)* ; back to an i64 pointer for the load
    204   %loaded = load i64, i64 addrspace(3)* %as.i64, align 4 ; the two 32-bit halves sit at i32 indices 255 and 256
    205   store i64 %loaded, i64 addrspace(1)* %out, align 8 ; naturally aligned store of the result
    206   ret void
    207 }
    208 
    209 ; SI-LABEL: {{^}}load_lds_i64_align_1:
    210 ; SI: ds_read_u8
    211 ; SI: ds_read_u8
    212 ; SI: ds_read_u8
    213 ; SI: ds_read_u8
    214 ; SI: ds_read_u8
    215 ; SI: ds_read_u8
    216 ; SI: ds_read_u8
    217 ; SI: ds_read_u8
    218 ; SI: buffer_store_dwordx2
    219 ; SI: s_endpgm
    220 
    221 define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { ; 1-byte-aligned i64 load from addrspace(3), result stored to addrspace(1)
    222   %loaded = load i64, i64 addrspace(3)* %in, align 1 ; 1-byte alignment on an 8-byte value
    223   store i64 %loaded, i64 addrspace(1)* %out, align 8 ; the store, unlike the load, is naturally aligned
    224   ret void
    225 }
    226 
    227 ; SI-LABEL: {{^}}store_lds_i64_align_4:
    228 ; SI: ds_write2_b32
    229 ; SI: s_endpgm
    230 define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { ; 4-byte-aligned i64 store of the %val argument into addrspace(3)
    231   store i64 %val, i64 addrspace(3)* %out, align 4 ; only 4-byte aligned although the value is 8 bytes wide
    232   ret void
    233 }
    234 
    235 ; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset:
    236 ; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
    237 ; SI: s_endpgm
    238 define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { ; 4-byte-aligned i64 store of constant 0 at a constant element offset in addrspace(3)
    239   %elt4 = getelementptr i64, i64 addrspace(3)* %out, i32 4 ; advance by four i64 elements, i.e. 32 bytes
    240   store i64 0, i64 addrspace(3)* %elt4, align 4 ; only 4-byte aligned although the value is 8 bytes wide
    241   ret void
    242 }
    243 
    244 ; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
    245 ; This tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
    246 ; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
    247 ; SI: s_endpgm
    248 define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { ; 4-byte-aligned i64 store of constant 0 at 255 i32 elements past %out
    249   %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* ; reinterpret so the offset can be expressed in i32 units
    250   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 ; 255 i32 elements = 1020 bytes
    251   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* ; back to an i64 pointer for the store
    252   store i64 0, i64 addrspace(3)* %ptri64, align 4 ; store through the offset pointer (was %out, which left the address computation dead and the split-offset case untested)
    253   ret void
    254 }
    255