; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s

; Run at -O0 in case optimizations are added that specialize away generic
; pointer accesses.

; These testcases may become useless once optimizations that remove generic
; pointers are added.

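; As a rough, hypothetical sketch (not part of the tested IR), a pass such as
; InferAddressSpaces could rewrite the generic access pattern used throughout
; this file back into a direct global-memory access, making the flat
; instruction unnecessary:
;
;   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
;   store i32 %x, i32* %fptr                  ; selects to flat_store_dword
;     ; could become:
;   store i32 %x, i32 addrspace(1)* %gptr     ; selects to a global/buffer store
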
; CHECK-LABEL: {{^}}store_flat_i32:
; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]],
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  store volatile i32 %x, i32* %fptr, align 4
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i64:
; CHECK: flat_store_dwordx2
define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  store volatile i64 %x, i64* %fptr, align 8
  ret void
}

; CHECK-LABEL: {{^}}store_flat_v4i32:
; CHECK: flat_store_dwordx4
define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i16:
; CHECK: flat_store_short
define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %y = trunc i32 %x to i16
  store volatile i16 %y, i16* %fptr, align 2
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i8:
; CHECK: flat_store_byte
define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %y = trunc i32 %x to i8
  store volatile i8 %y, i8* %fptr, align 2
  ret void
}

; CHECK-LABEL: load_flat_i32:
; CHECK: flat_load_dword
define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  %fload = load volatile i32, i32* %fptr, align 4
  store i32 %fload, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: load_flat_i64:
; CHECK: flat_load_dwordx2
define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  %fload = load volatile i64, i64* %fptr, align 8
  store i64 %fload, i64 addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: load_flat_v4i32:
; CHECK: flat_load_dwordx4
define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32
  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: sextload_flat_i8:
; CHECK: flat_load_sbyte
define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = sext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i8:
; CHECK: flat_load_ubyte
define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = zext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: sextload_flat_i16:
; CHECK: flat_load_sshort
define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = sext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i16:
; CHECK: flat_load_ushort
define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = zext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_load:
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
define amdgpu_kernel void @flat_scratch_unaligned_load() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  %ld = load volatile i32, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_store:
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
define amdgpu_kernel void @flat_scratch_unaligned_store() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  store volatile i32 0, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_load:
; HSA: flat_load_dword
; HSA: flat_load_dword
; FIXME: These tests are broken for os = mesa3d because it does not initialize flat_scr.
define amdgpu_kernel void @flat_scratch_multidword_load() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  %ld = load volatile <2 x i32>, <2 x i32>* %fptr
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_store:
; HSA: flat_store_dword
; HSA: flat_store_dword
; FIXME: These tests are broken for os = mesa3d because it does not initialize flat_scr.
define amdgpu_kernel void @flat_scratch_multidword_store() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  store volatile <2 x i32> zeroinitializer, <2 x i32>* %fptr
  ret void
}

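; The tests below exercise immediate-offset folding on flat accesses. As a
; hedged summary of what the CIVI/GFX9 checks expect: GFX9 flat instructions
; accept a small unsigned immediate offset, so a byte offset of 4095 folds
; into the instruction, while 4096 and negative offsets must remain in the
; address computation. Roughly (illustrative only, not checked output):
;
;   %p = getelementptr inbounds i8, i8* %fptr, i64 4095
;   store volatile i8 %x, i8* %p   ; GFX9: flat_store_byte ... offset:4095
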
; CHECK-LABEL: {{^}}store_flat_i8_max_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1:
; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }