Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
      2 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
      3 
      4 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
      5 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
      6 
      7 define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { ; kernel tagged with attribute group #0 ("amdgpu-max-work-group-size"="63")
      8 entry:
      9   %stack = alloca [5 x i32], align 4 ; array under test; the checks above look for an addrspace(3) global named @promote_alloca_size_63.stack of type [63 x [5 x i32]]
     10   %0 = load i32, i32 addrspace(1)* %in, align 4 ; %0 = in[0], used below as a run-time index
     11   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 ; &stack[in[0]]
     12   store i32 4, i32* %arrayidx1, align 4 ; stack[in[0]] = 4
     13   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 ; &in[1]
     14   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; %1 = in[1], second run-time index
     15   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 ; &stack[in[1]]
     16   store i32 5, i32* %arrayidx3, align 4 ; stack[in[1]] = 5
     17   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 ; &stack[0]
     18   %2 = load i32, i32* %arrayidx10, align 4
     19   store i32 %2, i32 addrspace(1)* %out, align 4 ; out[0] = stack[0]
     20   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 ; &stack[1]
     21   %3 = load i32, i32* %arrayidx12 ; no explicit alignment on this load
     22   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 ; &out[1]
     23   store i32 %3, i32 addrspace(1)* %arrayidx13 ; out[1] = stack[1]
     24   ret void
     25 }
     26 
     27 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
     28 
     29 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { ; kernel tagged with attribute group #1 (waves-per-eu "1,3", flat work-group size "256,256")
     30 entry:
     31   %stack = alloca [5 x i32], align 4 ; array under test; both runs expect an addrspace(3) global @promote_alloca_size_256.stack of type [256 x [5 x i32]]
     32   %0 = load i32, i32 addrspace(1)* %in, align 4 ; %0 = in[0], used below as a run-time index
     33   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 ; &stack[in[0]]
     34   store i32 4, i32* %arrayidx1, align 4 ; stack[in[0]] = 4
     35   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 ; &in[1]
     36   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; %1 = in[1], second run-time index
     37   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 ; &stack[in[1]]
     38   store i32 5, i32* %arrayidx3, align 4 ; stack[in[1]] = 5
     39   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 ; &stack[0]
     40   %2 = load i32, i32* %arrayidx10, align 4
     41   store i32 %2, i32 addrspace(1)* %out, align 4 ; out[0] = stack[0]
     42   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 ; &stack[1]
     43   %3 = load i32, i32* %arrayidx12 ; no explicit alignment on this load
     44   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 ; &out[1]
     45   store i32 %3, i32 addrspace(1)* %arrayidx13 ; out[1] = stack[1]
     46   ret void
     47 }
     48 
     49 ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
     50 
     51 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { ; kernel tagged with attribute group #2 (waves-per-eu "1,1", flat work-group size "1600,1600")
     52 entry:
     53   %stack = alloca [5 x i32], align 4 ; array under test; both runs expect an addrspace(3) global @promote_alloca_size_1600.stack of type [1600 x [5 x i32]]
     54   %0 = load i32, i32 addrspace(1)* %in, align 4 ; %0 = in[0], used below as a run-time index
     55   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 ; &stack[in[0]]
     56   store i32 4, i32* %arrayidx1, align 4 ; stack[in[0]] = 4
     57   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 ; &in[1]
     58   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; %1 = in[1], second run-time index
     59   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 ; &stack[in[1]]
     60   store i32 5, i32* %arrayidx3, align 4 ; stack[in[1]] = 5
     61   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 ; &stack[0]
     62   %2 = load i32, i32* %arrayidx10, align 4
     63   store i32 %2, i32 addrspace(1)* %out, align 4 ; out[0] = stack[0]
     64   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 ; &stack[1]
     65   %3 = load i32, i32* %arrayidx12 ; no explicit alignment on this load
     66   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 ; &out[1]
     67   store i32 %3, i32 addrspace(1)* %arrayidx13 ; out[1] = stack[1]
     68   ret void
     69 }
     70 
     71 ; ALL-LABEL: @occupancy_0(
     72 ; CI-NOT: alloca [5 x i32]
     73 ; SI: alloca [5 x i32]
     74 define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { ; kernel tagged with attribute group #3 (waves-per-eu "1,10", no work-group-size attribute)
     75 entry:
     76   %stack = alloca [5 x i32], align 4 ; array under test; per the checks above the -mcpu=tonga run expects this alloca to disappear, the default-CPU run expects it to stay
     77   %0 = load i32, i32 addrspace(1)* %in, align 4 ; %0 = in[0], used below as a run-time index
     78   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 ; &stack[in[0]]
     79   store i32 4, i32* %arrayidx1, align 4 ; stack[in[0]] = 4
     80   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 ; &in[1]
     81   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; %1 = in[1], second run-time index
     82   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 ; &stack[in[1]]
     83   store i32 5, i32* %arrayidx3, align 4 ; stack[in[1]] = 5
     84   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 ; &stack[0]
     85   %2 = load i32, i32* %arrayidx10, align 4
     86   store i32 %2, i32 addrspace(1)* %out, align 4 ; out[0] = stack[0]
     87   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 ; &stack[1]
     88   %3 = load i32, i32* %arrayidx12 ; no explicit alignment on this load
     89   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 ; &out[1]
     90   store i32 %3, i32 addrspace(1)* %arrayidx13 ; out[1] = stack[1]
     91   ret void
     92 }
     93 
     94 ; ALL-LABEL: @occupancy_max(
     95 ; CI-NOT: alloca [5 x i32]
     96 ; SI: alloca [5 x i32]
     97 define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { ; kernel tagged with attribute group #4 (waves-per-eu "1,10", no work-group-size attribute)
     98 entry:
     99   %stack = alloca [5 x i32], align 4 ; array under test; per the checks above the -mcpu=tonga run expects this alloca to disappear, the default-CPU run expects it to stay
    100   %0 = load i32, i32 addrspace(1)* %in, align 4 ; %0 = in[0], used below as a run-time index
    101   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 ; &stack[in[0]]
    102   store i32 4, i32* %arrayidx1, align 4 ; stack[in[0]] = 4
    103   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 ; &in[1]
    104   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 ; %1 = in[1], second run-time index
    105   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 ; &stack[in[1]]
    106   store i32 5, i32* %arrayidx3, align 4 ; stack[in[1]] = 5
    107   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 ; &stack[0]
    108   %2 = load i32, i32* %arrayidx10, align 4
    109   store i32 %2, i32 addrspace(1)* %out, align 4 ; out[0] = stack[0]
    110   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 ; &stack[1]
    111   %3 = load i32, i32* %arrayidx12 ; no explicit alignment on this load
    112   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 ; &out[1]
    113   store i32 %3, i32 addrspace(1)* %arrayidx13 ; out[1] = stack[1]
    114   ret void
    115 }
    116 
    117 ; SI-LABEL: @occupancy_6(
    118 ; CI-LABEL: @occupancy_6(
    119 ; SI: alloca
    120 ; CI-NOT: alloca
    121 define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { ; kernel tagged with attribute group #5 (waves-per-eu "1,6", flat work-group size "64,64")
    122 entry:
    123   %stack = alloca [42 x i8], align 4 ; 42-byte array under test; per the checks above the -mcpu=tonga run expects no alloca left, the default-CPU run expects one
    124   %tmp = load i8, i8 addrspace(1)* %in, align 1 ; %tmp = in[0]
    125   %tmp4 = sext i8 %tmp to i64 ; sign-extend the byte to an i64 index
    126   %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4 ; &stack[in[0]]
    127   store i8 4, i8* %arrayidx1, align 1 ; stack[in[0]] = 4
    128   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 ; &in[1]
    129   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 ; %tmp1 = in[1]
    130   %tmp5 = sext i8 %tmp1 to i64 ; sign-extend the byte to an i64 index
    131   %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5 ; &stack[in[1]]
    132   store i8 5, i8* %arrayidx3, align 1 ; stack[in[1]] = 5
    133   %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0 ; &stack[0]
    134   %tmp2 = load i8, i8* %arrayidx10, align 1
    135   store i8 %tmp2, i8 addrspace(1)* %out, align 1 ; out[0] = stack[0]
    136   %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1 ; &stack[1]
    137   %tmp3 = load i8, i8* %arrayidx12, align 1
    138   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 ; &out[1]
    139   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ; out[1] = stack[1]
    140   ret void
    141 }
    142 
    143 ; ALL-LABEL: @occupancy_6_over(
    144 ; ALL: alloca [43 x i8]
    145 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { ; same attribute group (#5) as @occupancy_6, but with an array one byte larger
    146 entry:
    147   %stack = alloca [43 x i8], align 4 ; 43-byte array under test; per the check above both runs expect this alloca to remain
    148   %tmp = load i8, i8 addrspace(1)* %in, align 1 ; %tmp = in[0]
    149   %tmp4 = sext i8 %tmp to i64 ; sign-extend the byte to an i64 index
    150   %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4 ; &stack[in[0]]
    151   store i8 4, i8* %arrayidx1, align 1 ; stack[in[0]] = 4
    152   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 ; &in[1]
    153   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 ; %tmp1 = in[1]
    154   %tmp5 = sext i8 %tmp1 to i64 ; sign-extend the byte to an i64 index
    155   %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5 ; &stack[in[1]]
    156   store i8 5, i8* %arrayidx3, align 1 ; stack[in[1]] = 5
    157   %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0 ; &stack[0]
    158   %tmp2 = load i8, i8* %arrayidx10, align 1
    159   store i8 %tmp2, i8 addrspace(1)* %out, align 1 ; out[0] = stack[0]
    160   %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1 ; &stack[1]
    161   %tmp3 = load i8, i8* %arrayidx12, align 1
    162   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 ; &out[1]
    163   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ; out[1] = stack[1]
    164   ret void
    165 }
    166 
    167 ; SI-LABEL: @occupancy_8(
    168 ; CI-LABEL: @occupancy_8(
    169 ; SI: alloca
    170 ; CI-NOT: alloca
    171 define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { ; kernel tagged with attribute group #6 (waves-per-eu "1,8", flat work-group size "64,64")
    172 entry:
    173   %stack = alloca [32 x i8], align 4 ; 32-byte array under test; per the checks above the -mcpu=tonga run expects no alloca left, the default-CPU run expects one
    174   %tmp = load i8, i8 addrspace(1)* %in, align 1 ; %tmp = in[0]
    175   %tmp4 = sext i8 %tmp to i64 ; sign-extend the byte to an i64 index
    176   %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4 ; &stack[in[0]]
    177   store i8 4, i8* %arrayidx1, align 1 ; stack[in[0]] = 4
    178   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 ; &in[1]
    179   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 ; %tmp1 = in[1]
    180   %tmp5 = sext i8 %tmp1 to i64 ; sign-extend the byte to an i64 index
    181   %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5 ; &stack[in[1]]
    182   store i8 5, i8* %arrayidx3, align 1 ; stack[in[1]] = 5
    183   %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0 ; &stack[0]
    184   %tmp2 = load i8, i8* %arrayidx10, align 1
    185   store i8 %tmp2, i8 addrspace(1)* %out, align 1 ; out[0] = stack[0]
    186   %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1 ; &stack[1]
    187   %tmp3 = load i8, i8* %arrayidx12, align 1
    188   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 ; &out[1]
    189   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ; out[1] = stack[1]
    190   ret void
    191 }
    192 
    193 ; ALL-LABEL: @occupancy_8_over(
    194 ; ALL: alloca [33 x i8]
    195 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { ; same attribute group (#6) as @occupancy_8, but with an array one byte larger
    196 entry:
    197   %stack = alloca [33 x i8], align 4 ; 33-byte array under test; per the check above both runs expect this alloca to remain
    198   %tmp = load i8, i8 addrspace(1)* %in, align 1 ; %tmp = in[0]
    199   %tmp4 = sext i8 %tmp to i64 ; sign-extend the byte to an i64 index
    200   %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4 ; &stack[in[0]]
    201   store i8 4, i8* %arrayidx1, align 1 ; stack[in[0]] = 4
    202   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 ; &in[1]
    203   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 ; %tmp1 = in[1]
    204   %tmp5 = sext i8 %tmp1 to i64 ; sign-extend the byte to an i64 index
    205   %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5 ; &stack[in[1]]
    206   store i8 5, i8* %arrayidx3, align 1 ; stack[in[1]] = 5
    207   %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0 ; &stack[0]
    208   %tmp2 = load i8, i8* %arrayidx10, align 1
    209   store i8 %tmp2, i8 addrspace(1)* %out, align 1 ; out[0] = stack[0]
    210   %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1 ; &stack[1]
    211   %tmp3 = load i8, i8* %arrayidx12, align 1
    212   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 ; &out[1]
    213   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ; out[1] = stack[1]
    214   ret void
    215 }
    216 
    217 ; SI-LABEL: @occupancy_9(
    218 ; CI-LABEL: @occupancy_9(
    219 ; SI: alloca
    220 ; CI-NOT: alloca
    221 define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { ; kernel tagged with attribute group #7 (waves-per-eu "1,9", flat work-group size "64,64")
    222 entry:
    223   %stack = alloca [28 x i8], align 4 ; 28-byte array under test; per the checks above the -mcpu=tonga run expects no alloca left, the default-CPU run expects one
    224   %tmp = load i8, i8 addrspace(1)* %in, align 1 ; %tmp = in[0]
    225   %tmp4 = sext i8 %tmp to i64 ; sign-extend the byte to an i64 index
    226   %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4 ; &stack[in[0]]
    227   store i8 4, i8* %arrayidx1, align 1 ; stack[in[0]] = 4
    228   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 ; &in[1]
    229   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 ; %tmp1 = in[1]
    230   %tmp5 = sext i8 %tmp1 to i64 ; sign-extend the byte to an i64 index
    231   %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5 ; &stack[in[1]]
    232   store i8 5, i8* %arrayidx3, align 1 ; stack[in[1]] = 5
    233   %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0 ; &stack[0]
    234   %tmp2 = load i8, i8* %arrayidx10, align 1
    235   store i8 %tmp2, i8 addrspace(1)* %out, align 1 ; out[0] = stack[0]
    236   %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1 ; &stack[1]
    237   %tmp3 = load i8, i8* %arrayidx12, align 1
    238   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 ; &out[1]
    239   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ; out[1] = stack[1]
    240   ret void
    241 }
    242 
    243 ; ALL-LABEL: @occupancy_9_over(
    244 ; ALL: alloca [29 x i8]
    245 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { ; same attribute group (#7) as @occupancy_9, but with an array one byte larger
    246 entry:
    247   %stack = alloca [29 x i8], align 4 ; 29-byte array under test; per the check above both runs expect this alloca to remain
    248   %tmp = load i8, i8 addrspace(1)* %in, align 1 ; %tmp = in[0]
    249   %tmp4 = sext i8 %tmp to i64 ; sign-extend the byte to an i64 index
    250   %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4 ; &stack[in[0]]
    251   store i8 4, i8* %arrayidx1, align 1 ; stack[in[0]] = 4
    252   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 ; &in[1]
    253   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 ; %tmp1 = in[1]
    254   %tmp5 = sext i8 %tmp1 to i64 ; sign-extend the byte to an i64 index
    255   %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5 ; &stack[in[1]]
    256   store i8 5, i8* %arrayidx3, align 1 ; stack[in[1]] = 5
    257   %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0 ; &stack[0]
    258   %tmp2 = load i8, i8* %arrayidx10, align 1
    259   store i8 %tmp2, i8 addrspace(1)* %out, align 1 ; out[0] = stack[0]
    260   %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1 ; &stack[1]
    261   %tmp3 = load i8, i8* %arrayidx12, align 1
    262   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 ; &out[1]
    263   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 ; out[1] = stack[1]
    264   ret void
    265 }
    266 
    267 attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" } ; used by @promote_alloca_size_63. NOTE(review): only group using the "amdgpu-max-work-group-size" key; the others use "amdgpu-flat-work-group-size" - confirm this key is still honored
    268 attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" } ; used by @promote_alloca_size_256
    269 attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" } ; used by @promote_alloca_size_1600
    270 attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" } ; used by @occupancy_0; no work-group-size attribute. NOTE(review): contents are identical to #4 - confirm that is intended
    271 attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" } ; used by @occupancy_max; no work-group-size attribute
    272 attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" } ; shared by @occupancy_6 and @occupancy_6_over
    273 attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" } ; shared by @occupancy_8 and @occupancy_8_over
    274 attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" } ; shared by @occupancy_9 and @occupancy_9_over
    275