; LLVM regression test (llvm/test/CodeGen/AMDGPU); web-viewer navigation header removed.
      1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
      2 
      3 ; This shows that the amount of LDS estimate is sensitive to the order
      4 ; of the LDS globals.
      5 
      6 ; Both of these functions use the same amount of LDS, but the total
      7 ; changes depending on the visit order of first use.
      8 
      9 ; The one with the suboptimal order resulting in extra padding exceeds
     10 ; the desired limit
     11 
     12 ; The padding estimate heuristic used by the promote alloca pass
     13 ; is mostly determined by the order of the globals.
     14 
     15 ; Raw usage = 1060 bytes
     16 ; Rounded usage:
     17 ; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
     18 ; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
     19 
     20 ; At default occupancy guess of 7, 2340 bytes available total.
     21 
     22 ; 1280 bytes (5 x i32 x 64 work-items) need to be left free to promote
     23 ; the alloca; optimally packed, this requires the globals to fit in 1060 bytes.
     24 
     25 
     26 @lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16 ; 32*16 = 512 bytes
     27 @lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8        ; 32*8  = 256 bytes
     28 @lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4        ; 73*4  = 292 bytes
     29 
     30 
     31 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
     32 ; GCN: workgroup_group_segment_byte_size = 2340
     33 define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
        ; First uses the LDS globals in the order lds1, lds2, lds0. Per the
        ; CHECK above, this visit order yields the 1060-byte packed estimate,
        ; leaving room for the promoted alloca: 1060 + 1280 = 2340 bytes.
     34 entry:
     35   %stack = alloca [5 x i32], align 4 ; private array; candidate for promotion to LDS
     36   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
     37   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
     38   store i32 4, i32* %arrayidx1, align 4
     39   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
     40   %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
     41   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
     42   store i32 5, i32* %arrayidx3, align 4
     43   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
     44   %tmp2 = load i32, i32* %arrayidx10, align 4
     45   store i32 %tmp2, i32 addrspace(1)* %out, align 4
     46   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
     47   %tmp3 = load i32, i32* %arrayidx12
     48   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
     49   store i32 %tmp3, i32 addrspace(1)* %arrayidx13
     50 
     51   %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx ; 1st LDS use: 292 B, align 4
     52   store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
     53 
     54   %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx ; 2nd LDS use: 256 B, align 8
     55   store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
     56 
     57   %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx ; 3rd LDS use: 512 B, align 16
     58   store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
     59 
     60   ret void
     61 }
     62 
     63 ; GCN-LABEL: {{^}}promote_alloca_size_order_1:
     64 ; GCN: workgroup_group_segment_byte_size = 2352
     65 define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
        ; Identical body to promote_alloca_size_order_0 except the LDS globals
        ; are first used in the reverse order: lds0, lds2, lds1. Per the CHECK
        ; above, this visit order produces the padded 1072-byte estimate, so
        ; the total is 1072 + 1280 = 2352 bytes.
     66 entry:
     67   %stack = alloca [5 x i32], align 4 ; private array; candidate for promotion to LDS
     68   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
     69   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
     70   store i32 4, i32* %arrayidx1, align 4
     71   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
     72   %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
     73   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
     74   store i32 5, i32* %arrayidx3, align 4
     75   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
     76   %tmp2 = load i32, i32* %arrayidx10, align 4
     77   store i32 %tmp2, i32 addrspace(1)* %out, align 4
     78   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
     79   %tmp3 = load i32, i32* %arrayidx12
     80   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
     81   store i32 %tmp3, i32 addrspace(1)* %arrayidx13
     82 
     83   %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx ; 1st LDS use: 512 B, align 16
     84   store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
     85 
     86   %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx ; 2nd LDS use: 256 B, align 8
     87   store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
     88 
     89   %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx ; 3rd LDS use: 292 B, align 4
     90   store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
     91 
     92   ret void
     93 }
     94 
     95 @lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4        ; 13*4  = 52 bytes
     96 @lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16 ; 63*16 = 1008 bytes
     97 
     98 ; The guess from the alignment padding pushes this over the determined
     99 ; size limit, so it isn't promoted
    100 
    101 ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
    102 ; GCN: workgroup_group_segment_byte_size = 1060
    103 define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
        ; Per the comment above, the alignment-padding guess pushes the
        ; estimate over the limit, so the alloca is NOT promoted; the CHECKed
        ; segment size is just the globals: 52 (@lds3) + 1008 (@lds4) = 1060.
    104 entry:
    105   %stack = alloca [5 x i32], align 4 ; stays in private memory (promotion rejected)
    106   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
    107   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
    108   store i32 4, i32* %arrayidx1, align 4
    109   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
    110   %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
    111   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
    112   store i32 5, i32* %arrayidx3, align 4
    113   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
    114   %tmp2 = load i32, i32* %arrayidx10, align 4
    115   store i32 %tmp2, i32 addrspace(1)* %out, align 4
    116   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
    117   %tmp3 = load i32, i32* %arrayidx12
    118   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
    119   store i32 %tmp3, i32 addrspace(1)* %arrayidx13
    120 
    121   %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx ; 1st LDS use: 52 B, align 4
    122   store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
    123 
    124   %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx ; 2nd LDS use: 1008 B, align 16
    125   store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
    126 
    127   ret void
    128 }
    129 
    130 attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
    131