; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s

@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16

@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32

@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef
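; Note: with no explicit alignment, these two are effectively laid out using
; the ABI alignment of their element types (4 bytes for i32, 8 bytes for i64),
; which is what the test_missing_alignment computations below assume.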

declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0
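; Note: these declarations use the old 5-operand form of @llvm.memcpy, in
; which the fourth operand is the alignment and the fifth the volatile flag.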


; HSA-LABEL: {{^}}test_no_round_size_1:
; HSA: workgroup_group_segment_byte_size = 38
define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
  ret void
}

; There are two objects, so one requires padding to be correctly
; aligned after the other.

; (38 -> 48) + 38 = 86
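; i.e., the first object occupies bytes [0,38) and is padded out to 48 so that
; the second 16-byte-aligned object occupies [48,86).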

; I don't think it is necessary to add padding after the last object, since if
; there were to be a dynamically sized LDS kernel argument, the runtime should
; add any alignment padding needed.

; HSA-LABEL: {{^}}test_round_size_2:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)

  %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false)

  ret void
}

; 38 + (10 pad) + 38 = 86
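; i.e., one object at offset 0 and the other at offset 48.  Given the
; inverse-first-use allocation order noted before the test_round_size_3 tests,
; @lds.align8.0 presumably lands at 0 and @lds.align16.0 at 48, which is why
; the padding is 10 bytes (to reach 16-byte alignment) rather than 2.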
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  ret void
}

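; Only the module-scope @lds.align16.0 contributes to the static group segment
; size; the %lds.arg pointer is LDS provided by the caller/runtime, so the
; size stays at 38.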
; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
; HSA: workgroup_group_segment_byte_size = 38
; HSA: group_segment_alignment = 4
define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)

  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
  ret void
}

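; Only the LDS kernel argument is used here, with no module-scope LDS globals,
; so the static group segment size is 0.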
; HSA-LABEL: {{^}}test_round_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
  ret void
}

; FIXME: Parameter alignment not considered
; HSA-LABEL: {{^}}test_high_align_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
  ret void
}

; (7 * 8) + (39 * 4) = 212
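; i.e., 56 bytes for [7 x i64] followed immediately by 156 bytes for
; [39 x i32]: with the 8-byte-aligned i64 array allocated first, offset 56 is
; already sufficiently aligned for the i32 array, so no padding is needed.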
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 212
; HSA: group_segment_alignment = 4
define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 156, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 156, i32 4, i1 false)

  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)

  ret void
}

; (39 * 4) + (4 pad) + (7 * 8) = 216
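; i.e., the 156-byte i32 array first, then 4 bytes of padding up to 160 so the
; i64 array is 8-byte aligned; 160 + 56 = 216.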
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)

  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 156, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 156, i32 4, i1 false)

  ret void
}

; Test how the size needed for padding changes based on when the
; global is encountered during lowering. There should be a consistent
; order to minimize padding waste.
;
; The way global addresses are lowered now, they are allocated in the
; inverse of first-use order, which isn't great.
;
; If the globals were sorted to minimize padding, the optimal allocation
; order would be align 32, align 8, align 16, for a minimum possible size
; of 118 bytes (see test_round_size_3_order3 below).

; align 32, 16, 8
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
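; i.e., with allocation in inverse first-use order: @lds.align8.0 at offset 0,
; @lds.align16.0 at 48, @lds.align32.0 at 96; 96 + 38 = 134.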
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 32, 8, 16
; 38 + (2 pad) + 38 + (18 pad) + 38 = 134
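; i.e., @lds.align16.0 at offset 0, @lds.align8.0 at 40, @lds.align32.0 at 96;
; 96 + 38 = 134.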
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 16, 32, 8
; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
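; i.e., @lds.align8.0 at offset 0, @lds.align32.0 at 64, @lds.align16.0 at 112;
; 112 + 38 = 150.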
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 150
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 16, 8, 32
; 38 + (2 pad) + 38 + (2 pad) + 38 = 118
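; i.e., @lds.align32.0 at offset 0, @lds.align8.0 at 40, @lds.align16.0 at 80;
; 80 + 38 = 118, the best of the six orderings.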
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 118
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 8, 32, 16
; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
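; i.e., @lds.align16.0 at offset 0, @lds.align32.0 at 64, @lds.align8.0 at 104;
; 104 + 38 = 142.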
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 142
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 8, 16, 32
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
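; i.e., @lds.align32.0 at offset 0, @lds.align16.0 at 48, @lds.align8.0 at 88;
; 88 + 38 = 126.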
; HSA-LABEL: {{^}}test_round_size_3_order5:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  ret void
}

attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind }
attributes #2 = { convergent nounwind }
    269