1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s 2 3 @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 4 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 5 6 @lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8 7 @lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32 8 9 @lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef 10 @lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef 11 12 declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0 13 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0 14 15 16 ; HSA-LABEL: {{^}}test_no_round_size_1: 17 ; HSA: workgroup_group_segment_byte_size = 38 18 define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 19 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 20 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 21 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) 22 ret void 23 } 24 25 ; There are two objects, so one requires padding to to be correctly 26 ; aligned after the other. 27 28 ; (38 -> 48) + 38 = 92 29 30 ; I don't think it is necessary to add padding after since if there 31 ; were to be a dynamically sized LDS kernel arg, the runtime should 32 ; add the alignment padding if necessary alignment padding if needed. 33 34 ; HSA-LABEL: {{^}}test_round_size_2: 35 ; HSA: workgroup_group_segment_byte_size = 86 36 ; HSA: group_segment_alignment = 4 37 define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 38 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 39 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 40 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) 41 42 %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)* 43 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 44 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false) 45 46 ret void 47 } 48 49 ; 38 + (10 pad) + 38 50 ; HSA-LABEL: {{^}}test_round_size_2_align_8: 51 ; HSA: workgroup_group_segment_byte_size = 86 52 ; HSA: group_segment_alignment = 4 53 define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 54 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 55 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 56 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 57 58 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 59 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 60 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 61 62 ret void 63 } 64 65 ; HSA-LABEL: {{^}}test_round_local_lds_and_arg: 66 ; HSA: workgroup_group_segment_byte_size = 38 67 ; HSA: group_segment_alignment = 4 68 define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { 69 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 70 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 71 72 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) 73 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 74 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) 75 ret void 76 } 77 78 ; HSA-LABEL: {{^}}test_round_lds_arg: 79 ; HSA: workgroup_group_segment_byte_size = 0 80 ; HSA: group_segment_alignment = 4 81 define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { 82 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) 83 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) 84 ret void 85 } 86 87 ; FIXME: Parameter alignment not considered 88 ; HSA-LABEL: {{^}}test_high_align_lds_arg: 89 ; HSA: workgroup_group_segment_byte_size = 0 90 ; HSA: group_segment_alignment = 4 91 define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { 92 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false) 93 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false) 94 ret void 95 } 96 97 ; (7 * 8) + (39 * 4) = 212 98 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: 99 ; HSA: workgroup_group_segment_byte_size = 212 100 ; HSA: group_segment_alignment = 4 101 define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 102 %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* 103 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) 104 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) 105 106 %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* 107 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false) 108 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) 109 110 ret void 111 } 112 113 ; (39 * 4) + (4 pad) + (7 * 8) = 216 114 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: 115 ; HSA: workgroup_group_segment_byte_size = 216 116 ; HSA: group_segment_alignment = 4 117 define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 118 %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* 119 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false) 120 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) 121 122 %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* 123 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) 124 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) 125 126 ret void 127 } 128 ; Test how the size needed for padding changes based on when the 129 ; global is encountered during lowering. There should be a consistent 130 ; order to minimize padding waste. 131 ; 132 ; The way global addresses are lowered now, this is in inverse of 133 ; first use order which isn't great. 134 ; 135 ; This should be the optimal order for these globals. If sorted to 136 ; minimize padding, the minimum possible size is: align 32, align 8, 137 ; align 16 138 139 140 ; align 32, 16, 8 141 ; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 142 ; HSA-LABEL: {{^}}test_round_size_3_order0: 143 ; HSA: workgroup_group_segment_byte_size = 134 144 ; HSA: group_segment_alignment = 4 145 define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 146 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 147 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 148 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) 149 150 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 151 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 152 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 153 154 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 155 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 156 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 157 158 ret void 159 } 160 161 ; align 32, 8, 16 162 ; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134 163 ; HSA-LABEL: {{^}}test_round_size_3_order1: 164 ; HSA: workgroup_group_segment_byte_size = 134 165 ; HSA: group_segment_alignment = 4 166 define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 167 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 168 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 169 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) 170 171 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 172 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 173 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 174 175 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 176 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 177 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 178 179 ret void 180 } 181 182 ; align 16, 32, 8 183 ; 38 + (26 pad) + 38 + (10 pad) + 38 = 150 184 ; HSA-LABEL: {{^}}test_round_size_3_order2: 185 ; HSA: workgroup_group_segment_byte_size = 150 186 ; HSA: group_segment_alignment = 4 187 define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 188 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 189 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 190 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 191 192 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 193 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 194 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) 195 196 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 197 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 198 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 199 200 ret void 201 } 202 203 ; align 16, 8, 32 204 ; 38 + (2 pad) + 38 + (2 pad) + 38 205 ; HSA-LABEL: {{^}}test_round_size_3_order3: 206 ; HSA: workgroup_group_segment_byte_size = 118 207 ; HSA: group_segment_alignment = 4 208 define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 209 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 210 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 211 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 212 213 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 214 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 215 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 216 217 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 218 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 219 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) 220 221 ret void 222 } 223 224 ; align 8, 32, 16 225 ; 38 + (26 pad) + 38 + (2 pad) + 38 = 142 226 ; HSA-LABEL: {{^}}test_round_size_3_order4: 227 ; HSA: workgroup_group_segment_byte_size = 142 228 ; HSA: group_segment_alignment = 4 229 define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 230 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 231 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 232 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 233 234 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 235 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 236 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) 237 238 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 239 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 240 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 241 242 ret void 243 } 244 245 ; align 8, 16, 32 246 ; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 247 ; HSA-LABEL: {{^}}test_round_size_3_order5: 248 ; HSA: workgroup_group_segment_byte_size = 126 249 ; HSA: group_segment_alignment = 4 250 define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 251 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 252 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 253 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) 254 255 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 256 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 257 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) 258 259 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 260 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) 261 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) 262 263 ret void 264 } 265 266 attributes #0 = { argmemonly nounwind } 267 attributes #1 = { nounwind } 268 attributes #2 = { convergent nounwind } 269