Home | History | Annotate | Download | only in AMDGPU
      1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
      2 ; FIXME: Manually added checks for metadata nodes at bottom
      3 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s
      4 ; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s
      5 
      6 define amdgpu_kernel void @kern_noargs() {
      7 ; HSA-LABEL: @kern_noargs(
      8 ; HSA-NEXT:    ret void
      9 ;
     10 ; MESA-LABEL: @kern_noargs(
     11 ; MESA-NEXT:    ret void
     12 ;
     13   ret void
     14 }
     15 
     16 define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
     17 ; HSA-LABEL: @kern_i8(
     18 ; HSA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     19 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
     20 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     21 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
     22 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
     23 ; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
     24 ; HSA-NEXT:    ret void
     25 ;
     26 ; MESA-LABEL: @kern_i8(
     27 ; MESA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     28 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
     29 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     30 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
     31 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
     32 ; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
     33 ; MESA-NEXT:    ret void
     34 ;
     35   store i8 %arg, i8 addrspace(1)* undef, align 1
     36   ret void
     37 }
     38 
     39 define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
     40 ; HSA-LABEL: @kern_i16(
     41 ; HSA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     42 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
     43 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     44 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
     45 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
     46 ; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
     47 ; HSA-NEXT:    ret void
     48 ;
     49 ; MESA-LABEL: @kern_i16(
     50 ; MESA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     51 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
     52 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     53 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
     54 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
     55 ; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
     56 ; MESA-NEXT:    ret void
     57 ;
     58   store i16 %arg, i16 addrspace(1)* undef, align 1
     59   ret void
     60 }
     61 
     62 define amdgpu_kernel void @kern_f16(half %arg) #0 {
     63 ; HSA-LABEL: @kern_f16(
     64 ; HSA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     65 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
     66 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     67 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
     68 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
     69 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
     70 ; HSA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
     71 ; HSA-NEXT:    ret void
     72 ;
     73 ; MESA-LABEL: @kern_f16(
     74 ; MESA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     75 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
     76 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     77 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
     78 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
     79 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
     80 ; MESA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
     81 ; MESA-NEXT:    ret void
     82 ;
     83   store half %arg, half addrspace(1)* undef, align 1
     84   ret void
     85 }
     86 
     87 define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
     88 ; HSA-LABEL: @kern_zeroext_i8(
     89 ; HSA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     90 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
     91 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
     92 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
     93 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
     94 ; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
     95 ; HSA-NEXT:    ret void
     96 ;
     97 ; MESA-LABEL: @kern_zeroext_i8(
     98 ; MESA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
     99 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
    100 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    101 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    102 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    103 ; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
    104 ; MESA-NEXT:    ret void
    105 ;
    106   store i8 %arg, i8 addrspace(1)* undef, align 1
    107   ret void
    108 }
    109 
    110 define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
    111 ; HSA-LABEL: @kern_zeroext_i16(
    112 ; HSA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    113 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
    114 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    115 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    116 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
    117 ; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
    118 ; HSA-NEXT:    ret void
    119 ;
    120 ; MESA-LABEL: @kern_zeroext_i16(
    121 ; MESA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    122 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
    123 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    124 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    125 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
    126 ; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
    127 ; MESA-NEXT:    ret void
    128 ;
    129   store i16 %arg, i16 addrspace(1)* undef, align 1
    130   ret void
    131 }
    132 
    133 define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
    134 ; HSA-LABEL: @kern_signext_i8(
    135 ; HSA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    136 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
    137 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    138 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    139 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    140 ; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
    141 ; HSA-NEXT:    ret void
    142 ;
    143 ; MESA-LABEL: @kern_signext_i8(
    144 ; MESA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    145 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
    146 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    147 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    148 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    149 ; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
    150 ; MESA-NEXT:    ret void
    151 ;
    152   store i8 %arg, i8 addrspace(1)* undef, align 1
    153   ret void
    154 }
    155 
    156 define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
    157 ; HSA-LABEL: @kern_signext_i16(
    158 ; HSA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    159 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
    160 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    161 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    162 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
    163 ; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
    164 ; HSA-NEXT:    ret void
    165 ;
    166 ; MESA-LABEL: @kern_signext_i16(
    167 ; MESA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    168 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
    169 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    170 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    171 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
    172 ; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
    173 ; MESA-NEXT:    ret void
    174 ;
    175   store i16 %arg, i16 addrspace(1)* undef, align 1
    176   ret void
    177 }
    178 
    179 define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
    180 ; HSA-LABEL: @kern_i8_i8(
    181 ; HSA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    182 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
    183 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    184 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    185 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    186 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
    187 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    188 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    189 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    190 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    191 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
    192 ; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
    193 ; HSA-NEXT:    ret void
    194 ;
    195 ; MESA-LABEL: @kern_i8_i8(
    196 ; MESA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    197 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
    198 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    199 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    200 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    201 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
    202 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    203 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    204 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    205 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    206 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
    207 ; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
    208 ; MESA-NEXT:    ret void
    209 ;
    210   store volatile i8 %arg0, i8 addrspace(1)* undef, align 1
    211   store volatile i8 %arg1, i8 addrspace(1)* undef, align 1
    212   ret void
    213 }
    214 
    215 define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
    216 ; HSA-LABEL: @kern_v3i8(
    217 ; HSA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    218 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
    219 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    220 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    221 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
    222 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
    223 ; HSA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
    224 ; HSA-NEXT:    ret void
    225 ;
    226 ; MESA-LABEL: @kern_v3i8(
    227 ; MESA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    228 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
    229 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    230 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    231 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
    232 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
    233 ; MESA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
    234 ; MESA-NEXT:    ret void
    235 ;
    236   store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4
    237   ret void
    238 }
    239 
    240 define amdgpu_kernel void @kern_i24(i24 %arg0) {
    241 ; HSA-LABEL: @kern_i24(
    242 ; HSA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    243 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
    244 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    245 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    246 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
    247 ; HSA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef
    248 ; HSA-NEXT:    ret void
    249 ;
    250 ; MESA-LABEL: @kern_i24(
    251 ; MESA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    252 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
    253 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    254 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    255 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
    256 ; MESA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef
    257 ; MESA-NEXT:    ret void
    258 ;
    259   store i24 %arg0, i24 addrspace(1)* undef
    260   ret void
    261 }
    262 
    263 define amdgpu_kernel void @kern_i32(i32 %arg0) {
    264 ; HSA-LABEL: @kern_i32(
    265 ; HSA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    266 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0
    267 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
    268 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    269 ; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
    270 ; HSA-NEXT:    ret void
    271 ;
    272 ; MESA-LABEL: @kern_i32(
    273 ; MESA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    274 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
    275 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
    276 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    277 ; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
    278 ; MESA-NEXT:    ret void
    279 ;
    280   store i32 %arg0, i32 addrspace(1)* undef
    281   ret void
    282 }
    283 
    284 define amdgpu_kernel void @kern_f32(float %arg0) {
    285 ; HSA-LABEL: @kern_f32(
    286 ; HSA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    287 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0
    288 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
    289 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    290 ; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef
    291 ; HSA-NEXT:    ret void
    292 ;
    293 ; MESA-LABEL: @kern_f32(
    294 ; MESA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    295 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
    296 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
    297 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    298 ; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef
    299 ; MESA-NEXT:    ret void
    300 ;
    301   store float %arg0, float addrspace(1)* undef
    302   ret void
    303 }
    304 
    305 define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
    306 ; HSA-LABEL: @kern_v3i32(
    307 ; HSA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    308 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
    309 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <3 x i32> addrspace(4)*
    310 ; HSA-NEXT:    [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)*
    311 ; HSA-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16, !invariant.load !0
    312 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    313 ; HSA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
    314 ; HSA-NEXT:    ret void
    315 ;
    316 ; MESA-LABEL: @kern_v3i32(
    317 ; MESA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    318 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
    319 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <3 x i32> addrspace(4)*
    320 ; MESA-NEXT:    [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)*
    321 ; MESA-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 4, !invariant.load !0
    322 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    323 ; MESA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
    324 ; MESA-NEXT:    ret void
    325 ;
    326   store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4
    327   ret void
    328 }
    329 
    330 define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
    331 ; HSA-LABEL: @kern_v8i32(
    332 ; HSA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    333 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
    334 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
    335 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    336 ; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef
    337 ; HSA-NEXT:    ret void
    338 ;
    339 ; MESA-LABEL: @kern_v8i32(
    340 ; MESA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    341 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
    342 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
    343 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    344 ; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef
    345 ; MESA-NEXT:    ret void
    346 ;
    347   store <8 x i32> %arg, <8 x i32> addrspace(1)* undef
    348   ret void
    349 }
    350 
    351 define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
    352 ; HSA-LABEL: @kern_v8i64(
    353 ; HSA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    354 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
    355 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
    356 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    357 ; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef
    358 ; HSA-NEXT:    ret void
    359 ;
    360 ; MESA-LABEL: @kern_v8i64(
    361 ; MESA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(100) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    362 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
    363 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
    364 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    365 ; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef
    366 ; MESA-NEXT:    ret void
    367 ;
    368   store <8 x i64> %arg, <8 x i64> addrspace(1)* undef
    369   ret void
    370 }
    371 
    372 define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
    373 ; HSA-LABEL: @kern_v16i64(
    374 ; HSA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(128) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    375 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
    376 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
    377 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    378 ; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef
    379 ; HSA-NEXT:    ret void
    380 ;
    381 ; MESA-LABEL: @kern_v16i64(
    382 ; MESA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(164) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    383 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
    384 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
    385 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    386 ; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef
    387 ; MESA-NEXT:    ret void
    388 ;
    389   store <16 x i64> %arg, <16 x i64> addrspace(1)* undef
    390   ret void
    391 }
    392 
    393 define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
    394 ; HSA-LABEL: @kern_i32_v3i32(
    395 ; HSA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    396 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
    397 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
    398 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    399 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16
    400 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <3 x i32> addrspace(4)*
    401 ; HSA-NEXT:    [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)*
    402 ; HSA-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16, !invariant.load !0
    403 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    404 ; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
    405 ; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
    406 ; HSA-NEXT:    ret void
    407 ;
    408 ; MESA-LABEL: @kern_i32_v3i32(
    409 ; MESA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    410 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
    411 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
    412 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    413 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52
    414 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <3 x i32> addrspace(4)*
    415 ; MESA-NEXT:    [[TMP1:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]] to <4 x i32> addrspace(4)*
    416 ; MESA-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 4, !invariant.load !0
    417 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    418 ; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
    419 ; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
    420 ; MESA-NEXT:    ret void
    421 ;
    422   store i32 %arg0, i32 addrspace(1)* undef
    423   store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4
    424   ret void
    425 }
    426 
    427 %struct.a = type { i32, i8, [4 x i8] }
    428 %struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
    429 
    430 define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
    431 ; HSA-LABEL: @kern_struct_a(
    432 ; HSA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    433 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
    434 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
    435 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    436 ; HSA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
    437 ; HSA-NEXT:    ret void
    438 ;
    439 ; MESA-LABEL: @kern_struct_a(
    440 ; MESA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    441 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
    442 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
    443 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    444 ; MESA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
    445 ; MESA-NEXT:    ret void
    446 ;
    447   store %struct.a %arg0, %struct.a addrspace(1)* undef
    448   ret void
    449 }
    450 
    451 define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
    452 ; HSA-LABEL: @kern_struct_b_packed(
    453 ; HSA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    454 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
    455 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
    456 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    457 ; HSA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
    458 ; HSA-NEXT:    ret void
    459 ;
    460 ; MESA-LABEL: @kern_struct_b_packed(
    461 ; MESA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    462 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
    463 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
    464 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    465 ; MESA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
    466 ; MESA-NEXT:    ret void
    467 ;
    468   store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
    469   ret void
    470 }
    471 
    472 define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
    473 ; HSA-LABEL: @kern_implicit_arg_num_bytes(
    474 ; HSA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    475 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
    476 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
    477 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    478 ; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
    479 ; HSA-NEXT:    ret void
    480 ;
    481 ; MESA-LABEL: @kern_implicit_arg_num_bytes(
    482 ; MESA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    483 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
    484 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
    485 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    486 ; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
    487 ; MESA-NEXT:    ret void
    488 ;
    489   store i32 %arg0, i32 addrspace(1)* undef
    490   ret void
    491 }
    492 
    493 define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 {
    494 ; HSA-LABEL: @kernel_implicitarg_no_struct_align(
    495 ; HSA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    496 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
    497 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
    498 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    499 ; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
    500 ; HSA-NEXT:    ret void
    501 ;
    502 ; MESA-LABEL: @kernel_implicitarg_no_struct_align(
    503 ; MESA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    504 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
    505 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
    506 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    507 ; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
    508 ; MESA-NEXT:    ret void
    509 ;
    510   store i32 %arg1, i32 addrspace(1)* undef
    511   ret void
    512 }
    513 
    514 define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
    515 ; HSA-LABEL: @kern_lds_ptr(
    516 ; HSA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    517 ; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
    518 ; HSA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
    519 ; HSA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
    520 ; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
    521 ; HSA-NEXT:    ret void
    522 ;
    523 ; MESA-LABEL: @kern_lds_ptr(
    524 ; MESA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    525 ; MESA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
    526 ; MESA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
    527 ; MESA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
    528 ; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
    529 ; MESA-NEXT:    ret void
    530 ;
    531   store i32 0, i32 addrspace(3)* %lds, align 4
    532   ret void
    533 }
    534 
    535 define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
    536 ; HSA-LABEL: @kern_lds_ptr_si(
    537 ; HSA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    538 ; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
    539 ; HSA-NEXT:    ret void
    540 ;
    541 ; MESA-LABEL: @kern_lds_ptr_si(
    542 ; MESA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    543 ; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
    544 ; MESA-NEXT:    ret void
    545 ;
    546   store i32 0, i32 addrspace(3)* %lds, align 4
    547   ret void
    548 }
    549 
    550 define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
    551 ; HSA-LABEL: @kern_realign_i8_i8(
    552 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    553 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
    554 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    555 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    556 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    557 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
    558 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    559 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    560 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    561 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    562 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    563 ; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
    564 ; HSA-NEXT:    ret void
    565 ;
    566 ; MESA-LABEL: @kern_realign_i8_i8(
    567 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    568 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
    569 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    570 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    571 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    572 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
    573 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    574 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    575 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    576 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    577 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    578 ; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
    579 ; MESA-NEXT:    ret void
    580 ;
    581   store volatile i8 %arg0, i8 addrspace(1)* undef
    582   store volatile i8 %arg1, i8 addrspace(1)* undef
    583   ret void
    584 }
    585 
    586 define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
    587 ; HSA-LABEL: @kern_realign_i8_i8_i8(
    588 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    589 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    590 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    591 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    592 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    593 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    594 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    595 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    596 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    597 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    598 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    599 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    600 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    601 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    602 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
    603 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    604 ; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
    605 ; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
    606 ; HSA-NEXT:    ret void
    607 ;
    608 ; MESA-LABEL: @kern_realign_i8_i8_i8(
    609 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    610 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    611 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    612 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    613 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    614 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    615 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    616 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    617 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    618 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    619 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    620 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    621 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    622 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    623 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
    624 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    625 ; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
    626 ; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
    627 ; MESA-NEXT:    ret void
    628 ;
    629   store volatile i8 %arg0, i8 addrspace(1)* undef
    630   store volatile i8 %arg1, i8 addrspace(1)* undef
    631   store volatile i8 %arg2, i8 addrspace(1)* undef
    632   ret void
    633 }
    634 
    635 define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
    636 ; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
    637 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    638 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    639 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    640 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    641 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    642 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    643 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    644 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    645 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    646 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    647 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    648 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    649 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    650 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    651 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
    652 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    653 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    654 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    655 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
    656 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
    657 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    658 ; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
    659 ; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
    660 ; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
    661 ; HSA-NEXT:    ret void
    662 ;
    663 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
    664 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    665 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    666 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    667 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    668 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    669 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    670 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    671 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    672 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    673 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
    674 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    675 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    676 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    677 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    678 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
    679 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
    680 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    681 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    682 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
    683 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
    684 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    685 ; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
    686 ; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
    687 ; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
    688 ; MESA-NEXT:    ret void
    689 ;
    690   store volatile i8 %arg0, i8 addrspace(1)* undef
    691   store volatile i8 %arg1, i8 addrspace(1)* undef
    692   store volatile i8 %arg2, i8 addrspace(1)* undef
    693   store volatile i8 %arg3, i8 addrspace(1)* undef
    694   ret void
    695 }
    696 
    697 define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
    698 ; HSA-LABEL: @kern_realign_i8_v3i8(
    699 ; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    700 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
    701 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    702 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    703 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    704 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
    705 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    706 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    707 ; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
    708 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
    709 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    710 ; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
    711 ; HSA-NEXT:    ret void
    712 ;
    713 ; MESA-LABEL: @kern_realign_i8_v3i8(
    714 ; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    715 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
    716 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    717 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    718 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    719 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40
    720 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    721 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
    722 ; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
    723 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
    724 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    725 ; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
    726 ; MESA-NEXT:    ret void
    727 ;
    728   store volatile i8 %arg0, i8 addrspace(1)* undef
    729   store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef
    730   ret void
    731 }
    732 
    733 define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
    734 ; HSA-LABEL: @kern_realign_i8_i16(
    735 ; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    736 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
    737 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    738 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    739 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    740 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
    741 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    742 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    743 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
    744 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
    745 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    746 ; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
    747 ; HSA-NEXT:    ret void
    748 ;
    749 ; MESA-LABEL: @kern_realign_i8_i16(
    750 ; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    751 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
    752 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    753 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    754 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    755 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
    756 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    757 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    758 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
    759 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
    760 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
    761 ; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
    762 ; MESA-NEXT:    ret void
    763 ;
    764   store volatile i8 %arg0, i8 addrspace(1)* undef
    765   store volatile i16 %arg1, i16 addrspace(1)* undef
    766   ret void
    767 }
    768 
    769 define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
    770 ; HSA-LABEL: @kern_realign_i1_i1(
    771 ; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    772 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
    773 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    774 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    775 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    776 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
    777 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    778 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    779 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    780 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
    781 ; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    782 ; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
    783 ; HSA-NEXT:    ret void
    784 ;
    785 ; MESA-LABEL: @kern_realign_i1_i1(
    786 ; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    787 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
    788 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    789 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    790 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    791 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
    792 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    793 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    794 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    795 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
    796 ; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    797 ; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
    798 ; MESA-NEXT:    ret void
    799 ;
    800   store volatile i1 %arg0, i1 addrspace(1)* undef
    801   store volatile i1 %arg1, i1 addrspace(1)* undef
    802   ret void
    803 }
    804 
    805 define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
    806 ; HSA-LABEL: @kern_realign_i1_i1_i1(
    807 ; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    808 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    809 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    810 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    811 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    812 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    813 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    814 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    815 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    816 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
    817 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    818 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    819 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    820 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    821 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
    822 ; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    823 ; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
    824 ; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
    825 ; HSA-NEXT:    ret void
    826 ;
    827 ; MESA-LABEL: @kern_realign_i1_i1_i1(
    828 ; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    829 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    830 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    831 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    832 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    833 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    834 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    835 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    836 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    837 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
    838 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    839 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    840 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    841 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    842 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
    843 ; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    844 ; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
    845 ; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
    846 ; MESA-NEXT:    ret void
    847 ;
    848   store volatile i1 %arg0, i1 addrspace(1)* undef
    849   store volatile i1 %arg1, i1 addrspace(1)* undef
    850   store volatile i1 %arg2, i1 addrspace(1)* undef
    851   ret void
    852 }
    853 
    854 define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
    855 ; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
    856 ; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    857 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    858 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    859 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    860 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    861 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    862 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    863 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    864 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    865 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
    866 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    867 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    868 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    869 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    870 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
    871 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
    872 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    873 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    874 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
    875 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
    876 ; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    877 ; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
    878 ; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
    879 ; HSA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef
    880 ; HSA-NEXT:    ret void
    881 ;
    882 ; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
    883 ; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    884 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    885 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    886 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    887 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    888 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    889 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    890 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    891 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    892 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
    893 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    894 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    895 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    896 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
    897 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
    898 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
    899 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    900 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    901 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
    902 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
    903 ; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    904 ; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
    905 ; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
    906 ; MESA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef
    907 ; MESA-NEXT:    ret void
    908 ;
    909   store volatile i1 %arg0, i1 addrspace(1)* undef
    910   store volatile i1 %arg1, i1 addrspace(1)* undef
    911   store volatile i1 %arg2, i1 addrspace(1)* undef
    912   store volatile i1 %arg3, i1 addrspace(1)* undef
    913   ret void
    914 }
    915 
    916 define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
    917 ; HSA-LABEL: @kern_realign_i1_v3i1(
    918 ; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    919 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
    920 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    921 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    922 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    923 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
    924 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    925 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    926 ; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
    927 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
    928 ; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    929 ; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
    930 ; HSA-NEXT:    ret void
    931 ;
    932 ; MESA-LABEL: @kern_realign_i1_v3i1(
    933 ; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    934 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
    935 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    936 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    937 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    938 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 40
    939 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    940 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
    941 ; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
    942 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
    943 ; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    944 ; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
    945 ; MESA-NEXT:    ret void
    946 ;
    947   store volatile i1 %arg0, i1 addrspace(1)* undef
    948   store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef
    949   ret void
    950 }
    951 
    952 define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
    953 ; HSA-LABEL: @kern_realign_i1_i16(
    954 ; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    955 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
    956 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    957 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    958 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    959 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
    960 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    961 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    962 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
    963 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
    964 ; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    965 ; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
    966 ; HSA-NEXT:    ret void
    967 ;
    968 ; MESA-LABEL: @kern_realign_i1_i16(
    969 ; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    970 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
    971 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    972 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    973 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
    974 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
    975 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    976 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
    977 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
    978 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
    979 ; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
    980 ; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
    981 ; MESA-NEXT:    ret void
    982 ;
    983   store volatile i1 %arg0, i1 addrspace(1)* undef
    984   store volatile i16 %arg1, i16 addrspace(1)* undef
    985   ret void
    986 }
    987 
    988 define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
    989 ; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
    990 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    991 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    992 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    993 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    994 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
    995 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
    996 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
    997 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
    998 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
    999 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
   1000 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
   1001 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1002 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
   1003 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
   1004 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
   1005 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
   1006 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1007 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
   1008 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
   1009 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
   1010 ; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
   1011 ; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1012 ; HSA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1013 ; HSA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
   1014 ; HSA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
   1015 ; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
   1016 ; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1017 ; HSA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1018 ; HSA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
   1019 ; HSA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
   1020 ; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
   1021 ; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1022 ; HSA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1023 ; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
   1024 ; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
   1025 ; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
   1026 ; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
   1027 ; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
   1028 ; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
   1029 ; HSA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef
   1030 ; HSA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef
   1031 ; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef
   1032 ; HSA-NEXT:    ret void
   1033 ;
   1034 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
   1035 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1036 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
   1037 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1038 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1039 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
   1040 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
   1041 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1042 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1043 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
   1044 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
   1045 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
   1046 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1047 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1048 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
   1049 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
   1050 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
   1051 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1052 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1053 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
   1054 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
   1055 ; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
   1056 ; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1057 ; MESA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
   1058 ; MESA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
   1059 ; MESA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
   1060 ; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
   1061 ; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1062 ; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
   1063 ; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
   1064 ; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
   1065 ; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
   1066 ; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1067 ; MESA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
   1068 ; MESA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
   1069 ; MESA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
   1070 ; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
   1071 ; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
   1072 ; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
   1073 ; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
   1074 ; MESA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef
   1075 ; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef
   1076 ; MESA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef
   1077 ; MESA-NEXT:    ret void
   1078 ;
   1079   store volatile i8 %arg0, i8 addrspace(1)* undef
   1080   store volatile i8 %arg1, i8 addrspace(1)* undef
   1081   store volatile i8 %arg2, i8 addrspace(1)* undef
   1082   store volatile i8 %arg3, i8 addrspace(1)* undef
   1083   store volatile i8 %arg5, i8 addrspace(1)* undef
   1084   store volatile i8 %arg6, i8 addrspace(1)* undef
   1085   store volatile i8 %arg7, i8 addrspace(1)* undef
   1086   ret void
   1087 }
   1088 
   1089 define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
   1090 ; HSA-LABEL: @kern_realign_f16_f16(
   1091 ; HSA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1092 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
   1093 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1094 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
   1095 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
   1096 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
   1097 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
   1098 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1099 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
   1100 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
   1101 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
   1102 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
   1103 ; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
   1104 ; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
   1105 ; HSA-NEXT:    ret void
   1106 ;
   1107 ; MESA-LABEL: @kern_realign_f16_f16(
   1108 ; MESA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1109 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
   1110 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1111 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1112 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
   1113 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
   1114 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
   1115 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
   1116 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
   1117 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
   1118 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
   1119 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
   1120 ; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
   1121 ; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
   1122 ; MESA-NEXT:    ret void
   1123 ;
   1124   store volatile half %arg0, half addrspace(1)* undef
   1125   store volatile half %arg1, half addrspace(1)* undef
   1126   ret void
   1127 }
   1128 
   1129 define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
   1130 ; HSA-LABEL: @kern_global_ptr(
   1131 ; HSA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1132 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
   1133 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1134 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
   1135 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1136 ; HSA-NEXT:    ret void
   1137 ;
   1138 ; MESA-LABEL: @kern_global_ptr(
   1139 ; MESA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1140 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
   1141 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1142 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
   1143 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1144 ; MESA-NEXT:    ret void
   1145 ;
   1146   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
   1147   ret void
   1148 }
   1149 
   1150 define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
   1151 ; HSA-LABEL: @kern_global_ptr_dereferencable(
   1152 ; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1153 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
   1154 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1155 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1
   1156 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1157 ; HSA-NEXT:    ret void
   1158 ;
   1159 ; MESA-LABEL: @kern_global_ptr_dereferencable(
   1160 ; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1161 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
   1162 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1163 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1
   1164 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1165 ; MESA-NEXT:    ret void
   1166 ;
   1167   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
   1168   ret void
   1169 }
   1170 
   1171 define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
   1172 ; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
   1173 ; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1174 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
   1175 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1176 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2
   1177 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1178 ; HSA-NEXT:    ret void
   1179 ;
   1180 ; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
   1181 ; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1182 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
   1183 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1184 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2
   1185 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1186 ; MESA-NEXT:    ret void
   1187 ;
   1188   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
   1189   ret void
   1190 }
   1191 
   1192 define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
   1193 ; HSA-LABEL: @kern_nonnull_global_ptr(
   1194 ; HSA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1195 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
   1196 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1197 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0
   1198 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1199 ; HSA-NEXT:    ret void
   1200 ;
   1201 ; MESA-LABEL: @kern_nonnull_global_ptr(
   1202 ; MESA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1203 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
   1204 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1205 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0
   1206 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1207 ; MESA-NEXT:    ret void
   1208 ;
   1209   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
   1210   ret void
   1211 }
   1212 
   1213 define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
   1214 ; HSA-LABEL: @kern_align32_global_ptr(
   1215 ; HSA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1216 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
   1217 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1218 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3
   1219 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1220 ; HSA-NEXT:    ret void
   1221 ;
   1222 ; MESA-LABEL: @kern_align32_global_ptr(
   1223 ; MESA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1224 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
   1225 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
   1226 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3
   1227 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
   1228 ; MESA-NEXT:    ret void
   1229 ;
   1230   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
   1231   ret void
   1232 }
   1233 
   1234 define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
   1235 ; HSA-LABEL: @kern_noalias_global_ptr(
   1236 ; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1237 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
   1238 ; HSA-NEXT:    ret void
   1239 ;
   1240 ; MESA-LABEL: @kern_noalias_global_ptr(
   1241 ; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1242 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
   1243 ; MESA-NEXT:    ret void
   1244 ;
   1245   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
   1246   ret void
   1247 }
   1248 
   1249 define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
   1250 ; HSA-LABEL: @kern_noalias_global_ptr_x2(
   1251 ; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1252 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
   1253 ; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
   1254 ; HSA-NEXT:    ret void
   1255 ;
   1256 ; MESA-LABEL: @kern_noalias_global_ptr_x2(
   1257 ; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1258 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
   1259 ; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
   1260 ; MESA-NEXT:    ret void
   1261 ;
   1262   store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
   1263   store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
   1264   ret void
   1265 }
   1266 
   1267 define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
   1268 ; HSA-LABEL: @struct_i8_i8_arg(
   1269 ; HSA-NEXT:  entry:
   1270 ; HSA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1271 ; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
   1272 ; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
   1273 ; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
   1274 ; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
   1275 ; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
   1276 ; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
   1277 ; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
   1278 ; HSA-NEXT:    ret void
   1279 ;
   1280 ; MESA-LABEL: @struct_i8_i8_arg(
   1281 ; MESA-NEXT:  entry:
   1282 ; MESA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1283 ; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
   1284 ; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
   1285 ; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
   1286 ; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
   1287 ; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
   1288 ; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
   1289 ; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
   1290 ; MESA-NEXT:    ret void
   1291 ;
   1292 entry:
   1293   %elt0 = extractvalue {i8, i8} %in, 0
   1294   %elt1 = extractvalue {i8, i8} %in, 1
   1295   store volatile i8 %elt0, i8 addrspace(1)* null, align 4
   1296   store volatile i8 %elt1, i8 addrspace(1)* null, align 4
   1297   ret void
   1298 }
   1299 
   1300 define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
   1301 ; HSA-LABEL: @struct_i8_i16_arg(
   1302 ; HSA-NEXT:  entry:
   1303 ; HSA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1304 ; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
   1305 ; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
   1306 ; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
   1307 ; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
   1308 ; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
   1309 ; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
   1310 ; HSA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
   1311 ; HSA-NEXT:    ret void
   1312 ;
   1313 ; MESA-LABEL: @struct_i8_i16_arg(
   1314 ; MESA-NEXT:  entry:
   1315 ; MESA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1316 ; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
   1317 ; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
   1318 ; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
   1319 ; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
   1320 ; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
   1321 ; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
   1322 ; MESA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
   1323 ; MESA-NEXT:    ret void
   1324 ;
   1325 entry:
   1326   %elt0 = extractvalue {i8, i16} %in, 0
   1327   %elt1 = extractvalue {i8, i16} %in, 1
   1328   store volatile i8 %elt0, i8 addrspace(1)* null, align 4
   1329   store volatile i16 %elt1, i16 addrspace(1)* null, align 4
   1330   ret void
   1331 }
   1332 
   1333 define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
   1334 ; HSA-LABEL: @array_2xi8_arg(
   1335 ; HSA-NEXT:  entry:
   1336 ; HSA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1337 ; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
   1338 ; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
   1339 ; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
   1340 ; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
   1341 ; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
   1342 ; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
   1343 ; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
   1344 ; HSA-NEXT:    ret void
   1345 ;
   1346 ; MESA-LABEL: @array_2xi8_arg(
   1347 ; MESA-NEXT:  entry:
   1348 ; MESA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1349 ; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
   1350 ; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
   1351 ; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
   1352 ; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
   1353 ; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
   1354 ; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
   1355 ; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
   1356 ; MESA-NEXT:    ret void
   1357 ;
   1358 entry:
   1359   %elt0 = extractvalue [2 x i8] %in, 0
   1360   %elt1 = extractvalue [2 x i8] %in, 1
   1361   store volatile i8 %elt0, i8 addrspace(1)* null, align 4
   1362   store volatile i8 %elt1, i8 addrspace(1)* null, align 4
   1363   ret void
   1364 }
   1365 
   1366 define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
   1367 ; HSA-LABEL: @array_2xi1_arg(
   1368 ; HSA-NEXT:  entry:
   1369 ; HSA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1370 ; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
   1371 ; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
   1372 ; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
   1373 ; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
   1374 ; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
   1375 ; HSA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
   1376 ; HSA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
   1377 ; HSA-NEXT:    ret void
   1378 ;
   1379 ; MESA-LABEL: @array_2xi1_arg(
   1380 ; MESA-NEXT:  entry:
   1381 ; MESA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1382 ; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
   1383 ; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
   1384 ; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
   1385 ; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
   1386 ; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
   1387 ; MESA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
   1388 ; MESA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
   1389 ; MESA-NEXT:    ret void
   1390 ;
   1391 entry:
   1392   %elt0 = extractvalue [2 x i1] %in, 0
   1393   %elt1 = extractvalue [2 x i1] %in, 1
   1394   store volatile i1 %elt0, i1 addrspace(1)* null, align 4
   1395   store volatile i1 %elt1, i1 addrspace(1)* null, align 4
   1396   ret void
   1397 }
   1398 
   1399 define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
   1400 ; HSA-LABEL: @only_empty_struct(
   1401 ; HSA-NEXT:    ret void
   1402 ;
   1403 ; MESA-LABEL: @only_empty_struct(
   1404 ; MESA-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1405 ; MESA-NEXT:    ret void
   1406 ;
   1407   ret void
   1408 }
   1409 
   1410 define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
   1411 ; HSA-LABEL: @empty_struct_with_other(
   1412 ; HSA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1413 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
   1414 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
   1415 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
   1416 ; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
   1417 ; HSA-NEXT:    ret void
   1418 ;
   1419 ; MESA-LABEL: @empty_struct_with_other(
   1420 ; MESA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   1421 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
   1422 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
   1423 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
   1424 ; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
   1425 ; MESA-NEXT:    ret void
   1426 ;
   1427   store i32 %arg1, i32 addrspace(1)* undef
   1428   ret void
   1429 }
   1430 
   1431 attributes #0 = { nounwind "target-cpu"="kaveri" }
   1432 attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
   1433 attributes #2 = { nounwind "target-cpu"="tahiti" }
   1434 
   1435 ; GCN: 0 = !{}
   1436 ; GCN: !1 = !{i64 42}
   1437 ; GCN: !2 = !{i64 128}
   1438 ; GCN: !3 = !{i64 1024}
   1439