Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
      2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
      3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
      4 
      5 
      6 ; GCN-LABEL: {{^}}s_pack_v2f16:
        ; Packs the low 16 bits of two loaded i32 values into a <2 x half>
        ; (first load in element 0, second load in element 1). The gfx900 checks
        ; expect both loads in s-registers and a single s_pack_ll_b32_b16 whose
        ; result is the operand of the inline asm.
      7 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
      8 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
      9 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
     10 ; GFX9: ; use [[PACKED]]
     11 define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
          ; Both loads are volatile and issued in the order in0, in1.
     12   %val0 = load volatile i32, i32 addrspace(4)* %in0
     13   %val1 = load volatile i32, i32 addrspace(4)* %in1
          ; Keep only the low 16 bits of each value and reinterpret them as half.
     14   %lo.i = trunc i32 %val0 to i16
     15   %hi.i = trunc i32 %val1 to i16
     16   %lo = bitcast i16 %lo.i to half
     17   %hi = bitcast i16 %hi.i to half
          ; Build the vector one element at a time, then view it as a single i32.
     18   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
     19   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
     20   %vec.i32 = bitcast <2 x half> %vec.1 to i32
     21 
          ; The side-effecting asm gives the packed value a use that shows up in
          ; the output; the "s" constraint is what the s-register check relies on.
     22   call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
     23   ret void
     24 }
     25 
     26 ; GCN-LABEL: {{^}}s_pack_v2f16_imm_lo:
        ; Same packing as a scalar operation, but element 0 is the constant half
        ; 0xH1234 and element 1 comes from the load. The gfx900 checks expect the
        ; literal 0x1234 as the first source of s_pack_ll_b32_b16 and the loaded
        ; register as the second.
     27 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
     28 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]]
     29 ; GFX9: ; use [[PACKED]]
     30 define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 {
          ; Unlike most tests in this listing, this load is not volatile.
     31   %val1 = load i32, i32 addrspace(4)* %in1
     32   %hi.i = trunc i32 %val1 to i16
     33   %hi = bitcast i16 %hi.i to half
          ; Constant in the low element, loaded value in the high element.
     34   %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
     35   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
     36   %vec.i32 = bitcast <2 x half> %vec.1 to i32
     37 
          ; Side-effecting asm use of the packed value via the "s" constraint.
     38   call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
     39   ret void
     40 }
     41 
     42 ; GCN-LABEL: {{^}}s_pack_v2f16_imm_hi:
        ; Mirror of the low-constant scalar case; element 0 comes from the load
        ; and element 1 is the constant half 0xH1234. The gfx900 checks expect
        ; the loaded register as the first source of s_pack_ll_b32_b16 and the
        ; literal 0x1234 as the second.
     43 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
     44 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234
     45 ; GFX9: ; use [[PACKED]]
     46 define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
          ; Non-volatile load, as in the low-constant scalar case.
     47   %val0 = load i32, i32 addrspace(4)* %in0
     48   %lo.i = trunc i32 %val0 to i16
     49   %lo = bitcast i16 %lo.i to half
          ; Loaded value in the low element, constant in the high element.
     50   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
     51   %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
     52   %vec.i32 = bitcast <2 x half> %vec.1 to i32
     53 
          ; Side-effecting asm use of the packed value via the "s" constraint.
     54   call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
     55   ret void
     56 }
     57 
     58 ; GCN-LABEL: {{^}}v_pack_v2f16:
        ; Vector-register variant; each work item loads its own pair of i32
        ; values, indexed by the workitem id, and packs their low 16 bits into a
        ; <2 x half>. The gfx900 checks expect the first value masked with 0xffff
        ; and then one v_lshl_or_b32 that shifts the second value left by 16 and
        ; ors in the masked first value.
     59 ; GFX9: global_load_dword [[VAL0:v[0-9]+]]
     60 ; GFX9: global_load_dword [[VAL1:v[0-9]+]]
     61 
     62 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
     63 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
     64 ; GFX9: ; use [[PACKED]]
     65 define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
          ; Per-work-item addresses; the id is sign-extended to index both inputs.
     66   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     67   %tid.ext = sext i32 %tid to i64
     68   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
     69   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
          ; Volatile loads, issued in the order in0, in1.
     70   %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
     71   %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
          ; Low 16 bits of each value, reinterpreted as half.
     72   %lo.i = trunc i32 %val0 to i16
     73   %hi.i = trunc i32 %val1 to i16
     74   %lo = bitcast i16 %lo.i to half
     75   %hi = bitcast i16 %hi.i to half
     76   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
     77   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
     78   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Side-effecting asm use of the packed value via the "v" constraint.
     79   call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
     80   ret void
     81 }
     82 
     83 ; GCN-LABEL: {{^}}v_pack_v2f16_user:
        ; Same vector packing as the plain two-load case, but the packed i32 is
        ; consumed by an ordinary add instead of inline asm. The gfx900 checks
        ; expect the same mask plus v_lshl_or_b32 sequence followed by a
        ; v_add_u32_e32 of the constant 9 and the packed register.
     84 ; GFX9: global_load_dword [[VAL0:v[0-9]+]]
     85 ; GFX9: global_load_dword [[VAL1:v[0-9]+]]
     86 
     87 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
     88 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
     89 
     90 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]]
     91 define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
          ; Per-work-item addresses; the id is sign-extended to index both inputs.
     92   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     93   %tid.ext = sext i32 %tid to i64
     94   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
     95   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
          ; Volatile loads, issued in the order in0, in1.
     96   %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
     97   %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
          ; Low 16 bits of each value, reinterpreted as half and packed.
     98   %lo.i = trunc i32 %val0 to i16
     99   %hi.i = trunc i32 %val1 to i16
    100   %lo = bitcast i16 %lo.i to half
    101   %hi = bitcast i16 %hi.i to half
    102   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
    103   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
    104   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Integer user of the packed value; the volatile store (to an undef
          ; address) keeps the add from being dropped.
    105   %foo = add i32 %vec.i32, 9
    106   store volatile i32 %foo, i32 addrspace(1)* undef
    107   ret void
    108 }
    109 
    110 ; GCN-LABEL: {{^}}v_pack_v2f16_imm_lo:
        ; Vector variant with the constant half 0xH1234 in element 0 and the
        ; loaded value in element 1. The gfx900 checks expect 0x1234 to be moved
        ; into a v-register (the load and the move may appear in either order)
        ; and then used as the or-operand of v_lshl_or_b32. The trailing {{$}} on
        ; the move check anchors the literal at end of line.
    111 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]]
    112 
    113 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}}
    114 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
    115 ; GFX9: ; use [[PACKED]]
    116 define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
          ; Per-work-item address derived from the sign-extended workitem id.
    117   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    118   %tid.ext = sext i32 %tid to i64
    119   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
    120   %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
    121   %hi.i = trunc i32 %val1 to i16
    122   %hi = bitcast i16 %hi.i to half
          ; Constant in the low element, loaded value in the high element.
    123   %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
    124   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
    125   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Side-effecting asm use of the packed value via the "v" constraint.
    126   call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
    127   ret void
    128 }
    129 
    130 ; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_lo:
        ; Vector variant with the constant half 4.0 in element 0 and the loaded
        ; value in element 1. The gfx900 checks expect the constant to appear as
        ; the bit pattern 0x4400 moved into a v-register (load and move may appear
        ; in either order) and then used as the or-operand of v_lshl_or_b32.
    131 ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]]
    132 
    133 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4400{{$}}
    134 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
    135 
    136 ; GFX9: ; use [[PACKED]]
    137 define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
          ; Per-work-item address derived from the sign-extended workitem id.
    138   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    139   %tid.ext = sext i32 %tid to i64
    140   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
    141   %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
    142   %hi.i = trunc i32 %val1 to i16
    143   %hi = bitcast i16 %hi.i to half
          ; Floating-point constant in the low element, loaded value in the high.
    144   %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0
    145   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
    146   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Side-effecting asm use of the packed value via the "v" constraint.
    147   call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
    148   ret void
    149 }
    150 
    151 ; GCN-LABEL: {{^}}v_pack_v2f16_imm_hi:
        ; Vector variant with the loaded value in element 0 and the constant half
        ; 0xH1234 in element 1. The gfx900 checks expect 0x1234 in an s-register
        ; via s_movk_i32 (in either order relative to the load), the loaded value
        ; masked with 0xffff, and v_lshl_or_b32 shifting the constant left by 16
        ; before or-ing in the masked value.
    152 ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]]
    153 
    154 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
    155 ; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
    156 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
    157 
    158 ; GFX9: ; use [[PACKED]]
    159 define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
          ; Per-work-item address derived from the sign-extended workitem id.
    160   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    161   %tid.ext = sext i32 %tid to i64
    162   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
    163   %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
    164   %lo.i = trunc i32 %val0 to i16
    165   %lo = bitcast i16 %lo.i to half
          ; Loaded value in the low element, constant in the high element.
    166   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
    167   %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
    168   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Side-effecting asm use of the packed value via the "v" constraint.
    169   call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
    170   ret void
    171 }
    172 
    173 ; GCN-LABEL: {{^}}v_pack_v2f16_inline_f16imm_hi:
        ; Vector variant with the loaded value in element 0 and the constant half
        ; 1.0 in element 1. The gfx900 checks expect the constant as the bit
        ; pattern 0x3c00 in an s-register via s_movk_i32 (in either order relative
        ; to the load), then the same mask plus v_lshl_or_b32 sequence as the
        ; other high-constant vector case.
    174 ; GFX9-DAG: global_load_dword [[VAL:v[0-9]+]]
    175 
    176 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3c00
    177 ; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
    178 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
    179 
    180 ; GFX9: ; use [[PACKED]]
    181 define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
          ; Per-work-item address derived from the sign-extended workitem id.
    182   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    183   %tid.ext = sext i32 %tid to i64
    184   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
    185   %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
    186   %lo.i = trunc i32 %val0 to i16
    187   %lo = bitcast i16 %lo.i to half
          ; Loaded value in the low element, floating-point constant in the high.
    188   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
    189   %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1
    190   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Side-effecting asm use of the packed value via the "v" constraint.
    191   call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
    192   ret void
    193 }
    194 
    195 ; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_hi:
        ; Vector variant with the loaded value in element 0 and the constant half
        ; 0xH0040 in element 1. The gfx900 checks expect the constant to appear
        ; directly as the integer operand 64 of v_lshl_or_b32, with no check for a
        ; separate move, after the loaded value is masked with 0xffff.
    196 ; GFX9: global_load_dword [[VAL:v[0-9]+]]
    197 
    198 ; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
    199 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]]
    200 
    201 ; GFX9: ; use [[PACKED]]
    202 define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
          ; Per-work-item address derived from the sign-extended workitem id.
    203   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    204   %tid.ext = sext i32 %tid to i64
    205   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
    206   %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
    207   %lo.i = trunc i32 %val0 to i16
    208   %lo = bitcast i16 %lo.i to half
          ; Loaded value in the low element; the high element is the half whose
          ; bit pattern is 0x0040 (decimal 64).
    209   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
    210   %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1
    211   %vec.i32 = bitcast <2 x half> %vec.1 to i32
          ; Side-effecting asm use of the packed value via the "v" constraint.
    212   call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
    213   ret void
    214 }
    215 
    216 declare i32 @llvm.amdgcn.workitem.id.x() #1
        ; The declaration above is the intrinsic the vector tests call to obtain a
        ; per-work-item index; it carries attribute group 1.
    217 
        ; Attribute group 0 is attached to the kernel definitions and inline asm
        ; calls in this listing; group 1 additionally marks the intrinsic readnone.
    218 attributes #0 = { nounwind }
    219 attributes #1 = { nounwind readnone }
    220