Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
      2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
      3 
      4 ; SI-LABEL: {{^}}load_i8_to_f32:
      5 ; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
      6 ; SI-NOT: bfe
      7 ; SI-NOT: lshr
      8 ; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
      9 ; SI: buffer_store_dword [[CONV]],
        ; Scalar case: loads a single i8 from %in, converts it to float with
        ; uitofp, and stores the result to %out. The checks above expect the
        ; loaded byte to feed v_cvt_f32_ubyte0 directly, with no bfe or lshr
        ; between the load and the conversion.
     10 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
     11   %load = load i8, i8 addrspace(1)* %in, align 1
     12   %cvt = uitofp i8 %load to float
     13   store float %cvt, float addrspace(1)* %out, align 4
     14   ret void
     15 }
     16 
     17 ; SI-LABEL: {{^}}load_v2i8_to_v2f32:
     18 ; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
     19 ; SI-NOT: bfe
     20 ; SI-NOT: lshr
     21 ; SI-NOT: and
     22 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
     23 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
     24 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
        ; Two-element case: loads a <2 x i8> from %in, converts both lanes to
        ; float with uitofp, and stores the <2 x float> to %out. The checks
        ; above expect one 16-bit load whose register is read by both the
        ; ubyte0 and ubyte1 conversions (no bfe, lshr or and in between), and
        ; a single 64-bit store of the register pair formed by the ubyte0
        ; result (low) and the ubyte1 result (high).
     25 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
     26   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
     27   %cvt = uitofp <2 x i8> %load to <2 x float>
     28   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
     29   ret void
     30 }
     31 
     32 ; SI-LABEL: {{^}}load_v3i8_to_v3f32:
     33 ; SI-NOT: bfe
     34 ; SI-NOT: v_cvt_f32_ubyte3_e32
     35 ; SI-DAG: v_cvt_f32_ubyte2_e32
        ; The low/high result registers are captured here, inside this
        ; function's own checks, so that the store check below does not depend
        ; on values left over from an earlier function's captures.
     36 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]],
     37 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
     38 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
        ; Three-element case: loads a <3 x i8> from %in, converts each lane to
        ; float with uitofp, and stores the <3 x float> to %out. The checks
        ; above expect conversions of bytes 0, 1 and 2 only (never byte 3, and
        ; no bfe), and a 64-bit store of the register pair formed by the
        ; byte-0 result (low) and the byte-1 result (high).
     39 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
     40   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
     41   %cvt = uitofp <3 x i8> %load to <3 x float>
     42   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
     43   ret void
     44 }
     45 
     46 ; SI-LABEL: {{^}}load_v4i8_to_v4f32:
     47 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
     48 ; SI-NOT: bfe
     49 ; SI-NOT: lshr
     50 ; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
     51 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
     52 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
     53 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
     54 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
        ; Four-element, 4-byte-aligned case: loads a <4 x i8> from %in,
        ; converts each lane to float with uitofp, and stores the <4 x float>
        ; to %out. The checks above expect one 32-bit load whose register is
        ; the source of all four ubyteN conversions (no bfe or lshr), and one
        ; 128-bit store of the range from the byte-0 result to the byte-3
        ; result.
     55 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
     56   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
     57   %cvt = uitofp <4 x i8> %load to <4 x float>
     58   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
     59   ret void
     60 }
     61 
     62 ; This should not be adding instructions to shift into the correct
     63 ; position in the word for the component.
     64 
     65 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
     66 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
     67 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
     68 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
     69 ; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
     70 ; SI-NOT: v_lshlrev_b32
     71 ; SI-NOT: v_or_b32
     72 
     73 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
     74 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
     75 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
     76 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
     77 
     78 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
        ; Same conversion as the aligned <4 x i8> test, but the vector load is
        ; only 1-byte aligned. The checks above expect four separate byte
        ; loads, each converted on its own with v_cvt_f32_ubyte0, and no
        ; v_lshlrev_b32 / v_or_b32 packing the bytes back into one word before
        ; the conversions. The four results are stored with one 128-bit store.
     79 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
     80   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
     81   %cvt = uitofp <4 x i8> %load to <4 x float>
     82   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
     83   ret void
     84 }
     85 
     86 ; XXX - This should really still be able to use the v_cvt_f32_ubyte0
     87 ; for each component, but computeKnownBits doesn't handle vectors very
     88 ; well.
     89 
     90 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
     91 ; SI: buffer_load_ubyte
     92 ; SI: buffer_load_ubyte
     93 ; SI: buffer_load_ubyte
     94 ; SI: buffer_load_ubyte
     95 ; SI: v_cvt_f32_ubyte0_e32
     96 ; SI: v_cvt_f32_ubyte0_e32
     97 ; SI: v_cvt_f32_ubyte0_e32
     98 ; SI: v_cvt_f32_ubyte0_e32
     99 
    100 ; XXX - replace with this when v4i8 loads aren't scalarized anymore.
    101 ; XSI: buffer_load_dword
    102 ; XSI: v_cvt_f32_u32_e32
    103 ; XSI: v_cvt_f32_u32_e32
    104 ; XSI: v_cvt_f32_u32_e32
    105 ; XSI: v_cvt_f32_u32_e32
    106 ; SI: s_endpgm
        ; The loaded <4 x i8> has two users: the uitofp whose result is stored
        ; to %out, and an integer add of 9 to every lane whose result is stored
        ; to %out2. The active checks expect four byte loads followed by four
        ; v_cvt_f32_ubyte0 conversions; the XSI lines are disabled expectations
        ; kept for the future, as the XXX note above explains.
    107 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
    108   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
    109   %cvt = uitofp <4 x i8> %load to <4 x float>
    110   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
    111   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
    112   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
    113   ret void
    114 }
    115 
    116 ; Make sure this doesn't crash.
    117 ; SI-LABEL: {{^}}load_v7i8_to_v7f32:
    118 ; SI: s_endpgm
        ; Odd-sized vector case: loads a 1-byte-aligned <7 x i8> from %in,
        ; converts every lane to float with uitofp, and stores the <7 x float>
        ; to %out. Only the function label and the program end are checked; no
        ; particular instruction sequence is required.
    119 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
    120   %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
    121   %cvt = uitofp <7 x i8> %load to <7 x float>
    122   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
    123   ret void
    124 }
    125 
    126 ; SI-LABEL: {{^}}load_v8i8_to_v8f32:
    127 ; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
    128 ; SI-NOT: bfe
    129 ; SI-NOT: lshr
    130 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
    131 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
    132 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
    133 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
    134 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
    135 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
    136 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
    137 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
    138 ; SI-NOT: bfe
    139 ; SI-NOT: lshr
    140 ; SI: buffer_store_dwordx4
    141 ; SI: buffer_store_dwordx4
        ; Eight-element case: loads an 8-byte-aligned <8 x i8> from %in,
        ; converts every lane to float with uitofp, and stores the <8 x float>
        ; to %out. The checks above expect one 64-bit load, then the ubyte0..3
        ; conversions applied to each of the two loaded registers (no bfe or
        ; lshr before or after them), and two 128-bit stores.
    142 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
    143   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
    144   %cvt = uitofp <8 x i8> %load to <8 x float>
    145   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
    146   ret void
    147 }
    148 
    149 ; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
    150 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
    151 ; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
    152 ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
    153 ; SI: buffer_store_dword [[CONV]],
        ; Loads an i32, adds 2, masks the sum down to its low 8 bits (and 255),
        ; converts that to float with uitofp, and stores it to %out. The checks
        ; above require the conversion to be the instruction immediately after
        ; the add and to read the add result directly, i.e. no separate
        ; masking instruction sits between them.
    154 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
    155   %load = load i32, i32 addrspace(1)* %in, align 4
    156   %add = add i32 %load, 2
    157   %inreg = and i32 %add, 255
    158   %cvt = uitofp i32 %inreg to float
    159   store float %cvt, float addrspace(1)* %out, align 4
    160   ret void
    161 }
    162 
    163 ; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
        ; Loads an i32, isolates bits 8..15 (and 65280, i.e. 0xFF00, followed
        ; by a logical shift right by 8), converts that byte to float with
        ; uitofp, and stores it to %out.
        ; NOTE(review): only the function label is checked here; no instruction
        ; expectations exist for this function. Presumably the intent is for
        ; this to select the byte-1 conversion -- TODO confirm against actual
        ; llc output before adding such a check.
    164 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
    165   %load = load i32, i32 addrspace(1)* %in, align 4
    166   %inreg = and i32 %load, 65280
    167   %shr = lshr i32 %inreg, 8
    168   %cvt = uitofp i32 %shr to float
    169   store float %cvt, float addrspace(1)* %out, align 4
    170   ret void
    171 }
    172 
    173 
    174 ; We don't get these ones because of the zext, but instcombine removes
    175 ; them so it shouldn't really matter.
        ; No particular conversion instruction is required for the two
        ; functions below; each is anchored by its label and only checked to
        ; compile through to the end of the program.

        ; SI-LABEL: {{^}}i8_zext_i32_to_f32:
        ; SI: s_endpgm
        ; Loads an i8 from %in, zero-extends it to i32 explicitly, converts the
        ; i32 to float with uitofp, and stores the result to %out.
    176 define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
    177   %load = load i8, i8 addrspace(1)* %in, align 1
    178   %ext = zext i8 %load to i32
    179   %cvt = uitofp i32 %ext to float
    180   store float %cvt, float addrspace(1)* %out, align 4
    181   ret void
    182 }
    183 
        ; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
        ; SI: s_endpgm
        ; Vector form of the same pattern: loads a 1-byte-aligned <4 x i8>,
        ; zero-extends it to <4 x i32>, converts to <4 x float> with uitofp,
        ; and stores the result to %out.
    184 define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
    185   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
    186   %ext = zext <4 x i8> %load to <4 x i32>
    187   %cvt = uitofp <4 x i32> %ext to <4 x float>
    188   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
    189   ret void
    190 }
    191