; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Implement ctpop with vcnt

      4 define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
      5 ;CHECK: vcnt8:
      6 ;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
      7 	%tmp1 = load <8 x i8>* %A
      8 	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
      9 	ret <8 x i8> %tmp2
     10 }
     11 
     12 define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
     13 ;CHECK: vcntQ8:
     14 ;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
     15 	%tmp1 = load <16 x i8>* %A
     16 	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
     17 	ret <16 x i8> %tmp2
     18 }
     19 
     20 define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
     21 ; CHECK: vcnt16:
     22 ; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
     23 ; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
     24 ; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
     25 ; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
     26 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     27 	%tmp1 = load <4 x i16>* %A
     28 	%tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
     29 	ret <4 x i16> %tmp2
     30 }
     31 
     32 define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
     33 ; CHECK: vcntQ16:
     34 ; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
     35 ; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
     36 ; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
     37 ; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
     38 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     39 	%tmp1 = load <8 x i16>* %A
     40 	%tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
     41 	ret <8 x i16> %tmp2
     42 }
     43 
     44 define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
     45 ; CHECK: vcnt32:
     46 ; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
     47 ; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
     48 ; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
     49 ; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
     50 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     51 ; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
     52 ; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
     53 ; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
     54 	%tmp1 = load <2 x i32>* %A
     55 	%tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
     56 	ret <2 x i32> %tmp2
     57 }
     58 
     59 define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
     60 ; CHECK: vcntQ32:
     61 ; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
     62 ; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
     63 ; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
     64 ; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
     65 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     66 ; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
     67 ; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
     68 ; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
     69 	%tmp1 = load <4 x i32>* %A
     70 	%tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
     71 	ret <4 x i32> %tmp2
     72 }
     73 
     74 declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
     75 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
     76 declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
     77 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
     78 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
     79 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
     80 
     81 define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
     82 ;CHECK: vclz8:
     83 ;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
     84 	%tmp1 = load <8 x i8>* %A
     85 	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
     86 	ret <8 x i8> %tmp2
     87 }
     88 
     89 define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
     90 ;CHECK: vclz16:
     91 ;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
     92 	%tmp1 = load <4 x i16>* %A
     93 	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
     94 	ret <4 x i16> %tmp2
     95 }
     96 
     97 define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
     98 ;CHECK: vclz32:
     99 ;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
    100 	%tmp1 = load <2 x i32>* %A
    101 	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
    102 	ret <2 x i32> %tmp2
    103 }
    104 
    105 define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
    106 ;CHECK: vclzQ8:
    107 ;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
    108 	%tmp1 = load <16 x i8>* %A
    109 	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
    110 	ret <16 x i8> %tmp2
    111 }
    112 
    113 define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
    114 ;CHECK: vclzQ16:
    115 ;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
    116 	%tmp1 = load <8 x i16>* %A
    117 	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
    118 	ret <8 x i16> %tmp2
    119 }
    120 
    121 define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
    122 ;CHECK: vclzQ32:
    123 ;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
    124 	%tmp1 = load <4 x i32>* %A
    125 	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
    126 	ret <4 x i32> %tmp2
    127 }
    128 
    129 declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
    130 declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
    131 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
    132 
    133 declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
    134 declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
    135 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
    136 
    137 define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
    138 ;CHECK: vclss8:
    139 ;CHECK: vcls.s8
    140 	%tmp1 = load <8 x i8>* %A
    141 	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
    142 	ret <8 x i8> %tmp2
    143 }
    144 
    145 define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
    146 ;CHECK: vclss16:
    147 ;CHECK: vcls.s16
    148 	%tmp1 = load <4 x i16>* %A
    149 	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
    150 	ret <4 x i16> %tmp2
    151 }
    152 
    153 define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
    154 ;CHECK: vclss32:
    155 ;CHECK: vcls.s32
    156 	%tmp1 = load <2 x i32>* %A
    157 	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
    158 	ret <2 x i32> %tmp2
    159 }
    160 
    161 define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
    162 ;CHECK: vclsQs8:
    163 ;CHECK: vcls.s8
    164 	%tmp1 = load <16 x i8>* %A
    165 	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
    166 	ret <16 x i8> %tmp2
    167 }
    168 
    169 define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
    170 ;CHECK: vclsQs16:
    171 ;CHECK: vcls.s16
    172 	%tmp1 = load <8 x i16>* %A
    173 	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
    174 	ret <8 x i16> %tmp2
    175 }
    176 
    177 define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
    178 ;CHECK: vclsQs32:
    179 ;CHECK: vcls.s32
    180 	%tmp1 = load <4 x i32>* %A
    181 	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
    182 	ret <4 x i32> %tmp2
    183 }
    184 
    185 declare <8 x i8>  @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
    186 declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
    187 declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
    188 
    189 declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
    190 declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
    191 declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
    192