Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
      2 ; Verify NEON lowering: ctpop via vcnt, ctlz via vclz, and the vcls intrinsic via vcls.
      3 
      4 define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind { ; ctpop on a 64-bit vector of bytes
      5 ;CHECK-LABEL: vcnt8:
      6 ;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
      7   %vec = load <8 x i8>, <8 x i8>* %A
      8   %popcnt = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %vec) ; expected to select one D-register vcnt.8
      9   ret <8 x i8> %popcnt
     10 }
     11 
     12 define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind { ; ctpop on a 128-bit vector of bytes
     13 ;CHECK-LABEL: vcntQ8:
     14 ;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
     15   %vec = load <16 x i8>, <16 x i8>* %A
     16   %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %vec) ; expected to select one Q-register vcnt.8
     17   ret <16 x i8> %popcnt
     18 }
     19 
     20 define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind { ; ctpop on 4 x i16: a byte-wise vcnt.8 expansion is expected
     21 ; CHECK-LABEL: vcnt16:
     22 ; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
     23 ; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
     24 ; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
     25 ; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
     26 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     27   %vec = load <4 x i16>, <4 x i16>* %A
     28   %popcnt = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %vec) ; byte counts, then vrev16/vadd/vuzp/vmovl combine them per i16 lane
     29   ret <4 x i16> %popcnt
     30 }
     31 
     32 define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind { ; ctpop on 8 x i16: Q-register variant of the byte-wise expansion
     33 ; CHECK-LABEL: vcntQ16:
     34 ; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
     35 ; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
     36 ; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
     37 ; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
     38 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     39   %vec = load <8 x i16>, <8 x i16>* %A
     40   %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %vec) ; byte counts, then vrev16/vadd/vuzp/vmovl combine them per i16 lane
     41   ret <8 x i16> %popcnt
     42 }
     43 
     44 define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind { ; ctpop on 2 x i32: the i16 expansion plus one more combining stage
     45 ; CHECK-LABEL: vcnt32:
     46 ; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
     47 ; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
     48 ; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
     49 ; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
     50 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     51 ; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
     52 ; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
     53 ; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
     54   %vec = load <2 x i32>, <2 x i32>* %A
     55   %popcnt = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %vec) ; i16 partial counts are merged via vrev32.16/vuzp.16/vmovl.u16
     56   ret <2 x i32> %popcnt
     57 }
     58 
     59 define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind { ; ctpop on 4 x i32: Q-register variant of the two-stage expansion
     60 ; CHECK-LABEL: vcntQ32:
     61 ; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
     62 ; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
     63 ; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
     64 ; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
     65 ; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
     66 ; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
     67 ; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
     68 ; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
     69   %vec = load <4 x i32>, <4 x i32>* %A
     70   %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %vec) ; i16 partial counts are merged via vrev32.16/vuzp.16/vmovl.u16
     71   ret <4 x i32> %popcnt
     72 }
     73 
     74 define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind { ; ctpop on 1 x i64: only the function label is verified, no instruction sequence is pinned
     75 ; CHECK-LABEL: vcnt64:
     76   %vec = load <1 x i64>, <1 x i64>* %A
     77   %popcnt = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %vec) ; must merely compile without crashing
     78   ret <1 x i64> %popcnt
     79 }
     80 
     81 define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind { ; ctpop on 2 x i64: only the function label is verified, no instruction sequence is pinned
     82 ; CHECK-LABEL: vcntQ64:
     83   %vec = load <2 x i64>, <2 x i64>* %A
     84   %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %vec) ; must merely compile without crashing
     85   ret <2 x i64> %popcnt
     86 }
     87 
     88 declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
     89 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
     90 declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
     91 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
     92 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
     93 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
     94 declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
     95 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
     96 
     97 define <8 x i8> @vclz8(<8 x i8>* %A) nounwind { ; ctlz on a 64-bit vector of bytes
     98 ;CHECK-LABEL: vclz8:
     99 ;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
    100   %vec = load <8 x i8>, <8 x i8>* %A
    101   %lz = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %vec, i1 false) ; false: result stays defined for a zero lane
    102   ret <8 x i8> %lz
    103 }
    104 
    105 define <4 x i16> @vclz16(<4 x i16>* %A) nounwind { ; ctlz on a 64-bit vector of i16
    106 ;CHECK-LABEL: vclz16:
    107 ;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
    108   %vec = load <4 x i16>, <4 x i16>* %A
    109   %lz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %vec, i1 false) ; false: result stays defined for a zero lane
    110   ret <4 x i16> %lz
    111 }
    112 
    113 define <2 x i32> @vclz32(<2 x i32>* %A) nounwind { ; ctlz on a 64-bit vector of i32
    114 ;CHECK-LABEL: vclz32:
    115 ;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
    116   %vec = load <2 x i32>, <2 x i32>* %A
    117   %lz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %vec, i1 false) ; false: result stays defined for a zero lane
    118   ret <2 x i32> %lz
    119 }
    120 
    121 define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind { ; ctlz on a 128-bit vector of bytes
    122 ;CHECK-LABEL: vclzQ8:
    123 ;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
    124   %vec = load <16 x i8>, <16 x i8>* %A
    125   %lz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %vec, i1 false) ; false: result stays defined for a zero lane
    126   ret <16 x i8> %lz
    127 }
    128 
    129 define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind { ; ctlz on a 128-bit vector of i16
    130 ;CHECK-LABEL: vclzQ16:
    131 ;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
    132   %vec = load <8 x i16>, <8 x i16>* %A
    133   %lz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %vec, i1 false) ; false: result stays defined for a zero lane
    134   ret <8 x i16> %lz
    135 }
    136 
    137 define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind { ; ctlz on a 128-bit vector of i32
    138 ;CHECK-LABEL: vclzQ32:
    139 ;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
    140   %vec = load <4 x i32>, <4 x i32>* %A
    141   %lz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %vec, i1 false) ; false: result stays defined for a zero lane
    142   ret <4 x i32> %lz
    143 }
    144 
    145 declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
    146 declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
    147 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
    148 
    149 declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
    150 declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
    151 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
    152 
    153 define <8 x i8> @vclss8(<8 x i8>* %A) nounwind { ; NEON vcls intrinsic on a 64-bit vector of bytes
    154 ;CHECK-LABEL: vclss8:
    155 ;CHECK: vcls.s8 {{d[0-9]+}}, {{d[0-9]+}}
    156 	%tmp1 = load <8 x i8>, <8 x i8>* %A
    157 	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1) ; operands pinned to D registers so the Q form cannot satisfy this test
    158 	ret <8 x i8> %tmp2
    159 }
    160 
    161 define <4 x i16> @vclss16(<4 x i16>* %A) nounwind { ; NEON vcls intrinsic on a 64-bit vector of i16
    162 ;CHECK-LABEL: vclss16:
    163 ;CHECK: vcls.s16 {{d[0-9]+}}, {{d[0-9]+}}
    164 	%tmp1 = load <4 x i16>, <4 x i16>* %A
    165 	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1) ; operands pinned to D registers so the Q form cannot satisfy this test
    166 	ret <4 x i16> %tmp2
    167 }
    168 
    169 define <2 x i32> @vclss32(<2 x i32>* %A) nounwind { ; NEON vcls intrinsic on a 64-bit vector of i32
    170 ;CHECK-LABEL: vclss32:
    171 ;CHECK: vcls.s32 {{d[0-9]+}}, {{d[0-9]+}}
    172 	%tmp1 = load <2 x i32>, <2 x i32>* %A
    173 	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1) ; operands pinned to D registers so the Q form cannot satisfy this test
    174 	ret <2 x i32> %tmp2
    175 }
    176 
    177 define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind { ; NEON vcls intrinsic on a 128-bit vector of bytes
    178 ;CHECK-LABEL: vclsQs8:
    179 ;CHECK: vcls.s8 {{q[0-9]+}}, {{q[0-9]+}}
    180 	%tmp1 = load <16 x i8>, <16 x i8>* %A
    181 	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1) ; operands pinned to Q registers so the D form cannot satisfy this test
    182 	ret <16 x i8> %tmp2
    183 }
    184 
    185 define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind { ; NEON vcls intrinsic on a 128-bit vector of i16
    186 ;CHECK-LABEL: vclsQs16:
    187 ;CHECK: vcls.s16 {{q[0-9]+}}, {{q[0-9]+}}
    188 	%tmp1 = load <8 x i16>, <8 x i16>* %A
    189 	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1) ; operands pinned to Q registers so the D form cannot satisfy this test
    190 	ret <8 x i16> %tmp2
    191 }
    192 
    193 define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind { ; NEON vcls intrinsic on a 128-bit vector of i32
    194 ;CHECK-LABEL: vclsQs32:
    195 ;CHECK: vcls.s32 {{q[0-9]+}}, {{q[0-9]+}}
    196 	%tmp1 = load <4 x i32>, <4 x i32>* %A
    197 	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1) ; operands pinned to Q registers so the D form cannot satisfy this test
    198 	ret <4 x i32> %tmp2
    199 }
    200 
    201 declare <8 x i8>  @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
    202 declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
    203 declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
    204 
    205 declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
    206 declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
    207 declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
    208