; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; Codegen test for NEON bit-counting operations on ARM:
;   * llvm.ctpop          -> sequences built on vcnt.8
;   * llvm.ctlz           -> vclz.iN
;   * llvm.arm.neon.vcls  -> vcls.sN
; Each function loads one vector from %A, applies the intrinsic and returns
; the result; the directives inside the function pin the expected assembly.

;-----------------------------------------------------------------------------
; Population count (ctpop). The i8 cases expect a single vcnt.8.
;-----------------------------------------------------------------------------

define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vcnt8:
; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
  %vec = load <8 x i8>, <8 x i8>* %A
  %cnt = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %vec)
  ret <8 x i8> %cnt
}

define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vcntQ8:
; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
  %vec = load <16 x i8>, <16 x i8>* %A
  %cnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %vec)
  ret <16 x i8> %cnt
}

; The i16 cases expect the byte counts to be combined with
; vrev16.8 / vadd.i8 / vuzp.8 and then widened with vmovl.u8.

define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vcnt16:
; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
  %vec = load <4 x i16>, <4 x i16>* %A
  %cnt = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %vec)
  ret <4 x i16> %cnt
}

define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vcntQ16:
; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
  %vec = load <8 x i16>, <8 x i16>* %A
  %cnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %vec)
  ret <8 x i16> %cnt
}

; The i32 cases expect the i16 sequence followed by a second combining step
; (vrev32.16 / vuzp.16) and a widening vmovl.u16.

define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vcnt32:
; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
  %vec = load <2 x i32>, <2 x i32>* %A
  %cnt = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %vec)
  ret <2 x i32> %cnt
}

define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vcntQ32:
; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
  %vec = load <4 x i32>, <4 x i32>* %A
  %cnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %vec)
  ret <4 x i32> %cnt
}

; For the i64 cases only the function label is matched; no particular
; instruction sequence is pinned.

define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
; CHECK-LABEL: vcnt64:
  %vec = load <1 x i64>, <1 x i64>* %A
  %cnt = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %vec)
  ret <1 x i64> %cnt
}

define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind {
; CHECK-LABEL: vcntQ64:
  %vec = load <2 x i64>, <2 x i64>* %A
  %cnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %vec)
  ret <2 x i64> %cnt
}

declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone

;-----------------------------------------------------------------------------
; Count leading zeros (ctlz). Every call passes false as the intrinsic's
; second (i1) operand; each case expects a single vclz of matching width.
;-----------------------------------------------------------------------------

define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vclz8:
; CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
  %vec = load <8 x i8>, <8 x i8>* %A
  %lz = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %vec, i1 false)
  ret <8 x i8> %lz
}

define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vclz16:
; CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
  %vec = load <4 x i16>, <4 x i16>* %A
  %lz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %vec, i1 false)
  ret <4 x i16> %lz
}

define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vclz32:
; CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
  %vec = load <2 x i32>, <2 x i32>* %A
  %lz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %vec, i1 false)
  ret <2 x i32> %lz
}

define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vclzQ8:
; CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
  %vec = load <16 x i8>, <16 x i8>* %A
  %lz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %vec, i1 false)
  ret <16 x i8> %lz
}

define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vclzQ16:
; CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
  %vec = load <8 x i16>, <8 x i16>* %A
  %lz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %vec, i1 false)
  ret <8 x i16> %lz
}

define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vclzQ32:
; CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
  %vec = load <4 x i32>, <4 x i32>* %A
  %lz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %vec, i1 false)
  ret <4 x i32> %lz
}

declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone

declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

;-----------------------------------------------------------------------------
; llvm.arm.neon.vcls intrinsic. Only the mnemonic and element size are
; matched; register operands are not constrained.
;-----------------------------------------------------------------------------

define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vclss8:
; CHECK: vcls.s8
  %vec = load <8 x i8>, <8 x i8>* %A
  %ls = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %vec)
  ret <8 x i8> %ls
}

define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vclss16:
; CHECK: vcls.s16
  %vec = load <4 x i16>, <4 x i16>* %A
  %ls = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %vec)
  ret <4 x i16> %ls
}

define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vclss32:
; CHECK: vcls.s32
  %vec = load <2 x i32>, <2 x i32>* %A
  %ls = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %vec)
  ret <2 x i32> %ls
}

define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vclsQs8:
; CHECK: vcls.s8
  %vec = load <16 x i8>, <16 x i8>* %A
  %ls = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %vec)
  ret <16 x i8> %ls
}

define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vclsQs16:
; CHECK: vcls.s16
  %vec = load <8 x i16>, <8 x i16>* %A
  %ls = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %vec)
  ret <8 x i16> %ls
}

define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vclsQs32:
; CHECK: vcls.s32
  %vec = load <4 x i32>, <4 x i32>* %A
  %ls = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %vec)
  ret <4 x i32> %ls
}

declare <8 x i8>  @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone