; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Implement ctpop with vcnt

define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vcnt8:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i8>* %A
  %tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp2
}

define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vcntQ8:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <16 x i8>* %A
  %tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp2
}

; ctpop on wider elements is expanded from vcnt.8: per-byte counts are
; combined with vrev/vadd, then narrowed with vuzp and widened with vmovl.
define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vcnt16:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i16>* %A
  %tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp2
}

define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vcntQ16:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i16>* %A
  %tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp2
}

define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vcnt32:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <2 x i32>* %A
  %tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp2
}

define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vcntQ32:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i32>* %A
  %tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp2
}

declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
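; Implement ctlz with vclz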
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclz8:
;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i8>* %A
  %tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
  ret <8 x i8> %tmp2
}

define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclz16:
;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i16>* %A
  %tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
  ret <4 x i16> %tmp2
}

define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclz32:
;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <2 x i32>* %A
  %tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
  ret <2 x i32> %tmp2
}

define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclzQ8:
;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <16 x i8>* %A
  %tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
  ret <16 x i8> %tmp2
}

define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclzQ16:
;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <8 x i16>* %A
  %tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
  ret <8 x i16> %tmp2
}

define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclzQ32:
;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <4 x i32>* %A
  %tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
  ret <4 x i32> %tmp2
}

declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone

declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

; Count leading sign bits with vcls
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclss8:
;CHECK: vcls.s8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp2
}

define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclss16:
;CHECK: vcls.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp2
}

define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclss32:
;CHECK: vcls.s32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp2
}

define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclsQs8:
;CHECK: vcls.s8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp2
}

define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclsQs16:
;CHECK: vcls.s16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp2
}

define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclsQs32:
;CHECK: vcls.s32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp2
}

declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone