; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Tests instruction selection of ARM NEON shift-right-and-accumulate:
;  - VSRA  (signed/unsigned):  ashr/lshr by the element bit width, then add.
;  - VRSRA (signed/unsigned):  llvm.arm.neon.vrshift[su] with a negative
;    (right) shift amount, then add.
; Both D-register (64-bit) and Q-register (128-bit) vector widths are covered.

define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsras8:
;CHECK: vsra.s8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
  %tmp4 = add <8 x i8> %tmp1, %tmp3
  ret <8 x i8> %tmp4
}

define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsras16:
;CHECK: vsra.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
  %tmp4 = add <4 x i16> %tmp1, %tmp3
  ret <4 x i16> %tmp4
}

define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsras32:
;CHECK: vsra.s32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
  %tmp4 = add <2 x i32> %tmp1, %tmp3
  ret <2 x i32> %tmp4
}

define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vsras64:
;CHECK: vsra.s64
  %tmp1 = load <1 x i64>* %A
  %tmp2 = load <1 x i64>* %B
  %tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
  %tmp4 = add <1 x i64> %tmp1, %tmp3
  ret <1 x i64> %tmp4
}

define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vsraQs8:
;CHECK: vsra.s8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
  %tmp4 = add <16 x i8> %tmp1, %tmp3
  ret <16 x i8> %tmp4
}

define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsraQs16:
;CHECK: vsra.s16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsraQs32:
;CHECK: vsra.s32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsraQs64:
;CHECK: vsra.s64
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i64>* %B
  %tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsrau8:
;CHECK: vsra.u8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
  %tmp4 = add <8 x i8> %tmp1, %tmp3
  ret <8 x i8> %tmp4
}

define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsrau16:
;CHECK: vsra.u16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
  %tmp4 = add <4 x i16> %tmp1, %tmp3
  ret <4 x i16> %tmp4
}

define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsrau32:
;CHECK: vsra.u32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
  %tmp4 = add <2 x i32> %tmp1, %tmp3
  ret <2 x i32> %tmp4
}

define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vsrau64:
;CHECK: vsra.u64
  %tmp1 = load <1 x i64>* %A
  %tmp2 = load <1 x i64>* %B
  %tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
  %tmp4 = add <1 x i64> %tmp1, %tmp3
  ret <1 x i64> %tmp4
}

define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vsraQu8:
;CHECK: vsra.u8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
  %tmp4 = add <16 x i8> %tmp1, %tmp3
  ret <16 x i8> %tmp4
}

define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsraQu16:
;CHECK: vsra.u16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsraQu32:
;CHECK: vsra.u32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsraQu64:
;CHECK: vsra.u64
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i64>* %B
  %tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrsras8:
;CHECK: vrsra.s8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
  %tmp4 = add <8 x i8> %tmp1, %tmp3
  ret <8 x i8> %tmp4
}

define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrsras16:
;CHECK: vrsra.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
  %tmp4 = add <4 x i16> %tmp1, %tmp3
  ret <4 x i16> %tmp4
}

define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrsras32:
;CHECK: vrsra.s32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
  %tmp4 = add <2 x i32> %tmp1, %tmp3
  ret <2 x i32> %tmp4
}

define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrsras64:
;CHECK: vrsra.s64
  %tmp1 = load <1 x i64>* %A
  %tmp2 = load <1 x i64>* %B
  %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
  %tmp4 = add <1 x i64> %tmp1, %tmp3
  ret <1 x i64> %tmp4
}

define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrsrau8:
;CHECK: vrsra.u8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
  %tmp4 = add <8 x i8> %tmp1, %tmp3
  ret <8 x i8> %tmp4
}

define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrsrau16:
;CHECK: vrsra.u16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
  %tmp4 = add <4 x i16> %tmp1, %tmp3
  ret <4 x i16> %tmp4
}

define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrsrau32:
;CHECK: vrsra.u32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
  %tmp4 = add <2 x i32> %tmp1, %tmp3
  ret <2 x i32> %tmp4
}

define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrsrau64:
;CHECK: vrsra.u64
  %tmp1 = load <1 x i64>* %A
  %tmp2 = load <1 x i64>* %B
  %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
  %tmp4 = add <1 x i64> %tmp1, %tmp3
  ret <1 x i64> %tmp4
}

define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrsraQs8:
;CHECK: vrsra.s8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
  %tmp4 = add <16 x i8> %tmp1, %tmp3
  ret <16 x i8> %tmp4
}

define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrsraQs16:
;CHECK: vrsra.s16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrsraQs32:
;CHECK: vrsra.s32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrsraQs64:
;CHECK: vrsra.s64
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i64>* %B
  %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrsraQu8:
;CHECK: vrsra.u8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
  %tmp4 = add <16 x i8> %tmp1, %tmp3
  ret <16 x i8> %tmp4
}

define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrsraQu16:
;CHECK: vrsra.u16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrsraQu32:
;CHECK: vrsra.u32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrsraQu64:
;CHECK: vrsra.u64
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i64>* %B
  %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

declare <8 x i8>  @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

declare <8 x i8>  @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone