Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
      2 
      3 define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; signed shift by the amount vector loaded from %B
      4 ;CHECK: vshls8:
      5 ;CHECK: vshl.s8
      6 	%val = load <8 x i8>* %A ; operand to shift
      7 	%amt = load <8 x i8>* %B ; shift-amount vector
      8 	%res = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %val, <8 x i8> %amt)
      9 	ret <8 x i8> %res
     10 }
     11 
     12 define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; signed shift by the amount vector loaded from %B
     13 ;CHECK: vshls16:
     14 ;CHECK: vshl.s16
     15 	%val = load <4 x i16>* %A ; operand to shift
     16 	%amt = load <4 x i16>* %B ; shift-amount vector
     17 	%res = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %val, <4 x i16> %amt)
     18 	ret <4 x i16> %res
     19 }
     20 
     21 define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; signed shift by the amount vector loaded from %B
     22 ;CHECK: vshls32:
     23 ;CHECK: vshl.s32
     24 	%val = load <2 x i32>* %A ; operand to shift
     25 	%amt = load <2 x i32>* %B ; shift-amount vector
     26 	%res = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %val, <2 x i32> %amt)
     27 	ret <2 x i32> %res
     28 }
     29 
     30 define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ; signed shift by the amount vector loaded from %B
     31 ;CHECK: vshls64:
     32 ;CHECK: vshl.s64
     33 	%val = load <1 x i64>* %A ; operand to shift
     34 	%amt = load <1 x i64>* %B ; shift-amount vector
     35 	%res = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %val, <1 x i64> %amt)
     36 	ret <1 x i64> %res
     37 }
     38 
     39 define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; unsigned shift by the amount vector loaded from %B
     40 ;CHECK: vshlu8:
     41 ;CHECK: vshl.u8
     42 	%val = load <8 x i8>* %A ; operand to shift
     43 	%amt = load <8 x i8>* %B ; shift-amount vector
     44 	%res = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %val, <8 x i8> %amt)
     45 	ret <8 x i8> %res
     46 }
     47 
     48 define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; unsigned shift by the amount vector loaded from %B
     49 ;CHECK: vshlu16:
     50 ;CHECK: vshl.u16
     51 	%val = load <4 x i16>* %A ; operand to shift
     52 	%amt = load <4 x i16>* %B ; shift-amount vector
     53 	%res = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %val, <4 x i16> %amt)
     54 	ret <4 x i16> %res
     55 }
     56 
     57 define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; unsigned shift by the amount vector loaded from %B
     58 ;CHECK: vshlu32:
     59 ;CHECK: vshl.u32
     60 	%val = load <2 x i32>* %A ; operand to shift
     61 	%amt = load <2 x i32>* %B ; shift-amount vector
     62 	%res = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %val, <2 x i32> %amt)
     63 	ret <2 x i32> %res
     64 }
     65 
     66 define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ; unsigned shift by the amount vector loaded from %B
     67 ;CHECK: vshlu64:
     68 ;CHECK: vshl.u64
     69 	%val = load <1 x i64>* %A ; operand to shift
     70 	%amt = load <1 x i64>* %B ; shift-amount vector
     71 	%res = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %val, <1 x i64> %amt)
     72 	ret <1 x i64> %res
     73 }
     74 
     75 define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ; signed shift of a 128-bit vector by the amount vector loaded from %B
     76 ;CHECK: vshlQs8:
     77 ;CHECK: vshl.s8
     78 	%val = load <16 x i8>* %A ; operand to shift
     79 	%amt = load <16 x i8>* %B ; shift-amount vector
     80 	%res = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %val, <16 x i8> %amt)
     81 	ret <16 x i8> %res
     82 }
     83 
     84 define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; signed shift of a 128-bit vector by the amount vector loaded from %B
     85 ;CHECK: vshlQs16:
     86 ;CHECK: vshl.s16
     87 	%val = load <8 x i16>* %A ; operand to shift
     88 	%amt = load <8 x i16>* %B ; shift-amount vector
     89 	%res = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %val, <8 x i16> %amt)
     90 	ret <8 x i16> %res
     91 }
     92 
     93 define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; signed shift of a 128-bit vector by the amount vector loaded from %B
     94 ;CHECK: vshlQs32:
     95 ;CHECK: vshl.s32
     96 	%val = load <4 x i32>* %A ; operand to shift
     97 	%amt = load <4 x i32>* %B ; shift-amount vector
     98 	%res = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %val, <4 x i32> %amt)
     99 	ret <4 x i32> %res
    100 }
    101 
    102 define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; signed shift of a 128-bit vector by the amount vector loaded from %B
    103 ;CHECK: vshlQs64:
    104 ;CHECK: vshl.s64
    105 	%val = load <2 x i64>* %A ; operand to shift
    106 	%amt = load <2 x i64>* %B ; shift-amount vector
    107 	%res = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %val, <2 x i64> %amt)
    108 	ret <2 x i64> %res
    109 }
    110 
    111 define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ; unsigned shift of a 128-bit vector by the amount vector loaded from %B
    112 ;CHECK: vshlQu8:
    113 ;CHECK: vshl.u8
    114 	%val = load <16 x i8>* %A ; operand to shift
    115 	%amt = load <16 x i8>* %B ; shift-amount vector
    116 	%res = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %val, <16 x i8> %amt)
    117 	ret <16 x i8> %res
    118 }
    119 
    120 define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; unsigned shift of a 128-bit vector by the amount vector loaded from %B
    121 ;CHECK: vshlQu16:
    122 ;CHECK: vshl.u16
    123 	%val = load <8 x i16>* %A ; operand to shift
    124 	%amt = load <8 x i16>* %B ; shift-amount vector
    125 	%res = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %val, <8 x i16> %amt)
    126 	ret <8 x i16> %res
    127 }
    128 
    129 define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; unsigned shift of a 128-bit vector by the amount vector loaded from %B
    130 ;CHECK: vshlQu32:
    131 ;CHECK: vshl.u32
    132 	%val = load <4 x i32>* %A ; operand to shift
    133 	%amt = load <4 x i32>* %B ; shift-amount vector
    134 	%res = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %val, <4 x i32> %amt)
    135 	ret <4 x i32> %res
    136 }
    137 
    138 define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; unsigned shift of a 128-bit vector by the amount vector loaded from %B
    139 ;CHECK: vshlQu64:
    140 ;CHECK: vshl.u64
    141 	%val = load <2 x i64>* %A ; operand to shift
    142 	%amt = load <2 x i64>* %B ; shift-amount vector
    143 	%res = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %val, <2 x i64> %amt)
    144 	ret <2 x i64> %res
    145 }
    146 
    147 ; For left shifts by immediates, the signedness is irrelevant.
    148 ; Test a mix of both signed and unsigned intrinsics.
    149 
    150 define <8 x i8> @vshli8(<8 x i8>* %A) nounwind { ; left shift by constant 7 through the signed intrinsic
    151 ;CHECK: vshli8:
    152 ;CHECK: vshl.i8
    153 	%val = load <8 x i8>* %A ; operand to shift
    154 	%res = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %val, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
    155 	ret <8 x i8> %res
    156 }
    157 
    158 define <4 x i16> @vshli16(<4 x i16>* %A) nounwind { ; left shift by constant 15 through the unsigned intrinsic
    159 ;CHECK: vshli16:
    160 ;CHECK: vshl.i16
    161 	%val = load <4 x i16>* %A ; operand to shift
    162 	%res = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %val, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
    163 	ret <4 x i16> %res
    164 }
    165 
    166 define <2 x i32> @vshli32(<2 x i32>* %A) nounwind { ; left shift by constant 31 through the signed intrinsic
    167 ;CHECK: vshli32:
    168 ;CHECK: vshl.i32
    169 	%val = load <2 x i32>* %A ; operand to shift
    170 	%res = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %val, <2 x i32> < i32 31, i32 31 >)
    171 	ret <2 x i32> %res
    172 }
    173 
    174 define <1 x i64> @vshli64(<1 x i64>* %A) nounwind { ; left shift by constant 63 through the unsigned intrinsic
    175 ;CHECK: vshli64:
    176 ;CHECK: vshl.i64
    177 	%val = load <1 x i64>* %A ; operand to shift
    178 	%res = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %val, <1 x i64> < i64 63 >)
    179 	ret <1 x i64> %res
    180 }
    181 
    182 define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind { ; 128-bit left shift by constant 7 through the signed intrinsic
    183 ;CHECK: vshlQi8:
    184 ;CHECK: vshl.i8
    185 	%val = load <16 x i8>* %A ; operand to shift
    186 	%res = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %val, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
    187 	ret <16 x i8> %res
    188 }
    189 
    190 define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind { ; 128-bit left shift by constant 15 through the unsigned intrinsic
    191 ;CHECK: vshlQi16:
    192 ;CHECK: vshl.i16
    193 	%val = load <8 x i16>* %A ; operand to shift
    194 	%res = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %val, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
    195 	ret <8 x i16> %res
    196 }
    197 
    198 define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind { ; 128-bit left shift by constant 31 through the signed intrinsic
    199 ;CHECK: vshlQi32:
    200 ;CHECK: vshl.i32
    201 	%val = load <4 x i32>* %A ; operand to shift
    202 	%res = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %val, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
    203 	ret <4 x i32> %res
    204 }
    205 
    206 define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind { ; 128-bit left shift by constant 63 through the unsigned intrinsic
    207 ;CHECK: vshlQi64:
    208 ;CHECK: vshl.i64
    209 	%val = load <2 x i64>* %A ; operand to shift
    210 	%res = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %val, <2 x i64> < i64 63, i64 63 >)
    211 	ret <2 x i64> %res
    212 }
    213 
    214 ; Right shift by immediate (expressed as a negative constant shift amount):
    215 
    216 define <8 x i8> @vshrs8(<8 x i8>* %A) nounwind { ; signed shift with constant amount -8 (negative amount => right shift)
    217 ;CHECK: vshrs8:
    218 ;CHECK: vshr.s8
    219 	%val = load <8 x i8>* %A ; operand to shift
    220 	%res = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %val, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    221 	ret <8 x i8> %res
    222 }
    223 
    224 define <4 x i16> @vshrs16(<4 x i16>* %A) nounwind { ; signed shift with constant amount -16 (negative amount => right shift)
    225 ;CHECK: vshrs16:
    226 ;CHECK: vshr.s16
    227 	%val = load <4 x i16>* %A ; operand to shift
    228 	%res = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %val, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
    229 	ret <4 x i16> %res
    230 }
    231 
    232 define <2 x i32> @vshrs32(<2 x i32>* %A) nounwind { ; signed shift with constant amount -32 (negative amount => right shift)
    233 ;CHECK: vshrs32:
    234 ;CHECK: vshr.s32
    235 	%val = load <2 x i32>* %A ; operand to shift
    236 	%res = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %val, <2 x i32> < i32 -32, i32 -32 >)
    237 	ret <2 x i32> %res
    238 }
    239 
    240 define <1 x i64> @vshrs64(<1 x i64>* %A) nounwind { ; signed shift with constant amount -64 (negative amount => right shift)
    241 ;CHECK: vshrs64:
    242 ;CHECK: vshr.s64
    243 	%val = load <1 x i64>* %A ; operand to shift
    244 	%res = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %val, <1 x i64> < i64 -64 >)
    245 	ret <1 x i64> %res
    246 }
    247 
    248 define <8 x i8> @vshru8(<8 x i8>* %A) nounwind { ; unsigned shift with constant amount -8 (negative amount => right shift)
    249 ;CHECK: vshru8:
    250 ;CHECK: vshr.u8
    251 	%val = load <8 x i8>* %A ; operand to shift
    252 	%res = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %val, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    253 	ret <8 x i8> %res
    254 }
    255 
    256 define <4 x i16> @vshru16(<4 x i16>* %A) nounwind { ; unsigned shift with constant amount -16 (negative amount => right shift)
    257 ;CHECK: vshru16:
    258 ;CHECK: vshr.u16
    259 	%val = load <4 x i16>* %A ; operand to shift
    260 	%res = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %val, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
    261 	ret <4 x i16> %res
    262 }
    263 
    264 define <2 x i32> @vshru32(<2 x i32>* %A) nounwind { ; unsigned shift with constant amount -32 (negative amount => right shift)
    265 ;CHECK: vshru32:
    266 ;CHECK: vshr.u32
    267 	%val = load <2 x i32>* %A ; operand to shift
    268 	%res = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %val, <2 x i32> < i32 -32, i32 -32 >)
    269 	ret <2 x i32> %res
    270 }
    271 
    272 define <1 x i64> @vshru64(<1 x i64>* %A) nounwind { ; unsigned shift with constant amount -64 (negative amount => right shift)
    273 ;CHECK: vshru64:
    274 ;CHECK: vshr.u64
    275 	%val = load <1 x i64>* %A ; operand to shift
    276 	%res = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %val, <1 x i64> < i64 -64 >)
    277 	ret <1 x i64> %res
    278 }
    279 
    280 define <16 x i8> @vshrQs8(<16 x i8>* %A) nounwind { ; 128-bit signed shift with constant amount -8 (negative amount => right shift)
    281 ;CHECK: vshrQs8:
    282 ;CHECK: vshr.s8
    283 	%val = load <16 x i8>* %A ; operand to shift
    284 	%res = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %val, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    285 	ret <16 x i8> %res
    286 }
    287 
    288 define <8 x i16> @vshrQs16(<8 x i16>* %A) nounwind { ; 128-bit signed shift with constant amount -16 (negative amount => right shift)
    289 ;CHECK: vshrQs16:
    290 ;CHECK: vshr.s16
    291 	%val = load <8 x i16>* %A ; operand to shift
    292 	%res = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %val, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
    293 	ret <8 x i16> %res
    294 }
    295 
    296 define <4 x i32> @vshrQs32(<4 x i32>* %A) nounwind { ; 128-bit signed shift with constant amount -32 (negative amount => right shift)
    297 ;CHECK: vshrQs32:
    298 ;CHECK: vshr.s32
    299 	%val = load <4 x i32>* %A ; operand to shift
    300 	%res = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %val, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
    301 	ret <4 x i32> %res
    302 }
    303 
    304 define <2 x i64> @vshrQs64(<2 x i64>* %A) nounwind { ; 128-bit signed shift with constant amount -64 (negative amount => right shift)
    305 ;CHECK: vshrQs64:
    306 ;CHECK: vshr.s64
    307 	%val = load <2 x i64>* %A ; operand to shift
    308 	%res = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %val, <2 x i64> < i64 -64, i64 -64 >)
    309 	ret <2 x i64> %res
    310 }
    311 
    312 define <16 x i8> @vshrQu8(<16 x i8>* %A) nounwind { ; 128-bit unsigned shift with constant amount -8 (negative amount => right shift)
    313 ;CHECK: vshrQu8:
    314 ;CHECK: vshr.u8
    315 	%val = load <16 x i8>* %A ; operand to shift
    316 	%res = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %val, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    317 	ret <16 x i8> %res
    318 }
    319 
    320 define <8 x i16> @vshrQu16(<8 x i16>* %A) nounwind { ; 128-bit unsigned shift with constant amount -16 (negative amount => right shift)
    321 ;CHECK: vshrQu16:
    322 ;CHECK: vshr.u16
    323 	%val = load <8 x i16>* %A ; operand to shift
    324 	%res = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %val, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
    325 	ret <8 x i16> %res
    326 }
    327 
    328 define <4 x i32> @vshrQu32(<4 x i32>* %A) nounwind { ; 128-bit unsigned shift with constant amount -32 (negative amount => right shift)
    329 ;CHECK: vshrQu32:
    330 ;CHECK: vshr.u32
    331 	%val = load <4 x i32>* %A ; operand to shift
    332 	%res = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %val, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
    333 	ret <4 x i32> %res
    334 }
    335 
    336 define <2 x i64> @vshrQu64(<2 x i64>* %A) nounwind { ; 128-bit unsigned shift with constant amount -64 (negative amount => right shift)
    337 ;CHECK: vshrQu64:
    338 ;CHECK: vshr.u64
    339 	%val = load <2 x i64>* %A ; operand to shift
    340 	%res = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %val, <2 x i64> < i64 -64, i64 -64 >)
    341 	ret <2 x i64> %res
    342 }
    343 
    344 declare <8 x i8>  @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone ; signed shift intrinsics on 64-bit vector types; operands are (value, shift amount)
    345 declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    346 declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    347 declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
    348 
    349 declare <8 x i8>  @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone ; unsigned shift intrinsics on 64-bit vector types; operands are (value, shift amount)
    350 declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    351 declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    352 declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
    353 
    354 declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone ; signed shift intrinsics on 128-bit vector types; operands are (value, shift amount)
    355 declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    356 declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    357 declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
    358 
    359 declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone ; unsigned shift intrinsics on 128-bit vector types; operands are (value, shift amount)
    360 declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    361 declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    362 declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
    363 
    364 define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; signed vrshift by the amount vector loaded from %B
    365 ;CHECK: vrshls8:
    366 ;CHECK: vrshl.s8
    367 	%val = load <8 x i8>* %A ; operand to shift
    368 	%amt = load <8 x i8>* %B ; shift-amount vector
    369 	%res = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %val, <8 x i8> %amt)
    370 	ret <8 x i8> %res
    371 }
    372 
    373 define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; signed vrshift by the amount vector loaded from %B
    374 ;CHECK: vrshls16:
    375 ;CHECK: vrshl.s16
    376 	%val = load <4 x i16>* %A ; operand to shift
    377 	%amt = load <4 x i16>* %B ; shift-amount vector
    378 	%res = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %val, <4 x i16> %amt)
    379 	ret <4 x i16> %res
    380 }
    381 
    382 define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; signed vrshift by the amount vector loaded from %B
    383 ;CHECK: vrshls32:
    384 ;CHECK: vrshl.s32
    385 	%val = load <2 x i32>* %A ; operand to shift
    386 	%amt = load <2 x i32>* %B ; shift-amount vector
    387 	%res = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %val, <2 x i32> %amt)
    388 	ret <2 x i32> %res
    389 }
    390 
    391 define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ; signed vrshift by the amount vector loaded from %B
    392 ;CHECK: vrshls64:
    393 ;CHECK: vrshl.s64
    394 	%val = load <1 x i64>* %A ; operand to shift
    395 	%amt = load <1 x i64>* %B ; shift-amount vector
    396 	%res = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %val, <1 x i64> %amt)
    397 	ret <1 x i64> %res
    398 }
    399 
    400 define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; unsigned vrshift by the amount vector loaded from %B
    401 ;CHECK: vrshlu8:
    402 ;CHECK: vrshl.u8
    403 	%val = load <8 x i8>* %A ; operand to shift
    404 	%amt = load <8 x i8>* %B ; shift-amount vector
    405 	%res = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %val, <8 x i8> %amt)
    406 	ret <8 x i8> %res
    407 }
    408 
    409 define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; unsigned vrshift by the amount vector loaded from %B
    410 ;CHECK: vrshlu16:
    411 ;CHECK: vrshl.u16
    412 	%val = load <4 x i16>* %A ; operand to shift
    413 	%amt = load <4 x i16>* %B ; shift-amount vector
    414 	%res = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %val, <4 x i16> %amt)
    415 	ret <4 x i16> %res
    416 }
    417 
    418 define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; unsigned vrshift by the amount vector loaded from %B
    419 ;CHECK: vrshlu32:
    420 ;CHECK: vrshl.u32
    421 	%val = load <2 x i32>* %A ; operand to shift
    422 	%amt = load <2 x i32>* %B ; shift-amount vector
    423 	%res = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %val, <2 x i32> %amt)
    424 	ret <2 x i32> %res
    425 }
    426 
    427 define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ; unsigned vrshift by the amount vector loaded from %B
    428 ;CHECK: vrshlu64:
    429 ;CHECK: vrshl.u64
    430 	%val = load <1 x i64>* %A ; operand to shift
    431 	%amt = load <1 x i64>* %B ; shift-amount vector
    432 	%res = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %val, <1 x i64> %amt)
    433 	ret <1 x i64> %res
    434 }
    435 
    436 define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ; signed vrshift of a 128-bit vector by the amount vector loaded from %B
    437 ;CHECK: vrshlQs8:
    438 ;CHECK: vrshl.s8
    439 	%val = load <16 x i8>* %A ; operand to shift
    440 	%amt = load <16 x i8>* %B ; shift-amount vector
    441 	%res = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %val, <16 x i8> %amt)
    442 	ret <16 x i8> %res
    443 }
    444 
    445 define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; signed vrshift of a 128-bit vector by the amount vector loaded from %B
    446 ;CHECK: vrshlQs16:
    447 ;CHECK: vrshl.s16
    448 	%val = load <8 x i16>* %A ; operand to shift
    449 	%amt = load <8 x i16>* %B ; shift-amount vector
    450 	%res = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %val, <8 x i16> %amt)
    451 	ret <8 x i16> %res
    452 }
    453 
    454 define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; signed vrshift of a 128-bit vector by the amount vector loaded from %B
    455 ;CHECK: vrshlQs32:
    456 ;CHECK: vrshl.s32
    457 	%val = load <4 x i32>* %A ; operand to shift
    458 	%amt = load <4 x i32>* %B ; shift-amount vector
    459 	%res = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %val, <4 x i32> %amt)
    460 	ret <4 x i32> %res
    461 }
    462 
    463 define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; signed vrshift of a 128-bit vector by the amount vector loaded from %B
    464 ;CHECK: vrshlQs64:
    465 ;CHECK: vrshl.s64
    466 	%val = load <2 x i64>* %A ; operand to shift
    467 	%amt = load <2 x i64>* %B ; shift-amount vector
    468 	%res = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %val, <2 x i64> %amt)
    469 	ret <2 x i64> %res
    470 }
    471 
    472 define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ; unsigned vrshift of a 128-bit vector by the amount vector loaded from %B
    473 ;CHECK: vrshlQu8:
    474 ;CHECK: vrshl.u8
    475 	%val = load <16 x i8>* %A ; operand to shift
    476 	%amt = load <16 x i8>* %B ; shift-amount vector
    477 	%res = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %val, <16 x i8> %amt)
    478 	ret <16 x i8> %res
    479 }
    480 
    481 define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; unsigned vrshift of a 128-bit vector by the amount vector loaded from %B
    482 ;CHECK: vrshlQu16:
    483 ;CHECK: vrshl.u16
    484 	%val = load <8 x i16>* %A ; operand to shift
    485 	%amt = load <8 x i16>* %B ; shift-amount vector
    486 	%res = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %val, <8 x i16> %amt)
    487 	ret <8 x i16> %res
    488 }
    489 
    490 define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; unsigned vrshift of a 128-bit vector by the amount vector loaded from %B
    491 ;CHECK: vrshlQu32:
    492 ;CHECK: vrshl.u32
    493 	%val = load <4 x i32>* %A ; operand to shift
    494 	%amt = load <4 x i32>* %B ; shift-amount vector
    495 	%res = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %val, <4 x i32> %amt)
    496 	ret <4 x i32> %res
    497 }
    498 
    499 define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; unsigned vrshift of a 128-bit vector by the amount vector loaded from %B
    500 ;CHECK: vrshlQu64:
    501 ;CHECK: vrshl.u64
    502 	%val = load <2 x i64>* %A ; operand to shift
    503 	%amt = load <2 x i64>* %B ; shift-amount vector
    504 	%res = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %val, <2 x i64> %amt)
    505 	ret <2 x i64> %res
    506 }
    507 
    508 define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind { ; signed vrshift with constant amount -8 (negative amount => right-shift form)
    509 ;CHECK: vrshrs8:
    510 ;CHECK: vrshr.s8
    511 	%val = load <8 x i8>* %A ; operand to shift
    512 	%res = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %val, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    513 	ret <8 x i8> %res
    514 }
    515 
    516 define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind { ; signed vrshift with constant amount -16 (negative amount => right-shift form)
    517 ;CHECK: vrshrs16:
    518 ;CHECK: vrshr.s16
    519 	%val = load <4 x i16>* %A ; operand to shift
    520 	%res = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %val, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
    521 	ret <4 x i16> %res
    522 }
    523 
    524 define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind { ; signed vrshift with constant amount -32 (negative amount => right-shift form)
    525 ;CHECK: vrshrs32:
    526 ;CHECK: vrshr.s32
    527 	%val = load <2 x i32>* %A ; operand to shift
    528 	%res = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %val, <2 x i32> < i32 -32, i32 -32 >)
    529 	ret <2 x i32> %res
    530 }
    531 
    532 define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind { ; signed vrshift with constant amount -64 (negative amount => right-shift form)
    533 ;CHECK: vrshrs64:
    534 ;CHECK: vrshr.s64
    535 	%val = load <1 x i64>* %A ; operand to shift
    536 	%res = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %val, <1 x i64> < i64 -64 >)
    537 	ret <1 x i64> %res
    538 }
    539 
    540 define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind { ; unsigned vrshift with constant amount -8 (negative amount => right-shift form)
    541 ;CHECK: vrshru8:
    542 ;CHECK: vrshr.u8
    543 	%val = load <8 x i8>* %A ; operand to shift
    544 	%res = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %val, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    545 	ret <8 x i8> %res
    546 }
    547 
    548 define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind { ; unsigned vrshift with constant amount -16 (negative amount => right-shift form)
    549 ;CHECK: vrshru16:
    550 ;CHECK: vrshr.u16
    551 	%val = load <4 x i16>* %A ; operand to shift
    552 	%res = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %val, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
    553 	ret <4 x i16> %res
    554 }
    555 
    556 define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind { ; unsigned vrshift with constant amount -32 (negative amount => right-shift form)
    557 ;CHECK: vrshru32:
    558 ;CHECK: vrshr.u32
    559 	%val = load <2 x i32>* %A ; operand to shift
    560 	%res = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %val, <2 x i32> < i32 -32, i32 -32 >)
    561 	ret <2 x i32> %res
    562 }
    563 
    564 define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind { ; unsigned vrshift with constant amount -64 (negative amount => right-shift form)
    565 ;CHECK: vrshru64:
    566 ;CHECK: vrshr.u64
    567 	%val = load <1 x i64>* %A ; operand to shift
    568 	%res = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %val, <1 x i64> < i64 -64 >)
    569 	ret <1 x i64> %res
    570 }
    571 
    572 define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind { ; 128-bit signed vrshift with constant amount -8 (negative amount => right-shift form)
    573 ;CHECK: vrshrQs8:
    574 ;CHECK: vrshr.s8
    575 	%val = load <16 x i8>* %A ; operand to shift
    576 	%res = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %val, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    577 	ret <16 x i8> %res
    578 }
    579 
    580 define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind { ; 128-bit signed vrshift with constant amount -16 (negative amount => right-shift form)
    581 ;CHECK: vrshrQs16:
    582 ;CHECK: vrshr.s16
    583 	%val = load <8 x i16>* %A ; operand to shift
    584 	%res = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %val, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
    585 	ret <8 x i16> %res
    586 }
    587 
    588 define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind { ; 128-bit signed vrshift with constant amount -32 (negative amount => right-shift form)
    589 ;CHECK: vrshrQs32:
    590 ;CHECK: vrshr.s32
    591 	%val = load <4 x i32>* %A ; operand to shift
    592 	%res = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %val, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
    593 	ret <4 x i32> %res
    594 }
    595 
    596 define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind { ; 128-bit signed vrshift with constant amount -64 (negative amount => right-shift form)
    597 ;CHECK: vrshrQs64:
    598 ;CHECK: vrshr.s64
    599 	%val = load <2 x i64>* %A ; operand to shift
    600 	%res = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %val, <2 x i64> < i64 -64, i64 -64 >)
    601 	ret <2 x i64> %res
    602 }
    603 
    604 define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind { ; 128-bit unsigned vrshift with constant amount -8 (negative amount => right-shift form)
    605 ;CHECK: vrshrQu8:
    606 ;CHECK: vrshr.u8
    607 	%val = load <16 x i8>* %A ; operand to shift
    608 	%res = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %val, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
    609 	ret <16 x i8> %res
    610 }
    611 
    612 define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind { ; 128-bit unsigned vrshift with constant amount -16 (negative amount => right-shift form)
    613 ;CHECK: vrshrQu16:
    614 ;CHECK: vrshr.u16
    615 	%val = load <8 x i16>* %A ; operand to shift
    616 	%res = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %val, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
    617 	ret <8 x i16> %res
    618 }
    619 
    620 define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind { ; 128-bit unsigned vrshift with constant amount -32 (negative amount => right-shift form)
    621 ;CHECK: vrshrQu32:
    622 ;CHECK: vrshr.u32
    623 	%val = load <4 x i32>* %A ; operand to shift
    624 	%res = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %val, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
    625 	ret <4 x i32> %res
    626 }
    627 
    628 define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind { ; 128-bit unsigned vrshift with constant amount -64 (negative amount => right-shift form)
    629 ;CHECK: vrshrQu64:
    630 ;CHECK: vrshr.u64
    631 	%val = load <2 x i64>* %A ; operand to shift
    632 	%res = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %val, <2 x i64> < i64 -64, i64 -64 >)
    633 	ret <2 x i64> %res
    634 }
    635 
    636 declare <8 x i8>  @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone ; signed vrshift intrinsics on 64-bit vector types; operands are (value, shift amount)
    637 declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    638 declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    639 declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
    640 
    641 declare <8 x i8>  @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone ; unsigned vrshift intrinsics on 64-bit vector types; operands are (value, shift amount)
    642 declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    643 declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    644 declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
    645 
    646 declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone ; signed vrshift intrinsics on 128-bit vector types; operands are (value, shift amount)
    647 declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    648 declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    649 declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
    650 
    651 declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone ; unsigned vrshift intrinsics on 128-bit vector types; operands are (value, shift amount)
    652 declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    653 declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    654 declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
    655