Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
      2 
      3 define i32 @test_rev_w(i32 %a) nounwind {
        ; Byte-swaps a 32-bit scalar with llvm.bswap.i32 and returns it. The
        ; expected output is a single `rev` on the 32-bit (w) register.
      4 entry:
      5 ; CHECK-LABEL: test_rev_w:
      6 ; CHECK: rev w0, w0
      7   %0 = tail call i32 @llvm.bswap.i32(i32 %a)
      8   ret i32 %0
      9 }
     10 
     11 define i64 @test_rev_x(i64 %a) nounwind {
        ; 64-bit counterpart of test_rev_w: llvm.bswap.i64 on the argument, with a
        ; single `rev` on the 64-bit (x) register expected.
     12 entry:
     13 ; CHECK-LABEL: test_rev_x:
     14 ; CHECK: rev x0, x0
     15   %0 = tail call i64 @llvm.bswap.i64(i64 %a)
     16   ret i64 %0
     17 }
     18 
     19 ; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16 bits
     20 ; of x are zero. This optimizes rev + lsr 16 to rev16.
        ; Here the bswap operand is a zext from i16, so its high 16 bits are known
        ; to be zero. The expected output masks the argument to 16 bits, applies
        ; rev16 to the masked value, and contains no lsr afterwards.
     21 define i32 @test_rev_w_srl16(i16 %a) {
     22 entry:
     23 ; CHECK-LABEL: test_rev_w_srl16:
     24 ; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
     25 ; CHECK: rev16 w0, [[REG]]
     26 ; CHECK-NOT: lsr
     27   %0 = zext i16 %a to i32
     28   %1 = tail call i32 @llvm.bswap.i32(i32 %0)
     29   %2 = lshr i32 %1, 16
     30   ret i32 %2
     31 }
     32 
     33 ; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32 bits
     34 ; of x are zero. This optimizes rev + lsr 32 to rev32.
        ; Here the bswap operand is a zext from i32, so its high 32 bits are known
        ; to be zero. The source register of the rev32 is left unconstrained
        ; (any x register); only the destination x0 is pinned.
     35 define i64 @test_rev_x_srl32(i32 %a) {
     36 entry:
     37 ; CHECK-LABEL: test_rev_x_srl32:
     38 ; CHECK: rev32 x0, {{x[0-9]+}}
     39 ; CHECK-NOT: lsr
     40   %0 = zext i32 %a to i64
     41   %1 = tail call i64 @llvm.bswap.i64(i64 %0)
     42   %2 = lshr i64 %1, 32
     43   ret i64 %2
     44 }
     45 
     46 declare i32 @llvm.bswap.i32(i32) nounwind readnone
     47 declare i64 @llvm.bswap.i64(i64) nounwind readnone
     48 
     49 define i32 @test_rev16_w(i32 %X) nounwind {
        ; Open-coded byte swap within each 16-bit half of %X, built from shifts by
        ; 8, four byte masks and ORs (no bswap intrinsic). Expected to be
        ; recognised and emitted as a single `rev16` on the w register.
     50 entry:
     51 ; CHECK-LABEL: test_rev16_w:
     52 ; CHECK: rev16 w0, w0
     53   %tmp1 = lshr i32 %X, 8
     54   %X15 = bitcast i32 %X to i32
     55   %tmp4 = shl i32 %X15, 8
     56   %tmp2 = and i32 %tmp1, 16711680      ; 0x00FF0000: byte 3 moved down to byte 2
     57   %tmp5 = and i32 %tmp4, -16777216     ; 0xFF000000: byte 2 moved up to byte 3
     58   %tmp9 = and i32 %tmp1, 255           ; 0x000000FF: byte 1 moved down to byte 0
     59   %tmp13 = and i32 %tmp4, 65280        ; 0x0000FF00: byte 0 moved up to byte 1
     60   %tmp6 = or i32 %tmp5, %tmp2
     61   %tmp10 = or i32 %tmp6, %tmp13
     62   %tmp14 = or i32 %tmp10, %tmp9
     63   ret i32 %tmp14
     64 }
     65 
     66 ; 64-bit REV16 is *not* a swap then a 16-bit rotation:
     67 ;   01234567 ->(bswap) 76543210 ->(rotr) 10765432
     68 ;   01234567 ->(rev16) 10325476
        ; Negative test: the IR below is bswap followed by a rotate right by 16
        ; (lshr 16 OR'ed with shl 48), which must not be emitted as `rev16 x0, x0`.
     69 define i64 @test_rev16_x(i64 %a) nounwind {
     70 entry:
     71 ; CHECK-LABEL: test_rev16_x:
     72 ; CHECK-NOT: rev16 x0, x0
     73   %0 = tail call i64 @llvm.bswap.i64(i64 %a)
     74   %1 = lshr i64 %0, 16
     75   %2 = shl i64 %0, 48
     76   %3 = or i64 %1, %2
     77   ret i64 %3
     78 }
     79 
     80 define i64 @test_rev32_x(i64 %a) nounwind {
        ; bswap.i64 followed by a rotate by 32 (lshr 32 OR'ed with shl 32), i.e. a
        ; byte swap inside each 32-bit half. Expected to be emitted as a single
        ; `rev32` on the x register.
     81 entry:
     82 ; CHECK-LABEL: test_rev32_x:
     83 ; CHECK: rev32 x0, x0
     84   %0 = tail call i64 @llvm.bswap.i64(i64 %a)
     85   %1 = lshr i64 %0, 32
     86   %2 = shl i64 %0, 32
     87   %3 = or i64 %1, %2
     88   ret i64 %3
     89 }
     90 
     91 define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
        ; Loads an <8 x i8> and reverses all eight lanes (mask 7..0), i.e. the
        ; bytes of one 64-bit group. Expected instruction: rev64.8b.
     92 ;CHECK-LABEL: test_vrev64D8:
     93 ;CHECK: rev64.8b
     94 	%tmp1 = load <8 x i8>, <8 x i8>* %A
     95 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
     96 	ret <8 x i8> %tmp2
     97 }
     98 
     99 define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
        ; Loads a <4 x i16> and reverses all four lanes (mask 3..0), i.e. the
        ; halfwords of one 64-bit group. Expected instruction: rev64.4h.
    100 ;CHECK-LABEL: test_vrev64D16:
    101 ;CHECK: rev64.4h
    102 	%tmp1 = load <4 x i16>, <4 x i16>* %A
    103 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    104 	ret <4 x i16> %tmp2
    105 }
    106 
    107 define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
        ; Loads a <2 x i32> and swaps its two lanes (mask 1, 0), i.e. the words
        ; of one 64-bit group. Expected instruction: rev64.2s.
    108 ;CHECK-LABEL: test_vrev64D32:
    109 ;CHECK: rev64.2s
    110 	%tmp1 = load <2 x i32>, <2 x i32>* %A
    111 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
    112 	ret <2 x i32> %tmp2
    113 }
    114 
    115 define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
        ; Floating-point variant of test_vrev64D32: the same lane swap (mask 1, 0)
        ; on a <2 x float>. Expected instruction: rev64.2s.
    116 ;CHECK-LABEL: test_vrev64Df:
    117 ;CHECK: rev64.2s
    118 	%tmp1 = load <2 x float>, <2 x float>* %A
    119 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
    120 	ret <2 x float> %tmp2
    121 }
    122 
    123 define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
        ; Loads a <16 x i8> and reverses the lanes inside each group of eight
        ; (mask 7..0, 15..8), i.e. per 64-bit group. Expected instruction: rev64.16b.
    124 ;CHECK-LABEL: test_vrev64Q8:
    125 ;CHECK: rev64.16b
    126 	%tmp1 = load <16 x i8>, <16 x i8>* %A
    127 	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
    128 	ret <16 x i8> %tmp2
    129 }
    130 
    131 define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
        ; Loads an <8 x i16> and reverses the lanes inside each group of four
        ; (mask 3..0, 7..4), i.e. per 64-bit group. Expected instruction: rev64.8h.
    132 ;CHECK-LABEL: test_vrev64Q16:
    133 ;CHECK: rev64.8h
    134 	%tmp1 = load <8 x i16>, <8 x i16>* %A
    135 	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
    136 	ret <8 x i16> %tmp2
    137 }
    138 
    139 define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
        ; Loads a <4 x i32> and swaps the lanes inside each pair (mask 1, 0, 3, 2),
        ; i.e. per 64-bit group. Expected instruction: rev64.4s.
    140 ;CHECK-LABEL: test_vrev64Q32:
    141 ;CHECK: rev64.4s
    142 	%tmp1 = load <4 x i32>, <4 x i32>* %A
    143 	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
    144 	ret <4 x i32> %tmp2
    145 }
    146 
    147 define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
        ; Floating-point variant of test_vrev64Q32: the same pairwise lane swap
        ; (mask 1, 0, 3, 2) on a <4 x float>. Expected instruction: rev64.4s.
    148 ;CHECK-LABEL: test_vrev64Qf:
    149 ;CHECK: rev64.4s
    150 	%tmp1 = load <4 x float>, <4 x float>* %A
    151 	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
    152 	ret <4 x float> %tmp2
    153 }
    154 
    155 define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
        ; Loads an <8 x i8> and reverses the lanes inside each group of four
        ; (mask 3..0, 7..4), i.e. per 32-bit group. Expected instruction: rev32.8b.
    156 ;CHECK-LABEL: test_vrev32D8:
    157 ;CHECK: rev32.8b
    158 	%tmp1 = load <8 x i8>, <8 x i8>* %A
    159 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
    160 	ret <8 x i8> %tmp2
    161 }
    162 
    163 define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
        ; Loads a <4 x i16> and swaps the lanes inside each pair (mask 1, 0, 3, 2),
        ; i.e. per 32-bit group. Expected instruction: rev32.4h.
    164 ;CHECK-LABEL: test_vrev32D16:
    165 ;CHECK: rev32.4h
    166 	%tmp1 = load <4 x i16>, <4 x i16>* %A
    167 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
    168 	ret <4 x i16> %tmp2
    169 }
    170 
    171 define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
        ; Loads a <16 x i8> and reverses the lanes inside each group of four
        ; (mask 3..0, 7..4, 11..8, 15..12), i.e. per 32-bit group.
        ; Expected instruction: rev32.16b.
    172 ;CHECK-LABEL: test_vrev32Q8:
    173 ;CHECK: rev32.16b
    174 	%tmp1 = load <16 x i8>, <16 x i8>* %A
    175 	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
    176 	ret <16 x i8> %tmp2
    177 }
    178 
    179 define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
        ; Loads an <8 x i16> and swaps the lanes inside each pair
        ; (mask 1, 0, 3, 2, 5, 4, 7, 6), i.e. per 32-bit group.
        ; Expected instruction: rev32.8h.
    180 ;CHECK-LABEL: test_vrev32Q16:
    181 ;CHECK: rev32.8h
    182 	%tmp1 = load <8 x i16>, <8 x i16>* %A
    183 	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
    184 	ret <8 x i16> %tmp2
    185 }
    186 
    187 define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
        ; Loads an <8 x i8> and swaps the lanes inside each pair
        ; (mask 1, 0, 3, 2, 5, 4, 7, 6), i.e. per 16-bit group.
        ; Expected instruction: rev16.8b.
    188 ;CHECK-LABEL: test_vrev16D8:
    189 ;CHECK: rev16.8b
    190 	%tmp1 = load <8 x i8>, <8 x i8>* %A
    191 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
    192 	ret <8 x i8> %tmp2
    193 }
    194 
    195 define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
        ; Loads a <16 x i8> and swaps the lanes inside each pair
        ; (mask 1, 0, 3, 2, ... 15, 14), i.e. per 16-bit group.
        ; Expected instruction: rev16.16b.
    196 ;CHECK-LABEL: test_vrev16Q8:
    197 ;CHECK: rev16.16b
    198 	%tmp1 = load <16 x i8>, <16 x i8>* %A
    199 	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
    200 	ret <16 x i8> %tmp2
    201 }
    202 
    203 ; Undef shuffle indices should not prevent matching to VREV:
    204 
    205 define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
        ; Same full reversal as test_vrev64D8, but mask lanes 1 and 2 are undef.
        ; The defined lanes still agree with the 7..0 reversal, so rev64.8b is
        ; expected regardless.
    206 ;CHECK-LABEL: test_vrev64D8_undef:
    207 ;CHECK: rev64.8b
    208 	%tmp1 = load <8 x i8>, <8 x i8>* %A
    209 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
    210 	ret <8 x i8> %tmp2
    211 }
    212 
    213 define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
        ; Same pairwise swap as test_vrev32Q16, but mask lanes 0, 2 and 7 are
        ; undef. The defined lanes still agree with 1, 0, 3, 2, 5, 4, 7, 6, so
        ; rev32.8h is expected regardless.
    214 ;CHECK-LABEL: test_vrev32Q16_undef:
    215 ;CHECK: rev32.8h
    216 	%tmp1 = load <8 x i16>, <8 x i16>* %A
    217 	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
    218 	ret <8 x i16> %tmp2
    219 }
    220 
    221 ; vrev <4 x i16> should use REV32 and not REV64
        ; The IR reinterprets %source as a pointer to <8 x i16>, loads it, and
        ; stores lanes 6 and 5 (in that order) as a <2 x i16> to %dst.
        ; NOTE(review): the patterns below only match a q-register load followed by
        ; two st1.h stores; none of them mentions rev32 or rev64, so the statement
        ; above is not directly enforced here - confirm whether that is intended.
    222 define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
    223 ; CHECK-LABEL: test_vrev64:
    224 ; CHECK: ldr [[DEST:q[0-9]+]],
    225 ; CHECK: st1.h
    226 ; CHECK: st1.h
    227 entry:
    228   %0 = bitcast <4 x i16>* %source to <8 x i16>*
    229   %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
    230   %tmp3 = extractelement <8 x i16> %tmp2, i32 6
    231   %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
    232   %tmp9 = extractelement <8 x i16> %tmp2, i32 5
    233   %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
    234   store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
    235   ret void
    236 }
    237 
    238 ; Test vrev of float4
        ; Loads a <4 x float> from %source and shuffles it with a constant vector
        ; whose lane 0 is 0.0: result lanes 0, 2 and 3 take that 0.0 (mask index 0)
        ; and lane 1 takes element 3 of the loaded vector (mask index 7). The result
        ; is stored 11 vectors past %dest. Expected output: a q-register load
        ; followed by rev64.4s.
        ; The function name is anchored with a label directive (with trailing
        ; colon), as in every other test in this file, so the patterns below can
        ; only match inside this function's output.
    239 define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
    240 ; CHECK-LABEL: float_vrev64:
    241 ; CHECK: ldr [[DEST:q[0-9]+]],
    242 ; CHECK: rev64.4s
    243 entry:
    244   %0 = bitcast float* %source to <4 x float>*
    245   %tmp2 = load <4 x float>, <4 x float>* %0, align 4
    246   %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
    247   %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
    248   store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
    249   ret void
    250 }
    251 
    252 
    253 define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
        ; Vector byte swap: llvm.bswap.v4i32 on a <4 x i32> argument. Expected
        ; output is exactly one rev32.16b, with nothing else matching "rev"
        ; between it and the ret.
    254 ; CHECK-LABEL: test_vrev32_bswap:
    255 ; CHECK: rev32.16b
    256 ; CHECK-NOT: rev
    257 ; CHECK: ret
    258   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
    259   ret <4 x i32> %bswap
    260 }
    261 
    262 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
    263