; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

; A 32-bit byte swap is expected to select REV on a W register.
define i32 @test_rev_w(i32 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_w:
; CHECK: rev w0, w0
  %swapped = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %swapped
}

; A 64-bit byte swap is expected to select REV on an X register.
define i64 @test_rev_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_x:
; CHECK: rev x0, x0
  %swapped = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %swapped
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

; Open-coded swap of the two bytes inside each 16-bit half of a 32-bit value:
; shift right/left by 8, mask each byte into place, then OR the four pieces.
; Masks: 16711680 = 0x00FF0000, -16777216 = 0xFF000000, 255 = 0xFF,
; 65280 = 0xFF00.
define i32 @test_rev16_w(i32 %X) nounwind {
entry:
; CHECK-LABEL: test_rev16_w:
; CHECK: rev16 w0, w0
  %shr8 = lshr i32 %X, 8
  %x.same = bitcast i32 %X to i32
  %shl8 = shl i32 %x.same, 8
  %byte2 = and i32 %shr8, 16711680
  %byte3 = and i32 %shl8, -16777216
  %byte0 = and i32 %shr8, 255
  %byte1 = and i32 %shl8, 65280
  %hi.half = or i32 %byte3, %byte2
  %hi.b1 = or i32 %hi.half, %byte1
  %result = or i32 %hi.b1, %byte0
  ret i32 %result
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;   01234567 ->(bswap) 76543210 ->(rotr) 10765432
;   01234567 ->(rev16) 10325476
define i64 @test_rev16_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev16_x:
; CHECK-NOT: rev16 x0, x0
  %swapped = tail call i64 @llvm.bswap.i64(i64 %a)
  %low.part = lshr i64 %swapped, 16
  %high.part = shl i64 %swapped, 48
  %rotated = or i64 %low.part, %high.part
  ret i64 %rotated
}

; A 64-bit byte swap followed by exchanging the two 32-bit halves
; (shift right 32 OR shift left 32) is expected to select REV32.
define i64 @test_rev32_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev32_x:
; CHECK: rev32 x0, x0
  %swapped = tail call i64 @llvm.bswap.i64(i64 %a)
  %low.part = lshr i64 %swapped, 32
  %high.part = shl i64 %swapped, 32
  %rotated = or i64 %low.part, %high.part
  ret i64 %rotated
}

; Vector REV64 tests: each shuffle mask reverses the element order inside
; every 64-bit chunk of the loaded vector.

; 8 x i8: all eight lanes reversed.
define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8:
;CHECK: rev64.8b
  %vec = load <8 x i8>* %A
  %rev = shufflevector <8 x i8> %vec, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %rev
}

; 4 x i16: all four lanes reversed.
define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64D16:
;CHECK: rev64.4h
  %vec = load <4 x i16>* %A
  %rev = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %rev
}

; 2 x i32: the two lanes exchanged.
define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64D32:
;CHECK: rev64.2s
  %vec = load <2 x i32>* %A
  %rev = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %rev
}

; 2 x float: same lane exchange as the 2 x i32 case.
define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Df:
;CHECK: rev64.2s
  %vec = load <2 x float>* %A
  %rev = shufflevector <2 x float> %vec, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %rev
}

; 16 x i8: lanes 0-7 reversed and lanes 8-15 reversed independently.
define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q8:
;CHECK: rev64.16b
  %vec = load <16 x i8>* %A
  %rev = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %rev
}

; 8 x i16: lanes 0-3 reversed and lanes 4-7 reversed independently.
define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q16:
;CHECK: rev64.8h
  %vec = load <8 x i16>* %A
  %rev = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %rev
}

; 4 x i32: lanes exchanged pairwise (0<->1, 2<->3).
define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q32:
;CHECK: rev64.4s
  %vec = load <4 x i32>* %A
  %rev = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %rev
}

; 4 x float: same pairwise exchange as the 4 x i32 case.
define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Qf:
;CHECK: rev64.4s
  %vec = load <4 x float>* %A
  %rev = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %rev
}

; Vector REV32 tests: each shuffle mask reverses the element order inside
; every 32-bit chunk of the loaded vector.

; 8 x i8: bytes reversed within each group of four lanes.
define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32D8:
;CHECK: rev32.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

; 4 x i16: halfwords exchanged pairwise.
define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32D16:
;CHECK: rev32.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

; 16 x i8: bytes reversed within each group of four lanes.
define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q8:
;CHECK: rev32.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

; 8 x i16: halfwords exchanged pairwise.
define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16:
;CHECK: rev32.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

; Vector REV16 tests: the two bytes of every 16-bit chunk are exchanged.

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16D8:
;CHECK: rev16.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16Q8:
;CHECK: rev16.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

; Same mask as the full 8 x i8 reversal, with lanes 1 and 2 left undef.
define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8_undef:
;CHECK: rev64.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

; Same mask as the pairwise 8 x i16 exchange, with lanes 0, 2 and 7 left undef.
define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16_undef:
;CHECK: rev32.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64
; The body loads <8 x i16> through a bitcast of %source, extracts lanes 6 and
; 5, packs them into a <2 x i16> and stores it to %dst. The expected output is
; a Q-register load followed by two halfword lane stores.
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: st1.h
; CHECK: st1.h
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4
; The shuffle takes lane 0 of the constant vector for result lanes 0, 2 and 3,
; and lane 3 of the loaded vector (mask index 7) for result lane 1. The result
; is stored at element 11 of %dest.
; The function name is anchored with a label directive, like every other test
; in this file, so that the checks below only match inside this function.
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: rev64.4s
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}


; A vector bswap of <4 x i32> is expected to select one REV32 over 16 bytes,
; with no further rev-family instruction before the return.
define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK: rev32.16b
; CHECK-NOT: rev
; CHECK: ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone