; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

; Scalar i32 bswap should select a single 32-bit REV on w0.
define i32 @test_rev_w(i32 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_w:
; CHECK: rev w0, w0
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

; Scalar i64 bswap should select a single 64-bit REV on x0.
define i64 @test_rev_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_x:
; CHECK: rev x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
; The zext from i16 guarantees the high half is zero, so the shift can be
; treated as a rotate and folded into REV16.
define i32 @test_rev_w_srl16(i16 %a) {
entry:
; CHECK-LABEL: test_rev_w_srl16:
; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
; CHECK: rev16 w0, [[REG]]
; CHECK-NOT: lsr
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
; The zext from i32 guarantees the high 32 bits are zero, so rev + lsr 32
; folds to a single REV32.
define i64 @test_rev_x_srl32(i32 %a) {
entry:
; CHECK-LABEL: test_rev_x_srl32:
; CHECK: rev32 x0, {{x[0-9]+}}
; CHECK-NOT: lsr
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

; Hand-expanded byte-swap-within-halfwords pattern (shift/and/or tree) should
; still be recognized as a 32-bit REV16.
define i32 @test_rev16_w(i32 %X) nounwind {
entry:
; CHECK-LABEL: test_rev16_w:
; CHECK: rev16 w0, w0
  %tmp1 = lshr i32 %X, 8
  ; Deliberate no-op bitcast; it must not block the REV16 match.
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
; 01234567 ->(bswap) 76543210 ->(rotr) 10765432
; 01234567 ->(rev16) 10325476
; So (rotr (bswap x), 16) must NOT be matched to a 64-bit REV16.
define i64 @test_rev16_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev16_x:
; CHECK-NOT: rev16 x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

; (rotr (bswap x), 32) on i64 IS equivalent to REV32.
define i64 @test_rev32_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev32_x:
; CHECK: rev32 x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

; Vector reverse shuffles: each shufflevector below reverses elements within
; 64-bit (rev64.*), 32-bit (rev32.*), or 16-bit (rev16.*) containers and
; should be matched to the corresponding NEON REV instruction.

define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8:
;CHECK: rev64.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64D16:
;CHECK: rev64.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64D32:
;CHECK: rev64.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

; Float vectors use the same integer REV encodings (rev64.2s).
define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Df:
;CHECK: rev64.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q8:
;CHECK: rev64.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q16:
;CHECK: rev64.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q32:
;CHECK: rev64.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Qf:
;CHECK: rev64.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32D8:
;CHECK: rev32.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32D16:
;CHECK: rev32.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q8:
;CHECK: rev32.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16:
;CHECK: rev32.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16D8:
;CHECK: rev16.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16Q8:
;CHECK: rev16.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not
; prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8_undef:
;CHECK: rev64.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  ; Positions 1 and 2 are undef; the defined lanes still form a rev64 pattern.
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16_undef:
;CHECK: rev32.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: st1.h
; CHECK: st1.h
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  ; Extract elements 6 and 5 in reversed order into a <2 x i16>.
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4
; Fixed: use CHECK-LABEL (with trailing colon) like every other test in this
; file, so FileCheck anchors the match to this function's body instead of
; allowing the subsequent CHECKs to match anywhere later in the output.
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: rev64.4s
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}

; A v4i32 bswap is a byte reverse within each 32-bit lane: exactly rev32.16b,
; with no additional rev needed afterwards.
define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK: rev32.16b
; CHECK-NOT: rev
; CHECK: ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone