; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vmul.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = mul <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
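; As an illustrative, unchecked sketch (the values %a and %b are hypothetical,
; not part of this test):
;   %even = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
;   %odd  = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
; With only two lanes per d-register these masks select the same elements as a
; transpose, so the backend can emit VTRN.32 and no d-register VUZP.32 tests
; are needed here.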

define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vuzp.16 q8, q9
; CHECK-NEXT:    vmov r0, r1, d18
; CHECK-NEXT:    vmov r2, r3, d19
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q9, d16[0]
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vext.32 q8, q9, q9, #2
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.32 q9, q8
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation to i8.
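; As an unchecked sketch, the narrowing chain described above is:
;   two <4 x i32> vcgt results
;     --vmovn.i32 (x2)--> two <4 x i16> halves gathered into one q-register
;     --vmovn.i16-------> a single <8 x i8> mask that feeds vbsl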
; CHECK-LABEL: cmpsel_trunc:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    add r12, sp, #48
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    vcgt.u32 q8, q10, q8
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    vcgt.u32 q9, q10, q9
; CHECK-NEXT:    vmov d20, r2, r3
; CHECK-NEXT:    vmovn.i32 d17, q8
; CHECK-NEXT:    vmovn.i32 d16, q9
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vmovn.i16 d16, q8
; CHECK-NEXT:    vbsl d16, d18, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #8
; CHECK-NEXT:    add lr, sp, #24
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    ldr r12, [sp, #40]
; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
; CHECK-NEXT:    vmov.i8 d19, #0x7
; CHECK-NEXT:    vmovl.u8 q10, d18
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vneg.s8 d17, d19
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vuzp.8 d16, d20
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vbsl d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
                 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some undef operands.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov.i8 d18, #0x7
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vneg.s8 d17, d18
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vbsl d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
                 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vldr d18, .LCPI22_0
; CHECK-NEXT:    vmov.i8 d19, #0x7
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
; CHECK-NEXT:    vneg.s8 d17, d19
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vbsl d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 6 @ 0x6
                 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; We're using large vector types here, and we have to pad them with undef values
; until we reach a vector size that we can represent.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
; CHECK-LABEL: vuzp_wide_type:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    add lr, sp, #48
; CHECK-NEXT:    vld1.32 {d17[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    vld1.32 {d16[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #56
; CHECK-NEXT:    vld1.32 {d19[0]}, [r12:32]
; CHECK-NEXT:    ldr r12, [sp, #68]
; CHECK-NEXT:    vld1.32 {d18[0]}, [lr:32]
; CHECK-NEXT:    add lr, sp, #40
; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
; CHECK-NEXT:    ldr r4, [r12]
; CHECK-NEXT:    vmov.32 d23[0], r4
; CHECK-NEXT:    add r4, sp, #64
; CHECK-NEXT:    vld1.32 {d24[0]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #36
; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #28
; CHECK-NEXT:    vcgt.u32 q10, q12, q10
; CHECK-NEXT:    vmov.u8 lr, d23[3]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #60
; CHECK-NEXT:    vld1.32 {d19[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #52
; CHECK-NEXT:    vld1.32 {d18[1]}, [r4:32]
; CHECK-NEXT:    add r4, r12, #4
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d19, q10
; CHECK-NEXT:    vldr d20, .LCPI23_0
; CHECK-NEXT:    vmovn.i32 d18, q8
; CHECK-NEXT:    vmovn.i16 d22, q9
; CHECK-NEXT:    vmov.i8 q9, #0x7
; CHECK-NEXT:    vmov.8 d17[0], lr
; CHECK-NEXT:    vneg.s8 q9, q9
; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d20
; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
; CHECK-NEXT:    add r4, sp, #8
; CHECK-NEXT:    vshl.i8 q8, q8, #7
; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
; CHECK-NEXT:    vshl.s8 q8, q8, q9
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbsl q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    pop {r4, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 1 @ 0x1
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 3 @ 0x3
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 9 @ 0x9
; CHECK-NEXT:    .byte 10 @ 0xa
                 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
  %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}

%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vorr d18, d17, d17
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d18
; CHECK-NEXT:    mov pc, lr

  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
  ret %struct.uint8x8x2_t %.fca.0.1.insert
}