; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; Verify that even/odd de-interleaving shuffle masks are matched to the NEON
; VUZP instruction (both d- and q-register forms), including masks that
; contain undef lanes and truncating-select patterns that lower through VUZP.

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
; CHECK-NEXT: vmov r0, r1, [[LDR0]]
; CHECK-NEXT: vmov r2, r3, [[LDR1]]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.16 d17, d16
; CHECK-NEXT: vadd.i16 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT: vuzp.16 [[LDR0]], [[LDR1]]
; CHECK-NEXT: vmov r0, r1, [[LDR0]]
; CHECK-NEXT: vmov r2, r3, [[LDR1]]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.

define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vuzp.8 q9, q8
; CHECK-NEXT: vadd.i8 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vuzp.8 q9, q8
; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vuzp.16 q9, q8
; CHECK-NEXT: vadd.i16 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vuzp.16 q9, q8
; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vuzp.32 q9, q8
; CHECK-NEXT: vadd.i32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vuzp.32 q9, q8
; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vuzp.32 q9, q8
; CHECK-NEXT: vadd.f32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vuzp.32 q9, q8
; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK: @ BB#0:
; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
; CHECK-NEXT: vmov r0, r1, [[LDR0]]
; CHECK-NEXT: vmov r2, r3, [[LDR1]]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vuzp.16 q9, q8
; CHECK-NEXT: vadd.i16 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vuzp.16 q9, q8
; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
entry:
; CHECK-LABEL: vuzp_lower_shufflemask_undef
; CHECK: vuzp
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
entry:
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
; CHECK-NOT: vtrn
; CHECK: vuzp
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
entry:
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
; CHECK-NOT: vtrn
; CHECK: vuzp
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
; CHECK-LABEL: vuzp_trunc
; CHECK: vmovn.i32
; CHECK: vmovn.i32
; CHECK: vuzp
; CHECK: vbsl
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
                                        <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle
; CHECK: vmovl
; CHECK: vuzp
; CHECK: vbsl
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some of the operands undefs.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
                                                    <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right
; CHECK: vuzp
; CHECK: vbsl
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
                                                   <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left
; CHECK: vuzp
; CHECK: vbsl
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; We're using large data types here, and we have to fill with undef values until we
; get some vector size that we can represent.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
                                 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
; CHECK-LABEL: vuzp_wide_type
; CHECK: vbsl
  %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}