; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vmul.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = mul <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
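;
; A minimal sketch of why (illustrative only, not part of the checked test):
; for two-element vectors %a = [a0, a1] and %b = [b0, b1], the even/odd
; deinterleave
;   %even = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
;   %odd  = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
; produces [a0, b0] and [a1, b1], which is exactly the 2x2 transpose, so the
; backend can use VTRN.32 and this file has no d-register VUZP.32 test.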

define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vuzp.16 q8, q9
; CHECK-NEXT:    vmov r0, r1, d18
; CHECK-NEXT:    vmov r2, r3, d19
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q9, d16[0]
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vext.32 q8, q9, q9, #2
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.32 q9, q8
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation to i8.
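;
; A rough sketch of that chain (assumed shapes, matching the CHECK lines below):
;   icmp ult <8 x i32>   -> two q-register compare results (vcgt.u32)
;   vmovn.i32 (twice)    -> two <4 x i16> halves packed into one q-register
;   vmovn.i16            -> a single <8 x i8> mask, consumed by vbsl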
; CHECK-LABEL: cmpsel_trunc:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    add r12, sp, #48
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    vcgt.u32 q8, q10, q8
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    vcgt.u32 q9, q10, q9
; CHECK-NEXT:    vmov d20, r2, r3
; CHECK-NEXT:    vmovn.i32 d17, q8
; CHECK-NEXT:    vmovn.i32 d16, q9
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vmovn.i16 d16, q8
; CHECK-NEXT:    vbsl d16, d18, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>; otherwise we wouldn't be able
; to perform the vuzp and produce the vbsl mask.
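;
; A rough sketch of why the extension helps (inferred from the CHECK lines
; below): after vmovl.u8 widens the loaded bytes to i16 and vmovn.i32 narrows
; the compare result to i16, one d-register vuzp.8 collects the low byte of
; every i16 lane into a single <8 x i8> select mask.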
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #8
; CHECK-NEXT:    add lr, sp, #24
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    ldr r12, [sp, #40]
; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
; CHECK-NEXT:    vmov.i8 d19, #0x7
; CHECK-NEXT:    vmovl.u8 q10, d18
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vneg.s8 d17, d19
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vuzp.8 d16, d20
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vbsl d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8>* %cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector where some of the operands are undef.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov.i8 d18, #0x7
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vneg.s8 d17, d18
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vbsl d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8>* %cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vldr d18, .LCPI22_0
; CHECK-NEXT:    vmov.i8 d19, #0x7
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
; CHECK-NEXT:    vneg.s8 d17, d19
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vbsl d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 6 @ 0x6
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8>* %cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; We're using large data types here, and we have to fill with undef values until we
; reach a vector size that we can represent.
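;
; A sketch of the assumed widening (not checked directly): the <5 x i32>
; compares are legalized to two q-registers' worth of <4 x i32> (the two
; vcgt.u32 instructions below), and the <10 x i8> select mask ends up in one
; q-register with six undef lanes.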
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
; CHECK-LABEL: vuzp_wide_type:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    add lr, sp, #48
; CHECK-NEXT:    vld1.32 {d17[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    vld1.32 {d16[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #56
; CHECK-NEXT:    vld1.32 {d19[0]}, [r12:32]
; CHECK-NEXT:    ldr r12, [sp, #68]
; CHECK-NEXT:    vld1.32 {d18[0]}, [lr:32]
; CHECK-NEXT:    add lr, sp, #40
; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
; CHECK-NEXT:    ldr r4, [r12]
; CHECK-NEXT:    vmov.32 d23[0], r4
; CHECK-NEXT:    add r4, sp, #64
; CHECK-NEXT:    vld1.32 {d24[0]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #36
; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #28
; CHECK-NEXT:    vcgt.u32 q10, q12, q10
; CHECK-NEXT:    vmov.u8 lr, d23[3]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #60
; CHECK-NEXT:    vld1.32 {d19[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #52
; CHECK-NEXT:    vld1.32 {d18[1]}, [r4:32]
; CHECK-NEXT:    add r4, r12, #4
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d19, q10
; CHECK-NEXT:    vldr d20, .LCPI23_0
; CHECK-NEXT:    vmovn.i32 d18, q8
; CHECK-NEXT:    vmovn.i16 d22, q9
; CHECK-NEXT:    vmov.i8 q9, #0x7
; CHECK-NEXT:    vmov.8 d17[0], lr
; CHECK-NEXT:    vneg.s8 q9, q9
; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d20
; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
; CHECK-NEXT:    add r4, sp, #8
; CHECK-NEXT:    vshl.i8 q8, q8, #7
; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
; CHECK-NEXT:    vshl.s8 q8, q8, q9
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbsl q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    pop {r4, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 1 @ 0x1
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 3 @ 0x3
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 9 @ 0x9
; CHECK-NEXT:    .byte 10 @ 0xa
                            <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8>* %cmp2_ptr) {
  %cmp2_load = load <5 x i8>, <5 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}

%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
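; This struct mirrors the NEON C type uint8x8x2_t; a hypothetical C-level
; equivalent of the test below (illustrative only, not part of the checked
; input) would be:
;   uint8x8x2_t r = vuzp_u8(vget_low_u8(t), vget_high_u8(t));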
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vorr d18, d17, d17
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d18
; CHECK-NEXT:    mov pc, lr

  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
  ret %struct.uint8x8x2_t %.fca.0.1.insert
}
    549