Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
      2 
      3 define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; The two masks select the even lanes and the odd lanes of %A/%B,
        ; interleaved -- the two results of a byte transpose. The expected code
        ; uses a single vtrn.8 on D registers followed by the add.
      4 ; CHECK-LABEL: vtrni8:
      5 ; CHECK:       @ BB#0:
      6 ; CHECK-NEXT:    vldr d16, [r1]
      7 ; CHECK-NEXT:    vldr d17, [r0]
      8 ; CHECK-NEXT:    vtrn.8 d17, d16
      9 ; CHECK-NEXT:    vadd.i8 d16, d17, d16
     10 ; CHECK-NEXT:    vmov r0, r1, d16
     11 ; CHECK-NEXT:    mov pc, lr
     12   %lhs = load <8 x i8>, <8 x i8>* %A
     13   %rhs = load <8 x i8>, <8 x i8>* %B
     14   %even = shufflevector <8 x i8> %lhs, <8 x i8> %rhs, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
     15   %odd = shufflevector <8 x i8> %lhs, <8 x i8> %rhs, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
     16   %sum = add <8 x i8> %even, %odd
     17   ret <8 x i8> %sum
     18 }
     19 
     20 define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; A single 16-lane shuffle whose low half is the even-lane pattern and
        ; whose high half is the odd-lane pattern. The expected code returns
        ; both outputs of one vtrn.8 in r0-r3 without any further shuffling.
     21 ; CHECK-LABEL: vtrni8_Qres:
     22 ; CHECK:       @ BB#0:
     23 ; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
     24 ; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
     25 ; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
     26 ; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
     27 ; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
     28 ; CHECK-NEXT:    mov pc, lr
     29   %lhs = load <8 x i8>, <8 x i8>* %A
     30   %rhs = load <8 x i8>, <8 x i8>* %B
     31   %both = shufflevector <8 x i8> %lhs, <8 x i8> %rhs, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
     32   ret <16 x i8> %both
     33 }
     34 
     35 define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Halfword variant: even-lane and odd-lane interleaves of %A/%B are
        ; expected to come from one vtrn.16 on D registers, then be added.
     36 ; CHECK-LABEL: vtrni16:
     37 ; CHECK:       @ BB#0:
     38 ; CHECK-NEXT:    vldr d16, [r1]
     39 ; CHECK-NEXT:    vldr d17, [r0]
     40 ; CHECK-NEXT:    vtrn.16 d17, d16
     41 ; CHECK-NEXT:    vadd.i16 d16, d17, d16
     42 ; CHECK-NEXT:    vmov r0, r1, d16
     43 ; CHECK-NEXT:    mov pc, lr
     44   %lhs = load <4 x i16>, <4 x i16>* %A
     45   %rhs = load <4 x i16>, <4 x i16>* %B
     46   %even = shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
     47   %odd = shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
     48   %sum = add <4 x i16> %even, %odd
     49   ret <4 x i16> %sum
     50 }
     51 
     52 define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; One 8-lane shuffle concatenating the even-lane and odd-lane patterns;
        ; the expected code is a single vtrn.16 whose two D-register outputs are
        ; returned in r0-r3.
     53 ; CHECK-LABEL: vtrni16_Qres:
     54 ; CHECK:       @ BB#0:
     55 ; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
     56 ; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
     57 ; CHECK-NEXT:    vtrn.16 [[LDR0]], [[LDR1]]
     58 ; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
     59 ; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
     60 ; CHECK-NEXT:    mov pc, lr
     61   %lhs = load <4 x i16>, <4 x i16>* %A
     62   %rhs = load <4 x i16>, <4 x i16>* %B
     63   %both = shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
     64   ret <8 x i16> %both
     65 }
     66 
     67 define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Word variant on two-lane vectors: masks <0,2> and <1,3> are expected
        ; to be selected as one vtrn.32, followed by the add.
     68 ; CHECK-LABEL: vtrni32:
     69 ; CHECK:       @ BB#0:
     70 ; CHECK-NEXT:    vldr d16, [r1]
     71 ; CHECK-NEXT:    vldr d17, [r0]
     72 ; CHECK-NEXT:    vtrn.32 d17, d16
     73 ; CHECK-NEXT:    vadd.i32 d16, d17, d16
     74 ; CHECK-NEXT:    vmov r0, r1, d16
     75 ; CHECK-NEXT:    mov pc, lr
     76   %lhs = load <2 x i32>, <2 x i32>* %A
     77   %rhs = load <2 x i32>, <2 x i32>* %B
     78   %even = shufflevector <2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> <i32 0, i32 2>
     79   %odd = shufflevector <2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 3>
     80   %sum = add <2 x i32> %even, %odd
     81   ret <2 x i32> %sum
     82 }
     83 
     84 define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Mask <0,2,1,3> is the even-lane pattern followed by the odd-lane
        ; pattern; the expected code is one vtrn.32 with both outputs returned
        ; in r0-r3.
     85 ; CHECK-LABEL: vtrni32_Qres:
     86 ; CHECK:       @ BB#0:
     87 ; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
     88 ; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
     89 ; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
     90 ; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
     91 ; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
     92 ; CHECK-NEXT:    mov pc, lr
     93   %lhs = load <2 x i32>, <2 x i32>* %A
     94   %rhs = load <2 x i32>, <2 x i32>* %B
     95   %both = shufflevector <2 x i32> %lhs, <2 x i32> %rhs, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
     96   ret <4 x i32> %both
     97 }
     98 
     99 define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
        ; Floating-point variant of the two-lane test: the shuffles are expected
        ; to become one vtrn.32 and the sum a vadd.f32.
    100 ; CHECK-LABEL: vtrnf:
    101 ; CHECK:       @ BB#0:
    102 ; CHECK-NEXT:    vldr d16, [r1]
    103 ; CHECK-NEXT:    vldr d17, [r0]
    104 ; CHECK-NEXT:    vtrn.32 d17, d16
    105 ; CHECK-NEXT:    vadd.f32 d16, d17, d16
    106 ; CHECK-NEXT:    vmov r0, r1, d16
    107 ; CHECK-NEXT:    mov pc, lr
    108   %lhs = load <2 x float>, <2 x float>* %A
    109   %rhs = load <2 x float>, <2 x float>* %B
    110   %even = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 0, i32 2>
    111   %odd = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 1, i32 3>
    112   %sum = fadd <2 x float> %even, %odd
    113   ret <2 x float> %sum
    114 }
    115 
    116 define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind {
        ; Float version of the combined <0,2,1,3> shuffle: one vtrn.32 is
        ; expected, with both D-register outputs returned in r0-r3.
    117 ; CHECK-LABEL: vtrnf_Qres:
    118 ; CHECK:       @ BB#0:
    119 ; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
    120 ; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
    121 ; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
    122 ; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
    123 ; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
    124 ; CHECK-NEXT:    mov pc, lr
    125   %lhs = load <2 x float>, <2 x float>* %A
    126   %rhs = load <2 x float>, <2 x float>* %B
    127   %both = shufflevector <2 x float> %lhs, <2 x float> %rhs, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
    128   ret <4 x float> %both
    129 }
    130 
    131 define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
        ; 128-bit byte variant: the even-lane and odd-lane interleaves of %A/%B
        ; are expected to come from one vtrn.8 on Q registers, then be added.
    132 ; CHECK-LABEL: vtrnQi8:
    133 ; CHECK:       @ BB#0:
    134 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
    135 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
    136 ; CHECK-NEXT:    vtrn.8 q9, q8
    137 ; CHECK-NEXT:    vadd.i8 q8, q9, q8
    138 ; CHECK-NEXT:    vmov r0, r1, d16
    139 ; CHECK-NEXT:    vmov r2, r3, d17
    140 ; CHECK-NEXT:    mov pc, lr
    141   %lhs = load <16 x i8>, <16 x i8>* %A
    142   %rhs = load <16 x i8>, <16 x i8>* %B
    143   %even = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
    144   %odd = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
    145   %sum = add <16 x i8> %even, %odd
    146   ret <16 x i8> %sum
    147 }
    148 
    149 define <32 x i8> @vtrnQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
        ; One 32-lane shuffle produces both transpose results. The expected
        ; code loads the inputs via r1/r2, does a single vtrn.8 on Q registers
        ; and stores the two outputs back-to-back through r0.
    150 ; CHECK-LABEL: vtrnQi8_QQres:
    151 ; CHECK:       @ BB#0:
    152 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
    153 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
    154 ; CHECK-NEXT:    vtrn.8 q9, q8
    155 ; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
    156 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
    157 ; CHECK-NEXT:    mov pc, lr
    158   %lhs = load <16 x i8>, <16 x i8>* %A
    159   %rhs = load <16 x i8>, <16 x i8>* %B
    160   %both = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <32 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30, i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
    161   ret <32 x i8> %both
    162 }
    163 
    164 define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; 128-bit halfword variant: both interleave shuffles are expected to be
        ; covered by one vtrn.16 on Q registers before the add.
    165 ; CHECK-LABEL: vtrnQi16:
    166 ; CHECK:       @ BB#0:
    167 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
    168 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
    169 ; CHECK-NEXT:    vtrn.16 q9, q8
    170 ; CHECK-NEXT:    vadd.i16 q8, q9, q8
    171 ; CHECK-NEXT:    vmov r0, r1, d16
    172 ; CHECK-NEXT:    vmov r2, r3, d17
    173 ; CHECK-NEXT:    mov pc, lr
    174   %lhs = load <8 x i16>, <8 x i16>* %A
    175   %rhs = load <8 x i16>, <8 x i16>* %B
    176   %even = shufflevector <8 x i16> %lhs, <8 x i16> %rhs, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
    177   %odd = shufflevector <8 x i16> %lhs, <8 x i16> %rhs, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
    178   %sum = add <8 x i16> %even, %odd
    179   ret <8 x i16> %sum
    180 }
    181 
    182 define <16 x i16> @vtrnQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; One 16-lane shuffle yields both transpose results; the expected code
        ; is a single vtrn.16 on Q registers with the two outputs stored
        ; consecutively through r0.
    183 ; CHECK-LABEL: vtrnQi16_QQres:
    184 ; CHECK:       @ BB#0:
    185 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
    186 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
    187 ; CHECK-NEXT:    vtrn.16 q9, q8
    188 ; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
    189 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
    190 ; CHECK-NEXT:    mov pc, lr
    191   %lhs = load <8 x i16>, <8 x i16>* %A
    192   %rhs = load <8 x i16>, <8 x i16>* %B
    193   %both = shufflevector <8 x i16> %lhs, <8 x i16> %rhs, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
    194   ret <16 x i16> %both
    195 }
    196 
    197 define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; 128-bit word variant: masks <0,4,2,6> and <1,5,3,7> are expected to be
        ; produced by one vtrn.32 on Q registers, then added.
    198 ; CHECK-LABEL: vtrnQi32:
    199 ; CHECK:       @ BB#0:
    200 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
    201 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
    202 ; CHECK-NEXT:    vtrn.32 q9, q8
    203 ; CHECK-NEXT:    vadd.i32 q8, q9, q8
    204 ; CHECK-NEXT:    vmov r0, r1, d16
    205 ; CHECK-NEXT:    vmov r2, r3, d17
    206 ; CHECK-NEXT:    mov pc, lr
    207   %lhs = load <4 x i32>, <4 x i32>* %A
    208   %rhs = load <4 x i32>, <4 x i32>* %B
    209   %even = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
    210   %odd = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
    211   %sum = add <4 x i32> %even, %odd
    212   ret <4 x i32> %sum
    213 }
    214 
    215 define <8 x i32> @vtrnQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; One 8-lane shuffle concatenates both transpose results; the expected
        ; code is a single vtrn.32 on Q registers with both outputs stored
        ; consecutively through r0.
    216 ; CHECK-LABEL: vtrnQi32_QQres:
    217 ; CHECK:       @ BB#0:
    218 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
    219 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
    220 ; CHECK-NEXT:    vtrn.32 q9, q8
    221 ; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
    222 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
    223 ; CHECK-NEXT:    mov pc, lr
    224   %lhs = load <4 x i32>, <4 x i32>* %A
    225   %rhs = load <4 x i32>, <4 x i32>* %B
    226   %both = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
    227   ret <8 x i32> %both
    228 }
    229 
    230 define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
        ; 128-bit float variant: the two interleave shuffles are expected to
        ; become one vtrn.32 on Q registers and the sum a vadd.f32.
    231 ; CHECK-LABEL: vtrnQf:
    232 ; CHECK:       @ BB#0:
    233 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
    234 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
    235 ; CHECK-NEXT:    vtrn.32 q9, q8
    236 ; CHECK-NEXT:    vadd.f32 q8, q9, q8
    237 ; CHECK-NEXT:    vmov r0, r1, d16
    238 ; CHECK-NEXT:    vmov r2, r3, d17
    239 ; CHECK-NEXT:    mov pc, lr
    240   %lhs = load <4 x float>, <4 x float>* %A
    241   %rhs = load <4 x float>, <4 x float>* %B
    242   %even = shufflevector <4 x float> %lhs, <4 x float> %rhs, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
    243   %odd = shufflevector <4 x float> %lhs, <4 x float> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
    244   %sum = fadd <4 x float> %even, %odd
    245   ret <4 x float> %sum
    246 }
    247 
    248 define <8 x float> @vtrnQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
        ; Float version of the combined 8-lane shuffle: a single vtrn.32 on Q
        ; registers is expected, with both outputs stored consecutively
        ; through r0.
    249 ; CHECK-LABEL: vtrnQf_QQres:
    250 ; CHECK:       @ BB#0:
    251 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
    252 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
    253 ; CHECK-NEXT:    vtrn.32 q9, q8
    254 ; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
    255 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
    256 ; CHECK-NEXT:    mov pc, lr
    257   %lhs = load <4 x float>, <4 x float>* %A
    258   %rhs = load <4 x float>, <4 x float>* %B
    259   %both = shufflevector <4 x float> %lhs, <4 x float> %rhs, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
    260   ret <8 x float> %both
    261 }
    262 
    263 
    264 define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; Same shape as the plain byte test, but some mask lanes are undef.
        ; The remaining lanes still fit the even/odd transpose patterns, so a
        ; single vtrn.8 is expected.
    265 ; CHECK-LABEL: vtrni8_undef:
    266 ; CHECK:       @ BB#0:
    267 ; CHECK-NEXT:    vldr d16, [r1]
    268 ; CHECK-NEXT:    vldr d17, [r0]
    269 ; CHECK-NEXT:    vtrn.8 d17, d16
    270 ; CHECK-NEXT:    vadd.i8 d16, d17, d16
    271 ; CHECK-NEXT:    vmov r0, r1, d16
    272 ; CHECK-NEXT:    mov pc, lr
    273   %lhs = load <8 x i8>, <8 x i8>* %A
    274   %rhs = load <8 x i8>, <8 x i8>* %B
    275   %even = shufflevector <8 x i8> %lhs, <8 x i8> %rhs, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
    276   %odd = shufflevector <8 x i8> %lhs, <8 x i8> %rhs, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
    277   %sum = add <8 x i8> %even, %odd
    278   ret <8 x i8> %sum
    279 }
    280 
    281 define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; Combined 16-lane transpose mask with several undef lanes; a single
        ; vtrn.8 is still expected, with both outputs returned in r0-r3.
    282 ; CHECK-LABEL: vtrni8_undef_Qres:
    283 ; CHECK:       @ BB#0:
    284 ; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
    285 ; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
    286 ; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
    287 ; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
    288 ; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
    289 ; CHECK-NEXT:    mov pc, lr
    290   %lhs = load <8 x i8>, <8 x i8>* %A
    291   %rhs = load <8 x i8>, <8 x i8>* %B
    292   %both = shufflevector <8 x i8> %lhs, <8 x i8> %rhs, <16 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
    293   ret <16 x i8> %both
    294 }
    295 
    296 define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; 128-bit halfword test with undef mask lanes; the defined lanes still
        ; fit the even/odd transpose patterns, so one vtrn.16 on Q registers is
        ; expected.
    297 ; CHECK-LABEL: vtrnQi16_undef:
    298 ; CHECK:       @ BB#0:
    299 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
    300 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
    301 ; CHECK-NEXT:    vtrn.16 q9, q8
    302 ; CHECK-NEXT:    vadd.i16 q8, q9, q8
    303 ; CHECK-NEXT:    vmov r0, r1, d16
    304 ; CHECK-NEXT:    vmov r2, r3, d17
    305 ; CHECK-NEXT:    mov pc, lr
    306   %lhs = load <8 x i16>, <8 x i16>* %A
    307   %rhs = load <8 x i16>, <8 x i16>* %B
    308   %even = shufflevector <8 x i16> %lhs, <8 x i16> %rhs, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
    309   %odd = shufflevector <8 x i16> %lhs, <8 x i16> %rhs, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
    310   %sum = add <8 x i16> %even, %odd
    311   ret <8 x i16> %sum
    312 }
    313 
    314 define <16 x i16> @vtrnQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Combined 16-lane transpose mask with undef lanes; a single vtrn.16 on
        ; Q registers is still expected, with both outputs stored consecutively
        ; through r0.
    315 ; CHECK-LABEL: vtrnQi16_undef_QQres:
    316 ; CHECK:       @ BB#0:
    317 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
    318 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
    319 ; CHECK-NEXT:    vtrn.16 q9, q8
    320 ; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
    321 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
    322 ; CHECK-NEXT:    mov pc, lr
    323   %lhs = load <8 x i16>, <8 x i16>* %A
    324   %rhs = load <8 x i16>, <8 x i16>* %B
    325   %both = shufflevector <8 x i16> %lhs, <8 x i16> %rhs, <16 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14, i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
    326   ret <16 x i16> %both
    327 }
    328 
    329 define <8 x i16> @vtrn_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
    330 entry:
        ; The lower half of the mask is entirely undef; the upper half is the
        ; odd-lane transpose pattern <1,5,3,7>, which should still be lowered to
        ; a vtrn on 16-bit lanes.
        ; The instruction is matched with its ".16" suffix and the label with
        ; its trailing colon because the function name itself contains "vtrn":
        ; a bare "vtrn" pattern would be satisfied by the .type/.size directives
        ; or the label even if no vtrn instruction were emitted.
    331   ; CHECK-LABEL: vtrn_lower_shufflemask_undef:
    332   ; CHECK: vtrn.16
    333   %tmp1 = load <4 x i16>, <4 x i16>* %A
    334   %tmp2 = load <4 x i16>, <4 x i16>* %B
    335   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
    336   ret <8 x i16> %0
    337 }
    338 
    339 ; Here we get a build_vector node, where all the incoming extract_element
    340 ; values do modify the type. However, we get different input types, as some of
    341 ; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
    342 ; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
    343 define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
    344                                              <4 x i32> %cmp0, <4 x i32> %cmp1,
    345                                              <4 x i16> %cmp2, <4 x i16> %cmp3) {
        ; The select mask interleaves the lanes of a 32-bit compare with the
        ; lanes of a 16-bit compare. The expected code narrows the 32-bit
        ; compare, transposes, and finishes with a bitwise select.
    346   ; CHECK-LABEL: vtrn_mismatched_builvector0
    347   ; CHECK: vmovn.i32
    348   ; CHECK: vtrn
    349   ; CHECK: vbsl
    350   %lt32 = icmp ult <4 x i32> %cmp0, %cmp1
    351   %lt16 = icmp ult <4 x i16> %cmp2, %cmp3
    352   %interleaved = shufflevector <4 x i1> %lt32, <4 x i1> %lt16, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
    353   %picked = select <8 x i1> %interleaved, <8 x i8> %tr0, <8 x i8> %tr1
    354   ret <8 x i8> %picked
    355 }
    356 
    357 ; Here we get a build_vector node, where half the incoming extract_element
    358 ; values do not modify the type (the values from cmp2), but half of them do
    359 ; (from the icmp operation).
    360 define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
    361                            <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
        ; The select mask interleaves the lanes of a 32-bit compare with four
        ; i1 values truncated from bytes loaded through %cmp2_ptr.
    362   ; CHECK-LABEL: vtrn_mismatched_builvector1
    363   ; The loaded 4 x i8 has to be widened to 4 x i16 before the vtrn can be done
    364   ; CHECK: vmovl
    365   ; CHECK: vtrn.8
    366   ; CHECK: vbsl
    367   %bytes = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
    368   %bits = trunc <4 x i8> %bytes to <4 x i1>
    369   %lt32 = icmp ult <4 x i32> %cmp0, %cmp1
    370   %interleaved = shufflevector <4 x i1> %lt32, <4 x i1> %bits, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
    371   %picked = select <8 x i1> %interleaved, <8 x i8> %tr0, <8 x i8> %tr1
    372   ret <8 x i8> %picked
    373 }
    374 
    375 ; Negative test that should not generate a vtrn
    376 define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
    377 entry:
        ; Both halves of the mask fit the same odd-lane pattern <1,5,3,7> (the
        ; first lane is undef), so the shuffle is not a transpose of the two
        ; inputs. No vtrn may appear between the entry block marker and the
        ; return.
    378   ; CHECK-LABEL: lower_twice_no_vtrn
    379   ; CHECK: @ BB#0:
    380   ; CHECK-NOT: vtrn
    381   ; CHECK: mov pc, lr
    382   %lhs = load <4 x i16>, <4 x i16>* %A
    383   %rhs = load <4 x i16>, <4 x i16>* %B
    384   %shuf = shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
    385   store <8 x i16> %shuf, <8 x i16>* %C
    386   ret void
    387 }
    388 
    389 ; Negative test that should not generate a vtrn
    390 define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
    391 entry:
        ; Both halves of the mask fit the same even-lane pattern <0,4,2,6> (the
        ; second lane is undef), so the shuffle is not a transpose of the two
        ; inputs. No vtrn may appear between the entry block marker and the
        ; return.
    392   ; CHECK-LABEL: upper_twice_no_vtrn
    393   ; CHECK: @ BB#0:
    394   ; CHECK-NOT: vtrn
    395   ; CHECK: mov pc, lr
    396   %lhs = load <4 x i16>, <4 x i16>* %A
    397   %rhs = load <4 x i16>, <4 x i16>* %B
    398   %shuf = shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>
    399   store <8 x i16> %shuf, <8 x i16>* %C
    400   ret void
    401 }
    402