; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
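
; EXT extracts a vector from the concatenation of its two source registers,
; starting at a byte offset given by the immediate. For a sequential shuffle
; mask starting at element i with S-byte elements, the expected immediate is
; i * S; e.g. test_vext_u16 below shuffles <4 x i16> starting at element 2,
; so its check expects an ext with #4 (2 * 2 bytes).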

define void @test_vext_s8() nounwind ssp {
  ; CHECK-LABEL: test_vext_s8:
  ; CHECK: {{ext.8.*#1}}
  %xS8x8 = alloca <8 x i8>, align 8
  %__a = alloca <8 x i8>, align 8
  %__b = alloca <8 x i8>, align 8
  %tmp = load <8 x i8>, <8 x i8>* %xS8x8, align 8
  store <8 x i8> %tmp, <8 x i8>* %__a, align 8
  %tmp1 = load <8 x i8>, <8 x i8>* %xS8x8, align 8
  store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
  %tmp2 = load <8 x i8>, <8 x i8>* %__a, align 8
  %tmp3 = load <8 x i8>, <8 x i8>* %__b, align 8
  %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  store <8 x i8> %vext, <8 x i8>* %xS8x8, align 8
  ret void
}
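
; A minimal sketch of the same pattern without the alloca/load/store traffic
; (hypothetical function, deliberately not covered by CHECK lines): the
; shuffle mask alone should be enough to select an ext with #1.
define <8 x i8> @test_vext_s8_sketch(<8 x i8> %a, <8 x i8> %b) nounwind {
  ; Sequential mask starting at element 1 of 1-byte elements -> imm #1.
  %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i8> %vext
}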

define void @test_vext_u8() nounwind ssp {
  ; CHECK-LABEL: test_vext_u8:
  ; CHECK: {{ext.8.*#2}}
  %xU8x8 = alloca <8 x i8>, align 8
  %__a = alloca <8 x i8>, align 8
  %__b = alloca <8 x i8>, align 8
  %tmp = load <8 x i8>, <8 x i8>* %xU8x8, align 8
  store <8 x i8> %tmp, <8 x i8>* %__a, align 8
  %tmp1 = load <8 x i8>, <8 x i8>* %xU8x8, align 8
  store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
  %tmp2 = load <8 x i8>, <8 x i8>* %__a, align 8
  %tmp3 = load <8 x i8>, <8 x i8>* %__b, align 8
  %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  store <8 x i8> %vext, <8 x i8>* %xU8x8, align 8
  ret void
}

define void @test_vext_p8() nounwind ssp {
  ; CHECK-LABEL: test_vext_p8:
  ; CHECK: {{ext.8.*#3}}
  %xP8x8 = alloca <8 x i8>, align 8
  %__a = alloca <8 x i8>, align 8
  %__b = alloca <8 x i8>, align 8
  %tmp = load <8 x i8>, <8 x i8>* %xP8x8, align 8
  store <8 x i8> %tmp, <8 x i8>* %__a, align 8
  %tmp1 = load <8 x i8>, <8 x i8>* %xP8x8, align 8
  store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
  %tmp2 = load <8 x i8>, <8 x i8>* %__a, align 8
  %tmp3 = load <8 x i8>, <8 x i8>* %__b, align 8
  %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
  store <8 x i8> %vext, <8 x i8>* %xP8x8, align 8
  ret void
}

define void @test_vext_s16() nounwind ssp {
  ; CHECK-LABEL: test_vext_s16:
  ; CHECK: {{ext.8.*#2}}
  %xS16x4 = alloca <4 x i16>, align 8
  %__a = alloca <4 x i16>, align 8
  %__b = alloca <4 x i16>, align 8
  %tmp = load <4 x i16>, <4 x i16>* %xS16x4, align 8
  store <4 x i16> %tmp, <4 x i16>* %__a, align 8
  %tmp1 = load <4 x i16>, <4 x i16>* %xS16x4, align 8
  store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
  %tmp2 = load <4 x i16>, <4 x i16>* %__a, align 8
  %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
  %tmp4 = load <4 x i16>, <4 x i16>* %__b, align 8
  %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
  %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
  %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  store <4 x i16> %vext, <4 x i16>* %xS16x4, align 8
  ret void
}

define void @test_vext_u16() nounwind ssp {
  ; CHECK-LABEL: test_vext_u16:
  ; CHECK: {{ext.8.*#4}}
  %xU16x4 = alloca <4 x i16>, align 8
  %__a = alloca <4 x i16>, align 8
  %__b = alloca <4 x i16>, align 8
  %tmp = load <4 x i16>, <4 x i16>* %xU16x4, align 8
  store <4 x i16> %tmp, <4 x i16>* %__a, align 8
  %tmp1 = load <4 x i16>, <4 x i16>* %xU16x4, align 8
  store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
  %tmp2 = load <4 x i16>, <4 x i16>* %__a, align 8
  %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
  %tmp4 = load <4 x i16>, <4 x i16>* %__b, align 8
  %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
  %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
  %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  store <4 x i16> %vext, <4 x i16>* %xU16x4, align 8
  ret void
}

define void @test_vext_p16() nounwind ssp {
  ; CHECK-LABEL: test_vext_p16:
  ; CHECK: {{ext.8.*#6}}
  %xP16x4 = alloca <4 x i16>, align 8
  %__a = alloca <4 x i16>, align 8
  %__b = alloca <4 x i16>, align 8
  %tmp = load <4 x i16>, <4 x i16>* %xP16x4, align 8
  store <4 x i16> %tmp, <4 x i16>* %__a, align 8
  %tmp1 = load <4 x i16>, <4 x i16>* %xP16x4, align 8
  store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
  %tmp2 = load <4 x i16>, <4 x i16>* %__a, align 8
  %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
  %tmp4 = load <4 x i16>, <4 x i16>* %__b, align 8
  %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
  %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
  %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  store <4 x i16> %vext, <4 x i16>* %xP16x4, align 8
  ret void
}

define void @test_vext_s32() nounwind ssp {
  ; CHECK-LABEL: test_vext_s32:
  ; CHECK: {{rev64.2s.*}}
  %xS32x2 = alloca <2 x i32>, align 8
  %__a = alloca <2 x i32>, align 8
  %__b = alloca <2 x i32>, align 8
  %tmp = load <2 x i32>, <2 x i32>* %xS32x2, align 8
  store <2 x i32> %tmp, <2 x i32>* %__a, align 8
  %tmp1 = load <2 x i32>, <2 x i32>* %xS32x2, align 8
  store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
  %tmp2 = load <2 x i32>, <2 x i32>* %__a, align 8
  %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
  %tmp4 = load <2 x i32>, <2 x i32>* %__b, align 8
  %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
  %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
  %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i32> %vext, <2 x i32>* %xS32x2, align 8
  ret void
}
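
; A note on the rev64.2s checks in the three <2 x i32>/<2 x float> tests:
; both shuffle operands are loads of the same slot, so once the memory
; traffic is optimized away the two inputs are presumably identical, and a
; <1, 2> mask over two copies of the same vector is just a lane swap; hence
; a rev64 rather than an ext is expected.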

define void @test_vext_u32() nounwind ssp {
  ; CHECK-LABEL: test_vext_u32:
  ; CHECK: {{rev64.2s.*}}
  %xU32x2 = alloca <2 x i32>, align 8
  %__a = alloca <2 x i32>, align 8
  %__b = alloca <2 x i32>, align 8
  %tmp = load <2 x i32>, <2 x i32>* %xU32x2, align 8
  store <2 x i32> %tmp, <2 x i32>* %__a, align 8
  %tmp1 = load <2 x i32>, <2 x i32>* %xU32x2, align 8
  store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
  %tmp2 = load <2 x i32>, <2 x i32>* %__a, align 8
  %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
  %tmp4 = load <2 x i32>, <2 x i32>* %__b, align 8
  %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
  %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
  %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i32> %vext, <2 x i32>* %xU32x2, align 8
  ret void
}

define void @test_vext_f32() nounwind ssp {
  ; CHECK-LABEL: test_vext_f32:
  ; CHECK: {{rev64.2s.*}}
  %xF32x2 = alloca <2 x float>, align 8
  %__a = alloca <2 x float>, align 8
  %__b = alloca <2 x float>, align 8
  %tmp = load <2 x float>, <2 x float>* %xF32x2, align 8
  store <2 x float> %tmp, <2 x float>* %__a, align 8
  %tmp1 = load <2 x float>, <2 x float>* %xF32x2, align 8
  store <2 x float> %tmp1, <2 x float>* %__b, align 8
  %tmp2 = load <2 x float>, <2 x float>* %__a, align 8
  %tmp3 = bitcast <2 x float> %tmp2 to <8 x i8>
  %tmp4 = load <2 x float>, <2 x float>* %__b, align 8
  %tmp5 = bitcast <2 x float> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <2 x float>
  %tmp7 = bitcast <8 x i8> %tmp5 to <2 x float>
  %vext = shufflevector <2 x float> %tmp6, <2 x float> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x float> %vext, <2 x float>* %xF32x2, align 8
  ret void
}

define void @test_vext_s64() nounwind ssp {
  ; CHECK-LABEL: test_vext_s64:
  ; CHECK_FIXME: {{rev64.2s.*}}
  ; This just turns into a load of the second element.
  %xS64x1 = alloca <1 x i64>, align 8
  %__a = alloca <1 x i64>, align 8
  %__b = alloca <1 x i64>, align 8
  %tmp = load <1 x i64>, <1 x i64>* %xS64x1, align 8
  store <1 x i64> %tmp, <1 x i64>* %__a, align 8
  %tmp1 = load <1 x i64>, <1 x i64>* %xS64x1, align 8
  store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
  %tmp2 = load <1 x i64>, <1 x i64>* %__a, align 8
  %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
  %tmp4 = load <1 x i64>, <1 x i64>* %__b, align 8
  %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
  %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
  %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
  store <1 x i64> %vext, <1 x i64>* %xS64x1, align 8
  ret void
}

define void @test_vext_u64() nounwind ssp {
  ; CHECK-LABEL: test_vext_u64:
  ; CHECK_FIXME: {{ext.8.*#1}}
  ; This likewise turns into a simple load of the second element.
  %xU64x1 = alloca <1 x i64>, align 8
  %__a = alloca <1 x i64>, align 8
  %__b = alloca <1 x i64>, align 8
  %tmp = load <1 x i64>, <1 x i64>* %xU64x1, align 8
  store <1 x i64> %tmp, <1 x i64>* %__a, align 8
  %tmp1 = load <1 x i64>, <1 x i64>* %xU64x1, align 8
  store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
  %tmp2 = load <1 x i64>, <1 x i64>* %__a, align 8
  %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
  %tmp4 = load <1 x i64>, <1 x i64>* %__b, align 8
  %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
  %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
  %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
  store <1 x i64> %vext, <1 x i64>* %xU64x1, align 8
  ret void
}
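
; A sketch of the <1 x i64> case without the memory traffic (hypothetical
; function, deliberately not covered by CHECK lines): a one-element shuffle
; with index 1 simply selects the second operand, so no ext is needed.
define <1 x i64> @test_vext_64_sketch(<1 x i64> %a, <1 x i64> %b) nounwind {
  %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 1>
  ret <1 x i64> %vext
}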

define void @test_vextq_s8() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s8:
  ; CHECK: {{ext.16.*#4}}
  %xS8x16 = alloca <16 x i8>, align 16
  %__a = alloca <16 x i8>, align 16
  %__b = alloca <16 x i8>, align 16
  %tmp = load <16 x i8>, <16 x i8>* %xS8x16, align 16
  store <16 x i8> %tmp, <16 x i8>* %__a, align 16
  %tmp1 = load <16 x i8>, <16 x i8>* %xS8x16, align 16
  store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
  %tmp2 = load <16 x i8>, <16 x i8>* %__a, align 16
  %tmp3 = load <16 x i8>, <16 x i8>* %__b, align 16
  %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  store <16 x i8> %vext, <16 x i8>* %xS8x16, align 16
  ret void
}

define void @test_vextq_u8() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u8:
  ; CHECK: {{ext.16.*#5}}
  %xU8x16 = alloca <16 x i8>, align 16
  %__a = alloca <16 x i8>, align 16
  %__b = alloca <16 x i8>, align 16
  %tmp = load <16 x i8>, <16 x i8>* %xU8x16, align 16
  store <16 x i8> %tmp, <16 x i8>* %__a, align 16
  %tmp1 = load <16 x i8>, <16 x i8>* %xU8x16, align 16
  store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
  %tmp2 = load <16 x i8>, <16 x i8>* %__a, align 16
  %tmp3 = load <16 x i8>, <16 x i8>* %__b, align 16
  %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
  store <16 x i8> %vext, <16 x i8>* %xU8x16, align 16
  ret void
}

define void @test_vextq_p8() nounwind ssp {
  ; CHECK-LABEL: test_vextq_p8:
  ; CHECK: {{ext.16.*#6}}
  %xP8x16 = alloca <16 x i8>, align 16
  %__a = alloca <16 x i8>, align 16
  %__b = alloca <16 x i8>, align 16
  %tmp = load <16 x i8>, <16 x i8>* %xP8x16, align 16
  store <16 x i8> %tmp, <16 x i8>* %__a, align 16
  %tmp1 = load <16 x i8>, <16 x i8>* %xP8x16, align 16
  store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
  %tmp2 = load <16 x i8>, <16 x i8>* %__a, align 16
  %tmp3 = load <16 x i8>, <16 x i8>* %__b, align 16
  %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
  store <16 x i8> %vext, <16 x i8>* %xP8x16, align 16
  ret void
}

define void @test_vextq_s16() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s16:
  ; CHECK: {{ext.16.*#14}}
  %xS16x8 = alloca <8 x i16>, align 16
  %__a = alloca <8 x i16>, align 16
  %__b = alloca <8 x i16>, align 16
  %tmp = load <8 x i16>, <8 x i16>* %xS16x8, align 16
  store <8 x i16> %tmp, <8 x i16>* %__a, align 16
  %tmp1 = load <8 x i16>, <8 x i16>* %xS16x8, align 16
  store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
  %tmp2 = load <8 x i16>, <8 x i16>* %__a, align 16
  %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
  %tmp4 = load <8 x i16>, <8 x i16>* %__b, align 16
  %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
  %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
  %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  store <8 x i16> %vext, <8 x i16>* %xS16x8, align 16
  ret void
}

define void @test_vextq_u16() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u16:
  ; CHECK: {{ext.16.*#8}}
  %xU16x8 = alloca <8 x i16>, align 16
  %__a = alloca <8 x i16>, align 16
  %__b = alloca <8 x i16>, align 16
  %tmp = load <8 x i16>, <8 x i16>* %xU16x8, align 16
  store <8 x i16> %tmp, <8 x i16>* %__a, align 16
  %tmp1 = load <8 x i16>, <8 x i16>* %xU16x8, align 16
  store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
  %tmp2 = load <8 x i16>, <8 x i16>* %__a, align 16
  %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
  %tmp4 = load <8 x i16>, <8 x i16>* %__b, align 16
  %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
  %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
  %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <8 x i16> %vext, <8 x i16>* %xU16x8, align 16
  ret void
}

define void @test_vextq_p16() nounwind ssp {
  ; CHECK-LABEL: test_vextq_p16:
  ; CHECK: {{ext.16.*#10}}
  %xP16x8 = alloca <8 x i16>, align 16
  %__a = alloca <8 x i16>, align 16
  %__b = alloca <8 x i16>, align 16
  %tmp = load <8 x i16>, <8 x i16>* %xP16x8, align 16
  store <8 x i16> %tmp, <8 x i16>* %__a, align 16
  %tmp1 = load <8 x i16>, <8 x i16>* %xP16x8, align 16
  store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
  %tmp2 = load <8 x i16>, <8 x i16>* %__a, align 16
  %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
  %tmp4 = load <8 x i16>, <8 x i16>* %__b, align 16
  %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
  %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
  %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
  store <8 x i16> %vext, <8 x i16>* %xP16x8, align 16
  ret void
}

define void @test_vextq_s32() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s32:
  ; CHECK: {{ext.16.*#4}}
  %xS32x4 = alloca <4 x i32>, align 16
  %__a = alloca <4 x i32>, align 16
  %__b = alloca <4 x i32>, align 16
  %tmp = load <4 x i32>, <4 x i32>* %xS32x4, align 16
  store <4 x i32> %tmp, <4 x i32>* %__a, align 16
  %tmp1 = load <4 x i32>, <4 x i32>* %xS32x4, align 16
  store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
  %tmp2 = load <4 x i32>, <4 x i32>* %__a, align 16
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = load <4 x i32>, <4 x i32>* %__b, align 16
  %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
  %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
  %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  store <4 x i32> %vext, <4 x i32>* %xS32x4, align 16
  ret void
}

define void @test_vextq_u32() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u32:
  ; CHECK: {{ext.16.*#8}}
  %xU32x4 = alloca <4 x i32>, align 16
  %__a = alloca <4 x i32>, align 16
  %__b = alloca <4 x i32>, align 16
  %tmp = load <4 x i32>, <4 x i32>* %xU32x4, align 16
  store <4 x i32> %tmp, <4 x i32>* %__a, align 16
  %tmp1 = load <4 x i32>, <4 x i32>* %xU32x4, align 16
  store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
  %tmp2 = load <4 x i32>, <4 x i32>* %__a, align 16
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = load <4 x i32>, <4 x i32>* %__b, align 16
  %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
  %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
  %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  store <4 x i32> %vext, <4 x i32>* %xU32x4, align 16
  ret void
}

define void @test_vextq_f32() nounwind ssp {
  ; CHECK-LABEL: test_vextq_f32:
  ; CHECK: {{ext.16.*#12}}
  %xF32x4 = alloca <4 x float>, align 16
  %__a = alloca <4 x float>, align 16
  %__b = alloca <4 x float>, align 16
  %tmp = load <4 x float>, <4 x float>* %xF32x4, align 16
  store <4 x float> %tmp, <4 x float>* %__a, align 16
  %tmp1 = load <4 x float>, <4 x float>* %xF32x4, align 16
  store <4 x float> %tmp1, <4 x float>* %__b, align 16
  %tmp2 = load <4 x float>, <4 x float>* %__a, align 16
  %tmp3 = bitcast <4 x float> %tmp2 to <16 x i8>
  %tmp4 = load <4 x float>, <4 x float>* %__b, align 16
  %tmp5 = bitcast <4 x float> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <4 x float>
  %tmp7 = bitcast <16 x i8> %tmp5 to <4 x float>
  %vext = shufflevector <4 x float> %tmp6, <4 x float> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  store <4 x float> %vext, <4 x float>* %xF32x4, align 16
  ret void
}

define void @test_vextq_s64() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s64:
  ; CHECK: {{ext.16.*#8}}
  %xS64x2 = alloca <2 x i64>, align 16
  %__a = alloca <2 x i64>, align 16
  %__b = alloca <2 x i64>, align 16
  %tmp = load <2 x i64>, <2 x i64>* %xS64x2, align 16
  store <2 x i64> %tmp, <2 x i64>* %__a, align 16
  %tmp1 = load <2 x i64>, <2 x i64>* %xS64x2, align 16
  store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
  %tmp2 = load <2 x i64>, <2 x i64>* %__a, align 16
  %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
  %tmp4 = load <2 x i64>, <2 x i64>* %__b, align 16
  %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
  %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
  %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i64> %vext, <2 x i64>* %xS64x2, align 16
  ret void
}

define void @test_vextq_u64() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u64:
  ; CHECK: {{ext.16.*#8}}
  %xU64x2 = alloca <2 x i64>, align 16
  %__a = alloca <2 x i64>, align 16
  %__b = alloca <2 x i64>, align 16
  %tmp = load <2 x i64>, <2 x i64>* %xU64x2, align 16
  store <2 x i64> %tmp, <2 x i64>* %__a, align 16
  %tmp1 = load <2 x i64>, <2 x i64>* %xU64x2, align 16
  store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
  %tmp2 = load <2 x i64>, <2 x i64>* %__a, align 16
  %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
  %tmp4 = load <2 x i64>, <2 x i64>* %__b, align 16
  %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
  %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
  %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i64> %vext, <2 x i64>* %xU64x2, align 16
  ret void
}

; Shuffles with an undef second operand can also use an EXT, so long as the
; indices wrap around and stay sequential.
; rdar://12051674
define <16 x i8> @vext1(<16 x i8> %_a) nounwind {
; CHECK-LABEL: vext1:
; CHECK: ext.16b  v0, v0, v0, #8
  %vext = shufflevector <16 x i8> %_a, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %vext
}
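
; With an undef second operand the same register can feed both EXT inputs,
; so the wrap-around mask above becomes an 8-byte rotation of v0. A sketch
; of the analogous 4-byte rotation (hypothetical function, deliberately not
; covered by CHECK lines), which should select an ext with #4:
define <16 x i8> @vext1_rot4_sketch(<16 x i8> %_a) nounwind {
  %vext = shufflevector <16 x i8> %_a, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %vext
}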

; <rdar://problem/12212062>
define <2 x i64> @vext2(<2 x i64> %p0, <2 x i64> %p1) nounwind readnone ssp {
entry:
; CHECK-LABEL: vext2:
; CHECK: ext.16b v1, v1, v1, #8
; CHECK: ext.16b v0, v0, v0, #8
; CHECK: add.2d  v0, v0, v1
  %t0 = shufflevector <2 x i64> %p1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %t1 = shufflevector <2 x i64> %p0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %t2 = add <2 x i64> %t1, %t0
  ret <2 x i64> %t2
}
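
; In vext2 each <2 x i64> element swap is itself a wrap-around sequential
; mask starting at element 1 of 8-byte elements, so each operand is rotated
; by half the register (ext #8) before the add, matching the checks above.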