Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
      4 
      5 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
      6 ; AVX1-LABEL: shuffle_v8f32_45670123:
      7 ; AVX1:       # %bb.0: # %entry
      8 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
      9 ; AVX1-NEXT:    retq
     10 ;
     11 ; AVX2-LABEL: shuffle_v8f32_45670123:
     12 ; AVX2:       # %bb.0: # %entry
     13 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
     14 ; AVX2-NEXT:    retq
     15 entry:
          ; Mask <4..7, 0..3> reads only %a (indices 0-7) and swaps its two
          ; 128-bit halves; %b is never selected.
     16   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
     17   ret <8 x float> %shuffle
     18 }
     19 
     20 define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
     21 ; AVX1-LABEL: shuffle_v8f32_45670123_mem:
     22 ; AVX1:       # %bb.0: # %entry
     23 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
     24 ; AVX1-NEXT:    retq
     25 ;
     26 ; AVX2-LABEL: shuffle_v8f32_45670123_mem:
     27 ; AVX2:       # %bb.0: # %entry
     28 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,3,0,1]
     29 ; AVX2-NEXT:    retq
     30 entry:
          ; The function is readonly rather than readnone because it loads
          ; through its pointer arguments.
     31   %a = load <8 x float>, <8 x float>* %pa
     32   %b = load <8 x float>, <8 x float>* %pb
          ; Mask <4..7, 0..3> swaps the two 128-bit halves of the loaded %a;
          ; the loaded %b is never selected by the mask.
     33   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
     34   ret <8 x float> %shuffle
     35 }
     36 
     37 define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
     38 ; ALL-LABEL: shuffle_v8f32_0123cdef:
     39 ; ALL:       # %bb.0: # %entry
     40 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
     41 ; ALL-NEXT:    retq
     42 entry:
          ; Mask <0..3, 12..15> keeps the low 128-bit half of %a and takes the
          ; high 128-bit half of %b (indices 8-15 address %b).
     43   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
     44   ret <8 x float> %shuffle
     45 }
     46 
     47 define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
     48 ; AVX1-LABEL: shuffle_v8f32_01230123:
     49 ; AVX1:       # %bb.0: # %entry
     50 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
     51 ; AVX1-NEXT:    retq
     52 ;
     53 ; AVX2-LABEL: shuffle_v8f32_01230123:
     54 ; AVX2:       # %bb.0: # %entry
     55 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
     56 ; AVX2-NEXT:    retq
     57 entry:
          ; Mask <0..3, 0..3> duplicates the low 128-bit half of %a into both
          ; halves of the result; %b is never selected.
     58   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
     59   ret <8 x float> %shuffle
     60 }
     61 
     62 define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
     63 ; AVX1-LABEL: shuffle_v8f32_01230123_mem:
     64 ; AVX1:       # %bb.0: # %entry
     65 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
     66 ; AVX1-NEXT:    retq
     67 ;
     68 ; AVX2-LABEL: shuffle_v8f32_01230123_mem:
     69 ; AVX2:       # %bb.0: # %entry
     70 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
     71 ; AVX2-NEXT:    retq
     72 entry:
          ; The function is readonly rather than readnone because it loads
          ; through its pointer arguments.
     73   %a = load <8 x float>, <8 x float>* %pa
     74   %b = load <8 x float>, <8 x float>* %pb
          ; Mask <0..3, 0..3> duplicates the low 128-bit half of the loaded %a;
          ; the loaded %b is never selected by the mask.
     75   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
     76   ret <8 x float> %shuffle
     77 }
     78 
     79 define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
     80 ; AVX1-LABEL: shuffle_v8f32_45674567:
     81 ; AVX1:       # %bb.0: # %entry
     82 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
     83 ; AVX1-NEXT:    retq
     84 ;
     85 ; AVX2-LABEL: shuffle_v8f32_45674567:
     86 ; AVX2:       # %bb.0: # %entry
     87 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
     88 ; AVX2-NEXT:    retq
     89 entry:
          ; Mask <4..7, 4..7> duplicates the high 128-bit half of %a into both
          ; halves of the result; %b is never selected.
     90   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
     91   ret <8 x float> %shuffle
     92 }
     93 
     94 define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
     95 ; AVX1-LABEL: shuffle_v8f32_45674567_mem:
     96 ; AVX1:       # %bb.0: # %entry
     97 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
     98 ; AVX1-NEXT:    retq
     99 ;
    100 ; AVX2-LABEL: shuffle_v8f32_45674567_mem:
    101 ; AVX2:       # %bb.0: # %entry
    102 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,3,2,3]
    103 ; AVX2-NEXT:    retq
    104 entry:
          ; The function is readonly rather than readnone because it loads
          ; through its pointer arguments.
    105   %a = load <8 x float>, <8 x float>* %pa
    106   %b = load <8 x float>, <8 x float>* %pb
          ; Mask <4..7, 4..7> duplicates the high 128-bit half of the loaded %a;
          ; the loaded %b is never selected by the mask.
    107   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
    108   ret <8 x float> %shuffle
    109 }
    110 
    111 define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
    112 ; AVX1-LABEL: shuffle_v32i8_2323:
    113 ; AVX1:       # %bb.0: # %entry
    114 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
    115 ; AVX1-NEXT:    retq
    116 ;
    117 ; AVX2-LABEL: shuffle_v32i8_2323:
    118 ; AVX2:       # %bb.0: # %entry
    119 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
    120 ; AVX2-NEXT:    retq
    121 entry:
          ; Mask <16..31, 16..31> duplicates the high 128-bit half (bytes 16-31)
          ; of %a into both halves of the result; %b is never selected.
    122   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    123   ret <32 x i8> %shuffle
    124 }
    125 
    126 define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
    127 ; AVX1-LABEL: shuffle_v32i8_2323_domain:
    128 ; AVX1:       # %bb.0: # %entry
    129 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    130 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
    131 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
    132 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    133 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
    134 ; AVX1-NEXT:    retq
    135 ;
    136 ; AVX2-LABEL: shuffle_v32i8_2323_domain:
    137 ; AVX2:       # %bb.0: # %entry
    138 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    139 ; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
    140 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
    141 ; AVX2-NEXT:    retq
    142 entry:
    143   ; add forces execution domain
    144   %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
          ; Mask <16..31, 16..31> duplicates the high 128-bit half of the
          ; incremented vector %a2; %b is never selected.
    145   %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    146   ret <32 x i8> %shuffle
    147 }
    148 
    149 define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
    150 ; ALL-LABEL: shuffle_v4i64_6701:
    151 ; ALL:       # %bb.0: # %entry
    152 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
    153 ; ALL-NEXT:    retq
    154 entry:
          ; Mask <6,7,0,1> places the high 128-bit half of %b (indices 4-7
          ; address %b) in the low half and the low half of %a in the high half.
    155   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
    156   ret <4 x i64> %shuffle
    157 }
    158 
    159 define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
    160 ; AVX1-LABEL: shuffle_v4i64_6701_domain:
    161 ; AVX1:       # %bb.0: # %entry
    162 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    163 ; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
    164 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
    165 ; AVX1-NEXT:    retq
    166 ;
    167 ; AVX2-LABEL: shuffle_v4i64_6701_domain:
    168 ; AVX2:       # %bb.0: # %entry
    169 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    170 ; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
    171 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
    172 ; AVX2-NEXT:    retq
    173 entry:
    174   ; add forces execution domain
    175   %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
          ; Mask <6,7,0,1>: high 128-bit half of %b, then low half of the
          ; incremented vector %a2.
    176   %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
    177   ret <4 x i64> %shuffle
    178 }
    179 
    180 define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
    181 ; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
    182 ; AVX1:       # %bb.0: # %entry
    183 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    184 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    185 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
    186 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    187 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
    188 ; AVX1-NEXT:    retq
    189 ;
    190 ; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
    191 ; AVX2:       # %bb.0: # %entry
    192 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    193 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
    194 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
    195 ; AVX2-NEXT:    retq
    196 entry:
    197   ; add forces execution domain
    198   %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
          ; With the undef lanes filled as 4 and 6 the mask is <4..7, 12..15>:
          ; high 128-bit half of %a2 followed by high 128-bit half of %b.
    199   %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
    200   ret <8 x i32> %shuffle
    201 }
    202 
    203 define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
    204 ; AVX1-LABEL: shuffle_v16i16_4501:
    205 ; AVX1:       # %bb.0: # %entry
    206 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    207 ; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
    208 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
    209 ; AVX1-NEXT:    retq
    210 ;
    211 ; AVX2-LABEL: shuffle_v16i16_4501:
    212 ; AVX2:       # %bb.0: # %entry
    213 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    214 ; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
    215 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
    216 ; AVX2-NEXT:    retq
    217 entry:
    218   ; add forces execution domain
    219   %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
          ; Mask <16..23, 0..7>: low 128-bit half of %b (indices 16-31 address
          ; %b) followed by the low 128-bit half of %a2.
    220   %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    221   ret <16 x i16> %shuffle
    222 }
    223 
    224 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readonly ssp {
    225 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
    226 ; AVX1:       # %bb.0: # %entry
    227 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    228 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
    229 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
    230 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
    231 ; AVX1-NEXT:    retq
    232 ;
    233 ; AVX2-LABEL: shuffle_v16i16_4501_mem:
    234 ; AVX2:       # %bb.0: # %entry
    235 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    236 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    237 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
    238 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
    239 ; AVX2-NEXT:    retq
    240 entry:
          ; The function is readonly rather than readnone because it loads
          ; through its pointer arguments.
    241   %c = load <16 x i16>, <16 x i16>* %a
    242   %d = load <16 x i16>, <16 x i16>* %b
          ; Integer add applied to the first loaded vector before the shuffle.
    243   %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
          ; Mask <16..23, 0..7>: low 128-bit half of the loaded %d followed by
          ; the low 128-bit half of %c2.
    244   %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    245   ret <16 x i16> %shuffle
    246 }
    247 
    248 ;;;; Cases with undef indices mixed in the mask
    249 
    250 define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    251 ; ALL-LABEL: shuffle_v8f32_uu67u9ub:
    252 ; ALL:       # %bb.0: # %entry
    253 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
    254 ; ALL-NEXT:    retq
    255 entry:
          ; The undef lanes can be filled to give <4..7, 8..11>: high 128-bit
          ; half of %a followed by the low 128-bit half of %b.
    256   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
    257   ret <8 x float> %shuffle
    258 }
    259 
    260 define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    261 ; AVX1-LABEL: shuffle_v8f32_uu67uu67:
    262 ; AVX1:       # %bb.0: # %entry
    263 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
    264 ; AVX1-NEXT:    retq
    265 ;
    266 ; AVX2-LABEL: shuffle_v8f32_uu67uu67:
    267 ; AVX2:       # %bb.0: # %entry
    268 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
    269 ; AVX2-NEXT:    retq
    270 entry:
          ; Only elements 6 and 7 of %a are required, in lanes 2-3 and 6-7; the
          ; remaining lanes are undef and %b is never selected.
    271   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
    272   ret <8 x float> %shuffle
    273 }
    274 
    275 define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    276 ; ALL-LABEL: shuffle_v8f32_uu67uuab:
    277 ; ALL:       # %bb.0: # %entry
    278 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
    279 ; ALL-NEXT:    retq
    280 entry:
          ; The undef lanes can be filled to give <4..7, 8..11>: high 128-bit
          ; half of %a followed by the low 128-bit half of %b.
    281   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
    282   ret <8 x float> %shuffle
    283 }
    284 
    285 define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    286 ; ALL-LABEL: shuffle_v8f32_uu67uuef:
    287 ; ALL:       # %bb.0: # %entry
    288 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
    289 ; ALL-NEXT:    retq
    290 entry:
          ; The undef lanes can be filled to give <4..7, 12..15>: high 128-bit
          ; half of %a followed by the high 128-bit half of %b.
    291   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
    292   ret <8 x float> %shuffle
    293 }
    294 
    295 define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    296 ; AVX1-LABEL: shuffle_v8f32_uu674567:
    297 ; AVX1:       # %bb.0: # %entry
    298 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
    299 ; AVX1-NEXT:    retq
    300 ;
    301 ; AVX2-LABEL: shuffle_v8f32_uu674567:
    302 ; AVX2:       # %bb.0: # %entry
    303 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
    304 ; AVX2-NEXT:    retq
    305 entry:
          ; The high result half is the high 128-bit half of %a; the low result
          ; half needs only elements 6 and 7 of %a (lanes 0-1 are undef).
    306   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
    307   ret <8 x float> %shuffle
    308 }
    309 
    310 define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    311 ; ALL-LABEL: shuffle_v8f32_uu6789ab:
    312 ; ALL:       # %bb.0: # %entry
    313 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
    314 ; ALL-NEXT:    retq
    315 entry:
          ; The undef lanes can be filled to give <4..7, 8..11>: high 128-bit
          ; half of %a followed by the low 128-bit half of %b.
    316   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    317   ret <8 x float> %shuffle
    318 }
    319 
    320 define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    321 ; AVX1-LABEL: shuffle_v8f32_4567uu67:
    322 ; AVX1:       # %bb.0: # %entry
    323 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
    324 ; AVX1-NEXT:    retq
    325 ;
    326 ; AVX2-LABEL: shuffle_v8f32_4567uu67:
    327 ; AVX2:       # %bb.0: # %entry
    328 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
    329 ; AVX2-NEXT:    retq
    330 entry:
          ; The undef lanes can be filled to give <4..7, 4..7>: the high 128-bit
          ; half of %a duplicated into both result halves.
    331   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
    332   ret <8 x float> %shuffle
    333 }
    334 
    335 define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    336 ; ALL-LABEL: shuffle_v8f32_4567uuef:
    337 ; ALL:       # %bb.0: # %entry
    338 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
    339 ; ALL-NEXT:    retq
    340 entry:
          ; The undef lanes can be filled to give <4..7, 12..15>: high 128-bit
          ; half of %a followed by the high 128-bit half of %b.
    341   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
    342   ret <8 x float> %shuffle
    343 }
    344 
    345 ;;;; Cases where we must not select vperm2f128 alone (an in-lane permute is also needed)
    346 
    347 define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
    348 ; ALL-LABEL: shuffle_v8f32_uu67ucuf:
    349 ; ALL:       # %bb.0: # %entry
    350 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
    351 ; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
    352 ; ALL-NEXT:    retq
    353 entry:
          ; Lane 5 requests index 12, whereas a whole-half copy of %b's high
          ; half would put index 13 there, so no filling of the undef lanes
          ; reduces this mask to a pure 128-bit half selection.
    354   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
    355   ret <8 x float> %shuffle
    356 }
    357 
    358 ;; Test zero mask generation.
    359 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
    360 ;; Prefer xor+vblendpd over vperm2f128 because the xor+blend sequence has better performance.
    361 ;; TODO: When building for optsize we should use vperm2f128.
    362 
    363 define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
    364 ; ALL-LABEL: shuffle_v4f64_zz01:
    365 ; ALL:       # %bb.0:
    366 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
    367 ; ALL-NEXT:    retq
          ; Result is <0.0, 0.0, a[0], a[1]>: indices 4,5 pick the two zero
          ; elements of the constant operand, indices 0,1 the low half of %a.
    368   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
    369   ret <4 x double> %s
    370 }
    371 define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
    372 ; ALL-LABEL: shuffle_v4f64_zz01_optsize:
    373 ; ALL:       # %bb.0:
    374 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
    375 ; ALL-NEXT:    retq
          ; Result is <0.0, 0.0, a[0], a[1]>, compiled with the optsize
          ; attribute: indices 4,5 pick the constant's zeros, 0,1 the low half of %a.
    376   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
    377   ret <4 x double> %s
    378 }
    379 
    380 define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
    381 ; ALL-LABEL: shuffle_v4f64_zz23:
    382 ; ALL:       # %bb.0:
    383 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    384 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
    385 ; ALL-NEXT:    retq
          ; Result is <0.0, 0.0, a[2], a[3]>: indices 4,5 pick the two zero
          ; elements of the constant operand, indices 2,3 keep the high half of %a.
    386   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
    387   ret <4 x double> %s
    388 }
    389 define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
    390 ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
    391 ; ALL:       # %bb.0:
    392 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    393 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
    394 ; ALL-NEXT:    retq
          ; Result is <0.0, 0.0, a[2], a[3]>, compiled with the optsize
          ; attribute: indices 4,5 pick the constant's zeros, 2,3 the high half of %a.
    395   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
    396   ret <4 x double> %s
    397 }
    398 
    399 define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
    400 ; ALL-LABEL: shuffle_v4f64_zz45:
    401 ; ALL:       # %bb.0:
    402 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
    403 ; ALL-NEXT:    retq
          ; The constant is the first operand here. Result is <0.0, 0.0, a[0], a[1]>:
          ; indices 0,1 pick the constant's zeros, indices 4,5 the low half of %a.
    404   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    405   ret <4 x double> %s
    406 }
    407 define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
    408 ; ALL-LABEL: shuffle_v4f64_zz45_optsize:
    409 ; ALL:       # %bb.0:
    410 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
    411 ; ALL-NEXT:    retq
          ; Constant-first operand order with the optsize attribute. Result is
          ; <0.0, 0.0, a[0], a[1]>: indices 0,1 pick zeros, 4,5 the low half of %a.
    412   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    413   ret <4 x double> %s
    414 }
    415 
    416 define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
    417 ; ALL-LABEL: shuffle_v4f64_zz67:
    418 ; ALL:       # %bb.0:
    419 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    420 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
    421 ; ALL-NEXT:    retq
          ; The constant is the first operand here. Result is <0.0, 0.0, a[2], a[3]>:
          ; indices 0,1 pick the constant's zeros, indices 6,7 the high half of %a.
    422   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
    423   ret <4 x double> %s
    424 }
    425 define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
    426 ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
    427 ; ALL:       # %bb.0:
    428 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    429 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
    430 ; ALL-NEXT:    retq
          ; Constant-first operand order with the optsize attribute. Result is
          ; <0.0, 0.0, a[2], a[3]>: indices 0,1 pick zeros, 6,7 the high half of %a.
    431   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
    432   ret <4 x double> %s
    433 }
    434 
    435 define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
    436 ; ALL-LABEL: shuffle_v4f64_01zz:
    437 ; ALL:       # %bb.0:
    438 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
    439 ; ALL-NEXT:    retq
          ; Result is <a[0], a[1], 0.0, 0.0>: indices 0,1 keep the low half of
          ; %a, indices 4,5 pick the two zero elements of the constant operand.
    440   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    441   ret <4 x double> %s
    442 }
    443 define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
    444 ; ALL-LABEL: shuffle_v4f64_01zz_optsize:
    445 ; ALL:       # %bb.0:
    446 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
    447 ; ALL-NEXT:    retq
          ; Result is <a[0], a[1], 0.0, 0.0>, compiled with the optsize
          ; attribute: indices 0,1 keep the low half of %a, 4,5 pick the constant's zeros.
    448   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    449   ret <4 x double> %s
    450 }
    451 
    452 define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
    453 ; ALL-LABEL: shuffle_v4f64_23zz:
    454 ; ALL:       # %bb.0:
    455 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
    456 ; ALL-NEXT:    retq
          ; Result is <a[2], a[3], 0.0, 0.0>: indices 2,3 move the high half of
          ; %a down, indices 4,5 pick the two zero elements of the constant operand.
    457   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
    458   ret <4 x double> %s
    459 }
    460 define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
    461 ; ALL-LABEL: shuffle_v4f64_23zz_optsize:
    462 ; ALL:       # %bb.0:
    463 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
    464 ; ALL-NEXT:    retq
          ; Result is <a[2], a[3], 0.0, 0.0>, compiled with the optsize
          ; attribute: indices 2,3 pick the high half of %a, 4,5 the constant's zeros.
    465   %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
    466   ret <4 x double> %s
    467 }
    468 
    469 define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
    470 ; ALL-LABEL: shuffle_v4f64_45zz:
    471 ; ALL:       # %bb.0:
    472 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
    473 ; ALL-NEXT:    retq
          ; The constant is the first operand here. Result is <a[0], a[1], 0.0, 0.0>:
          ; indices 4,5 pick the low half of %a, indices 0,1 the constant's zeros.
    474   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
    475   ret <4 x double> %s
    476 }
    477 define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
    478 ; ALL-LABEL: shuffle_v4f64_45zz_optsize:
    479 ; ALL:       # %bb.0:
    480 ; ALL-NEXT:    vmovaps %xmm0, %xmm0
    481 ; ALL-NEXT:    retq
          ; Constant-first operand order with the optsize attribute. Result is
          ; <a[0], a[1], 0.0, 0.0>: indices 4,5 pick the low half of %a, 0,1 zeros.
    482   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
    483   ret <4 x double> %s
    484 }
    485 
    486 define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
    487 ; ALL-LABEL: shuffle_v4f64_67zz:
    488 ; ALL:       # %bb.0:
    489 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
    490 ; ALL-NEXT:    retq
          ; The constant is the first operand here. Result is <a[2], a[3], 0.0, 0.0>:
          ; indices 6,7 pick the high half of %a, indices 0,1 the constant's zeros.
    491   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
    492   ret <4 x double> %s
    493 }
    494 define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
    495 ; ALL-LABEL: shuffle_v4f64_67zz_optsize:
    496 ; ALL:       # %bb.0:
    497 ; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
    498 ; ALL-NEXT:    retq
          ; Constant-first operand order with the optsize attribute. Result is
          ; <a[2], a[3], 0.0, 0.0>: indices 6,7 pick the high half of %a, 0,1 zeros.
    499   %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
    500   ret <4 x double> %s
    501 }
    502 
    503 ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
    504 
    505 define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
    506 ; AVX1-LABEL: shuffle_v4i64_67zz:
    507 ; AVX1:       # %bb.0:
    508 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    509 ; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
    510 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
    511 ; AVX1-NEXT:    retq
    512 ;
    513 ; AVX2-LABEL: shuffle_v4i64_67zz:
    514 ; AVX2:       # %bb.0:
    515 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
    516 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
    517 ; AVX2-NEXT:    retq
          ; The constant is the first operand. %s is <a[2], a[3], 0, 0>: indices
          ; 6,7 pick the high half of %a, indices 0,1 the constant's zeros.
    518   %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
          ; Integer add consuming the shuffle result.
    519   %c = add <4 x i64> %b, %s
    520   ret <4 x i64> %c
    521 }
    522 
    523 ;;; Memory folding cases
    524 
    525 define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readonly ssp {
    526 ; AVX1-LABEL: ld0_hi0_lo1_4f64:
    527 ; AVX1:       # %bb.0: # %entry
    528 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    529 ; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
    530 ; AVX1-NEXT:    retq
    531 ;
    532 ; AVX2-LABEL: ld0_hi0_lo1_4f64:
    533 ; AVX2:       # %bb.0: # %entry
    534 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    535 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
    536 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
    537 ; AVX2-NEXT:    retq
    538 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pa.
    539   %a = load <4 x double>, <4 x double> * %pa
          ; Mask <2,3,4,5>: high 128-bit half of the loaded %a followed by the
          ; low 128-bit half of %b.
    540   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
          ; Floating-point add of 1.0 to every element of the shuffle result.
    541   %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
    542   ret <4 x double> %res
    543 }
    544 
    545 define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readonly ssp {
    546 ; AVX1-LABEL: ld1_hi0_hi1_4f64:
    547 ; AVX1:       # %bb.0: # %entry
    548 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    549 ; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
    550 ; AVX1-NEXT:    retq
    551 ;
    552 ; AVX2-LABEL: ld1_hi0_hi1_4f64:
    553 ; AVX2:       # %bb.0: # %entry
    554 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    555 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
    556 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
    557 ; AVX2-NEXT:    retq
    558 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pb.
    559   %b = load <4 x double>, <4 x double> * %pb
          ; Mask <2,3,6,7>: high 128-bit half of %a followed by the high
          ; 128-bit half of the loaded %b.
    560   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
          ; Floating-point add of 1.0 to every element of the shuffle result.
    561   %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
    562   ret <4 x double> %res
    563 }
    564 
    565 define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readonly ssp {
    566 ; AVX1-LABEL: ld0_hi0_lo1_8f32:
    567 ; AVX1:       # %bb.0: # %entry
    568 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    569 ; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
    570 ; AVX1-NEXT:    retq
    571 ;
    572 ; AVX2-LABEL: ld0_hi0_lo1_8f32:
    573 ; AVX2:       # %bb.0: # %entry
    574 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    575 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
    576 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    577 ; AVX2-NEXT:    retq
    578 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pa.
    579   %a = load <8 x float>, <8 x float> * %pa
          ; Mask <4..7, 8..11>: high 128-bit half of the loaded %a followed by
          ; the low 128-bit half of %b.
    580   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
          ; Floating-point add of 1.0 to every element of the shuffle result.
    581   %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
    582   ret <8 x float> %res
    583 }
    584 
    585 define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readonly ssp {
    586 ; AVX1-LABEL: ld1_hi0_hi1_8f32:
    587 ; AVX1:       # %bb.0: # %entry
    588 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    589 ; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
    590 ; AVX1-NEXT:    retq
    591 ;
    592 ; AVX2-LABEL: ld1_hi0_hi1_8f32:
    593 ; AVX2:       # %bb.0: # %entry
    594 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    595 ; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
    596 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    597 ; AVX2-NEXT:    retq
    598 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pb.
    599   %b = load <8 x float>, <8 x float> * %pb
          ; Mask <4..7, 12..15>: high 128-bit half of %a followed by the high
          ; 128-bit half of the loaded %b.
    600   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
          ; Floating-point add of 1.0 to every element of the shuffle result.
    601   %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
    602   ret <8 x float> %res
    603 }
    604 
    605 define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readonly ssp {
    606 ; AVX1-LABEL: ld0_hi0_lo1_4i64:
    607 ; AVX1:       # %bb.0: # %entry
    608 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    609 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
    610 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    611 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
    612 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
    613 ; AVX1-NEXT:    retq
    614 ;
    615 ; AVX2-LABEL: ld0_hi0_lo1_4i64:
    616 ; AVX2:       # %bb.0: # %entry
    617 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    618 ; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
    619 ; AVX2-NEXT:    retq
    620 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pa.
    621   %a = load <4 x i64>, <4 x i64> * %pa
          ; Mask <2,3,4,5>: high 128-bit half of the loaded %a followed by the
          ; low 128-bit half of %b.
    622   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
          ; Integer add of the constant <1,2,3,4> to the shuffle result.
    623   %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
    624   ret <4 x i64> %res
    625 }
    626 
    627 define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readonly ssp {
    628 ; AVX1-LABEL: ld1_hi0_hi1_4i64:
    629 ; AVX1:       # %bb.0: # %entry
    630 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    631 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
    632 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    633 ; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
    634 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
    635 ; AVX1-NEXT:    retq
    636 ;
    637 ; AVX2-LABEL: ld1_hi0_hi1_4i64:
    638 ; AVX2:       # %bb.0: # %entry
    639 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    640 ; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
    641 ; AVX2-NEXT:    retq
    642 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pb.
    643   %b = load <4 x i64>, <4 x i64> * %pb
          ; Mask <2,3,6,7>: high 128-bit half of %a followed by the high
          ; 128-bit half of the loaded %b.
    644   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
          ; Integer add of the constant <1,2,3,4> to the shuffle result.
    645   %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
    646   ret <4 x i64> %res
    647 }
    648 
    649 define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readonly ssp {
    650 ; AVX1-LABEL: ld0_hi0_lo1_8i32:
    651 ; AVX1:       # %bb.0: # %entry
    652 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    653 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    654 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
    655 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    656 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    657 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    658 ; AVX1-NEXT:    retq
    659 ;
    660 ; AVX2-LABEL: ld0_hi0_lo1_8i32:
    661 ; AVX2:       # %bb.0: # %entry
    662 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
    663 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
    664 ; AVX2-NEXT:    retq
    665 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pa.
    666   %a = load <8 x i32>, <8 x i32> * %pa
          ; Mask <4..7, 8..11>: high 128-bit half of the loaded %a followed by
          ; the low 128-bit half of %b.
    667   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
          ; Integer add of the constant <1,2,3,4,1,2,3,4> to the shuffle result.
    668   %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
    669   ret <8 x i32> %res
    670 }
    671 
    672 define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readonly ssp {
    673 ; AVX1-LABEL: ld1_hi0_hi1_8i32:
    674 ; AVX1:       # %bb.0: # %entry
    675 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    676 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    677 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
    678 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    679 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    680 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    681 ; AVX1-NEXT:    retq
    682 ;
    683 ; AVX2-LABEL: ld1_hi0_hi1_8i32:
    684 ; AVX2:       # %bb.0: # %entry
    685 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    686 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
    687 ; AVX2-NEXT:    retq
    688 entry:
          ; The function is readonly rather than readnone because it loads
          ; through %pb.
    689   %b = load <8 x i32>, <8 x i32> * %pb
          ; Mask <4..7, 12..15>: high 128-bit half of %a followed by the high
          ; 128-bit half of the loaded %b.
    690   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
          ; Integer add of the constant <1,2,3,4,1,2,3,4> to the shuffle result.
    691   %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
    692   ret <8 x i32> %res
    693 }
    694