; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
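; These tests exercise lowering of 256-bit shuffles that move whole 128-bit
; lanes to vperm2f128/vperm2i128 (or to blend/insert sequences where those are
; preferred) on AVX and AVX2.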

define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v32i8_2323:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovaps (%rsi), %ymm1
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases where we must not select vperm2f128

define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
;; TODO: When building for optsize we should use vperm2f128.
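;; (vperm2f128 can also zero a whole 128-bit destination lane via bits 3 and 7
;; of its immediate, which is why the zz01/zz45/23zz/67zz cases below still
;; lower to a single instruction.)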

define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2, select the integer version of the instruction. Use an add to force the domain selection.

define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1:       ## BB#0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2:       ## BB#0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases

define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}