; (code-viewer navigation residue — "Home | History | Annotate | Download | only in X86" — not part of the original test file)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
      3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
      4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
      5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
      6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
     10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
     11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ
     12 
     13 ;
     14 ; Subvector Load + Broadcast
     15 ;
     16 
; Broadcast a 128-bit <2 x double> load into both halves of a 256-bit result.
; Expected lowering: a single vbroadcastf128 from memory on all RUN configs.
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}
     32 
; Broadcast a 128-bit <2 x double> load into all four quarters of a 512-bit result.
; AVX/AVX2 (no 512-bit regs): vbroadcastf128 plus a ymm copy for the upper half.
; AVX512: a single vbroadcastf32x4 into zmm0.
define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %2
}
     61 
; Broadcast a 256-bit <4 x double> load into both halves of a 512-bit result.
; AVX/AVX2: plain ymm load plus a register copy (no 256->512 broadcast available).
; AVX512: a single vbroadcastf64x4 into zmm0.
define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x double>, <4 x double> *%p
 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %2
}
     90 
; Broadcast a 128-bit <2 x i64> load into both halves of a 256-bit result.
; AVX1 uses the float-domain vbroadcastf128; AVX512 configs use the integer-domain
; vbroadcasti128 (AVX2+ form). Both are a single load-broadcast instruction.
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_4i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}
    117 
; Broadcast a 128-bit <2 x i64> load into all four quarters of a 512-bit result.
; AVX/AVX2: vbroadcastf128 plus a ymm copy; AVX512: a single vbroadcasti32x4.
define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %2
}
    146 
; Broadcast a 256-bit <4 x i64> load into both halves of a 512-bit result.
; AVX/AVX2: ymm load plus register copy; AVX512: a single vbroadcasti64x4.
define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i64>, <4 x i64> *%p
 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %2
}
    175 
; Broadcast a 128-bit <4 x float> load into both halves of a 256-bit result.
; Expected lowering: a single vbroadcastf128 from memory on all RUN configs.
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}
    191 
; Broadcast a 128-bit <4 x float> load into all four quarters of a 512-bit result.
; AVX/AVX2: vbroadcastf128 plus a ymm copy; AVX512: a single vbroadcastf32x4.
define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %2
}
    220 
; Broadcast a 256-bit <8 x float> load into both halves of a 512-bit result.
; AVX/AVX2: ymm load plus register copy; AVX512: a single vbroadcastf64x4.
define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x float>, <8 x float> *%p
 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %2
}
    249 
; Broadcast a 128-bit <4 x i32> load into both halves of a 256-bit result.
; AVX1: float-domain vbroadcastf128; AVX512 configs: integer-domain vbroadcasti128.
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_8i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}
    276 
; Broadcast a 128-bit <4 x i32> load into all four quarters of a 512-bit result.
; AVX/AVX2: vbroadcastf128 plus a ymm copy; AVX512: a single vbroadcasti32x4.
define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %2
}
    305 
; Broadcast a 256-bit <8 x i32> load into both halves of a 512-bit result.
; AVX/AVX2: ymm load plus register copy; AVX512: a single vbroadcasti64x4.
define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i32>, <8 x i32> *%p
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %2
}
    334 
; Broadcast a 128-bit <8 x i16> load into both halves of a 256-bit result.
; AVX1: float-domain vbroadcastf128; AVX512 configs: integer-domain vbroadcasti128.
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}
    361 
; Broadcast a 128-bit <8 x i16> load into all four quarters of a 512-bit result.
; Only AVX512BW has native 512-bit i16 vectors, so it emits one vbroadcasti32x4;
; AVX512F/AVX512DQ split into a ymm vbroadcasti128 plus a ymm copy, and plain
; AVX/AVX2 use the float-domain vbroadcastf128 plus a copy.
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %2
}
    416 
; Broadcast a 256-bit <16 x i16> load into both halves of a 512-bit result.
; AVX512BW (native 512-bit i16) emits one vbroadcasti64x4; all other configs,
; including AVX512F/AVX512DQ, fall back to a ymm load plus a register copy.
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <16 x i16>, <16 x i16> *%p
 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %2
}
    471 
; Broadcast a 128-bit <16 x i8> load into both halves of a 256-bit result.
; AVX1: float-domain vbroadcastf128; AVX512 configs: integer-domain vbroadcasti128.
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}
    498 
; Broadcast a 128-bit <16 x i8> load into all four quarters of a 512-bit result.
; AVX512BW (native 512-bit i8) emits one vbroadcasti32x4; AVX512F/AVX512DQ use a
; ymm vbroadcasti128 plus a copy; plain AVX/AVX2 use vbroadcastf128 plus a copy.
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_64i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %2
}
    553 
; Broadcast a 256-bit <32 x i8> load into both halves of a 512-bit result.
; AVX512BW (native 512-bit i8) emits one vbroadcasti64x4; all other configs,
; including AVX512F/AVX512DQ, fall back to a ymm load plus a register copy.
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_32i8_64i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <32 x i8>, <32 x i8> *%p
 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %2
}
    608 
    609 ;
    610 ; Subvector Load + Broadcast + Store
    611 ;
    612 
; Load <2 x double>, store it back out, then broadcast to 256 bits. Because the
; loaded xmm value is reused by the store, the lowering keeps it in a register
; and uses vinsertf128 instead of a memory-operand vbroadcastf128.
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double>* %p0
 store <2 x double> %1, <2 x double>* %p1
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}
    634 
; Load <2 x i64>, store it back out, then broadcast to 256 bits. The reused
; xmm value stays in a register, so the broadcast becomes vinsertf128 rather
; than a load-folding vbroadcast.
define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64>* %p0
 store <2 x i64> %1, <2 x i64>* %p1
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}
    656 
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; <4 x float> reuse test: load once, store the value, then splat the whole
; 128-bit vector into both halves of a ymm via vinsertf128.
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float>* %p0
 store <4 x float> %1, <4 x float>* %p1
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}
    678 
define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; <4 x i32> reuse test: the loaded subvector feeds both a store and the
; 256-bit splat, so it is widened from the register with vinsertf128.
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32>* %p0
 store <4 x i32> %1, <4 x i32>* %p1
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}
    700 
define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; <8 x i16> reuse test: same pattern as the other _reuse tests — load is
; stored, then duplicated to 256 bits with vinsertf128.
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p0
 store <8 x i16> %1, <8 x i16>* %p1
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}
    722 
define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; <16 x i8> reuse test: stored load is kept in %xmm0 and splatted to a ymm
; with vinsertf128 rather than re-loaded as a broadcast.
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p0
 store <16 x i8> %1, <16 x i8>* %p1
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}
    744 
    745 ;
    746 ; Subvector Load + Broadcast with Separate Store
    747 ;
    748 
define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; Unlike the _reuse tests, the store here writes unrelated data (zeros), so
; the load's only consumer is the splat and it folds into a 128-bit
; broadcast (vbroadcastf128 on AVX, vbroadcasti128 on AVX-512).
; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}
    786 
define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; 512-bit version of the chain test: on AVX-512 the load folds into a single
; vbroadcasti32x4 to a zmm; plain AVX broadcasts to a ymm and copies it for
; the upper half of the return value.
; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}
    826 
    827 ;
; Subvector load with multiple uses + broadcast
; The lowering should fall back to broadcasting from the register
    830 ;
    831 
; Destination globals written by fallback_broadcast_v4i64_to_v8i64 below.
@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8
    834 
define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; The constant <1,2,3,4> is consumed at both 256-bit and 512-bit widths, so
; per the section comment the lowering should fall back to materializing the
; constant once and widening/reusing it, rather than loading it per use.
; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX1:       # %bb.0: # %entry
; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,0,4,0]
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,0,2,0]
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X32-AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm6
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X32-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X32-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X32-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X32-AVX1-NEXT:    vmovups %ymm0, ga4
; X32-AVX1-NEXT:    vmovups %ymm2, gb4+32
; X32-AVX1-NEXT:    vmovups %ymm1, gb4
; X32-AVX1-NEXT:    vzeroupper
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX2:       # %bb.0: # %entry
; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT:    vmovdqu %ymm0, ga4
; X32-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
; X32-AVX2-NEXT:    vmovdqu %ymm1, gb4
; X32-AVX2-NEXT:    vzeroupper
; X32-AVX2-NEXT:    retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512:       # %bb.0: # %entry
; X32-AVX512-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X32-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vmovdqu %ymm0, ga4
; X32-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
; X32-AVX512-NEXT:    vzeroupper
; X32-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,4]
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2]
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vmovups %ymm0, {{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, gb4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, gb4+{{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, <4 x i64>* @ga4, align 8
  store <8 x i64> %2, <8 x i64>* @gb4, align 8
  ret void
}
    944 
    945 
; Destination globals written by fallback_broadcast_v4f64_to_v8f64 below.
@ga2 = global <4 x double> zeroinitializer, align 8
@gb2 = global <8 x double> zeroinitializer, align 8
    948 
define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; FP variant of the fallback test: the <1.0,2.0,3.0,4.0> constant feeds both
; 256-bit and 512-bit arithmetic; on AVX-512 it is loaded once as a ymm and
; widened with vinsertf64x4 for the zmm uses.
; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX:       # %bb.0: # %entry
; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT:    vmovupd %ymm0, ga2
; X32-AVX-NEXT:    vmovupd %ymm2, gb2+32
; X32-AVX-NEXT:    vmovupd %ymm1, gb2
; X32-AVX-NEXT:    vzeroupper
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX512:       # %bb.0: # %entry
; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X32-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vmovupd %ymm0, ga2
; X32-AVX512-NEXT:    vmovupd %zmm1, gb2
; X32-AVX512-NEXT:    vzeroupper
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm2, gb2+{{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovupd %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, <4 x double>* @ga2, align 8
  store <8 x double> %2, <8 x double>* @gb2, align 8
  ret void
}
   1009 
   1010 ;
   1011 ; Subvector Broadcast from register
   1012 ;
   1013 
define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
; Broadcast of an in-register subvector (no load to fold): the xmm source is
; duplicated into the upper lane with vinsertf128.
; X32-LABEL: reg_broadcast_2f64_4f64:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %1
}
   1029 
define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
; 128-bit -> 512-bit register broadcast: AVX builds a ymm and returns it in
; both halves (ymm0/ymm1); AVX-512 widens again with vinsertf64x4.
; X32-AVX-LABEL: reg_broadcast_2f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %1
}
   1061 
define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
; 256-bit -> 512-bit register broadcast: AVX just returns the ymm in both
; halves; AVX-512 duplicates it into a zmm with vinsertf64x4.
; X32-AVX-LABEL: reg_broadcast_4f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %1
}
   1087 
define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
; Integer 128-bit -> 256-bit register broadcast via vinsertf128.
; X32-LABEL: reg_broadcast_2i64_4i64:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %1
}
   1103 
define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
; Integer 128-bit -> 512-bit register broadcast: ymm pair on AVX,
; vinsertf128 + vinsertf64x4 on AVX-512.
; X32-AVX-LABEL: reg_broadcast_2i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %1
}
   1135 
define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
; Integer 256-bit -> 512-bit register broadcast: ymm copy on AVX,
; vinsertf64x4 into a zmm on AVX-512.
; X32-AVX-LABEL: reg_broadcast_4i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %1
}
   1161 
define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; <4 x float> 128-bit -> 256-bit register broadcast via vinsertf128.
; X32-LABEL: reg_broadcast_4f32_8f32:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %1
}
   1177 
define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; <4 x float> 128-bit -> 512-bit register broadcast: ymm pair on AVX,
; vinsertf128 + vinsertf64x4 on AVX-512.
; X32-AVX-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %1
}
   1209 
define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; <8 x float> 256-bit -> 512-bit register broadcast: ymm copy on AVX,
; vinsertf64x4 into a zmm on AVX-512.
; X32-AVX-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %1
}
   1235 
define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
; <4 x i32> 128-bit -> 256-bit register broadcast via vinsertf128.
; X32-LABEL: reg_broadcast_4i32_8i32:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %1
}
   1251 
define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; <4 x i32> 128-bit -> 512-bit register broadcast: ymm pair on AVX,
; vinsertf128 + vinsertf64x4 on AVX-512.
; X32-AVX-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %1
}
   1283 
define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; <8 x i32> 256-bit -> 512-bit register broadcast: ymm copy on AVX,
; vinsertf64x4 into a zmm on AVX-512.
; X32-AVX-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %1
}
   1309 
define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
; <8 x i16> 128-bit -> 256-bit register broadcast via vinsertf128.
; X32-LABEL: reg_broadcast_8i16_16i16:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %1
}
   1325 
; Broadcast an in-register <8 x i16> to <32 x i16> (128-bit repeated four
; times to 512 bits). Only AVX512BW has native 512-bit i16 vectors, so per the
; CHECK lines it alone builds zmm0 (vinsertf128 then vinsertf64x4); every
; other subtarget builds one 256-bit half and returns it in both ymm0 and
; ymm1 (vinsertf128 followed by a ymm0 -> ymm1 copy).
define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %1
}
   1385 
; Broadcast an in-register <16 x i16> to <32 x i16> (256-bit doubled to
; 512 bits). Per the CHECK lines, only AVX512BW uses a 512-bit zmm result
; (vinsertf64x4); AVX/AVX2 and the non-BW AVX512 variants return the value in
; the ymm0/ymm1 pair, needing just a ymm0 -> ymm1 copy.
define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %1
}
   1431 
; Broadcast an in-register <16 x i8> to <32 x i8> (128-bit -> 256-bit
; self-concatenation). All subtargets lower this to a single vinsertf128 that
; duplicates xmm0 into the upper lane of ymm0.
define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X32-LABEL: reg_broadcast_16i8_32i8:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %1
}
   1447 
; Broadcast an in-register <16 x i8> to <64 x i8> (128-bit repeated four
; times to 512 bits). Only AVX512BW has native 512-bit i8 vectors, so per the
; CHECK lines it alone builds zmm0 (vinsertf128 then vinsertf64x4); every
; other subtarget builds one 256-bit half and returns it in both ymm0 and
; ymm1 (vinsertf128 followed by a ymm0 -> ymm1 copy).
define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_16i8_64i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %1
}
   1507 
; Broadcast an in-register <32 x i8> to <64 x i8> (256-bit doubled to
; 512 bits). Per the CHECK lines, only AVX512BW uses a 512-bit zmm result
; (vinsertf64x4); AVX/AVX2 and the non-BW AVX512 variants return the value in
; the ymm0/ymm1 pair, needing just a ymm0 -> ymm1 copy.
define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_32i8_64i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %1
}
   1553