Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
      7 
      8 ; Widened shuffle broadcast loads
      9 
     10 define <4 x float> @load_splat_4f32_4f32_0101(<4 x float>* %ptr) nounwind uwtable readonly ssp {
     11 ; SSE2-LABEL: load_splat_4f32_4f32_0101:
     12 ; SSE2:       # %bb.0: # %entry
     13 ; SSE2-NEXT:    movaps (%rdi), %xmm0
     14 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
     15 ; SSE2-NEXT:    retq
     16 ;
     17 ; SSE42-LABEL: load_splat_4f32_4f32_0101:
     18 ; SSE42:       # %bb.0: # %entry
     19 ; SSE42-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
     20 ; SSE42-NEXT:    retq
     21 ;
     22 ; AVX-LABEL: load_splat_4f32_4f32_0101:
     23 ; AVX:       # %bb.0: # %entry
     24 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
     25 ; AVX-NEXT:    retq
        ; Load a <4 x float> and repeat its low two lanes (mask 0,1,0,1), i.e. a
        ; splat of the low 64 bits. The checks above expect the load to fold into
        ; movddup/vmovddup from memory on every target except plain SSE2.
        ; Marked readonly (not readnone) because the body loads through %ptr.
     26 entry:
     27   %ld = load <4 x float>, <4 x float>* %ptr
     28   %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
     29   ret <4 x float> %ret
     30 }
     31 
     32 define <8 x float> @load_splat_8f32_4f32_01010101(<4 x float>* %ptr) nounwind uwtable readonly ssp {
     33 ; SSE2-LABEL: load_splat_8f32_4f32_01010101:
     34 ; SSE2:       # %bb.0: # %entry
     35 ; SSE2-NEXT:    movaps (%rdi), %xmm0
     36 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
     37 ; SSE2-NEXT:    movaps %xmm0, %xmm1
     38 ; SSE2-NEXT:    retq
     39 ;
     40 ; SSE42-LABEL: load_splat_8f32_4f32_01010101:
     41 ; SSE42:       # %bb.0: # %entry
     42 ; SSE42-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
     43 ; SSE42-NEXT:    movapd %xmm0, %xmm1
     44 ; SSE42-NEXT:    retq
     45 ;
     46 ; AVX1-LABEL: load_splat_8f32_4f32_01010101:
     47 ; AVX1:       # %bb.0: # %entry
     48 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
     49 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
     50 ; AVX1-NEXT:    retq
     51 ;
     52 ; AVX2-LABEL: load_splat_8f32_4f32_01010101:
     53 ; AVX2:       # %bb.0: # %entry
     54 ; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
     55 ; AVX2-NEXT:    retq
     56 ;
     57 ; AVX512-LABEL: load_splat_8f32_4f32_01010101:
     58 ; AVX512:       # %bb.0: # %entry
     59 ; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
     60 ; AVX512-NEXT:    retq
        ; Load a <4 x float> and widen it to <8 x float> by repeating lanes 0,1
        ; four times (a 64-bit splat into a 256-bit result). The AVX2 and AVX512
        ; checks expect one vbroadcastsd from memory; the AVX1 checks expect
        ; vmovddup followed by vinsertf128.
        ; Marked readonly (not readnone) because the body loads through %ptr.
     61 entry:
     62   %ld = load <4 x float>, <4 x float>* %ptr
     63   %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
     64   ret <8 x float> %ret
     65 }
     66 
     67 define <8 x float> @load_splat_8f32_8f32_01010101(<8 x float>* %ptr) nounwind uwtable readonly ssp {
     68 ; SSE2-LABEL: load_splat_8f32_8f32_01010101:
     69 ; SSE2:       # %bb.0: # %entry
     70 ; SSE2-NEXT:    movaps (%rdi), %xmm0
     71 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
     72 ; SSE2-NEXT:    movaps %xmm0, %xmm1
     73 ; SSE2-NEXT:    retq
     74 ;
     75 ; SSE42-LABEL: load_splat_8f32_8f32_01010101:
     76 ; SSE42:       # %bb.0: # %entry
     77 ; SSE42-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
     78 ; SSE42-NEXT:    movapd %xmm0, %xmm1
     79 ; SSE42-NEXT:    retq
     80 ;
     81 ; AVX-LABEL: load_splat_8f32_8f32_01010101:
     82 ; AVX:       # %bb.0: # %entry
     83 ; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
     84 ; AVX-NEXT:    retq
        ; Same 64-bit splat pattern (lanes 0,1 repeated), but the source is a
        ; full <8 x float> load. Every AVX configuration is expected to emit a
        ; single vbroadcastsd from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
     85 entry:
     86   %ld = load <8 x float>, <8 x float>* %ptr
     87   %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
     88   ret <8 x float> %ret
     89 }
     90 
     91 define <4 x i32> @load_splat_4i32_4i32_0101(<4 x i32>* %ptr) nounwind uwtable readonly ssp {
     92 ; SSE-LABEL: load_splat_4i32_4i32_0101:
     93 ; SSE:       # %bb.0: # %entry
     94 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
     95 ; SSE-NEXT:    retq
     96 ;
     97 ; AVX1-LABEL: load_splat_4i32_4i32_0101:
     98 ; AVX1:       # %bb.0: # %entry
     99 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
    100 ; AVX1-NEXT:    retq
    101 ;
    102 ; AVX2-LABEL: load_splat_4i32_4i32_0101:
    103 ; AVX2:       # %bb.0: # %entry
    104 ; AVX2-NEXT:    vpbroadcastq (%rdi), %xmm0
    105 ; AVX2-NEXT:    retq
    106 ;
    107 ; AVX512-LABEL: load_splat_4i32_4i32_0101:
    108 ; AVX512:       # %bb.0: # %entry
    109 ; AVX512-NEXT:    vpbroadcastq (%rdi), %xmm0
    110 ; AVX512-NEXT:    retq
        ; Integer variant: load a <4 x i32> and repeat elements 0,1 (the low 64
        ; bits). AVX2 and AVX512 are expected to use vpbroadcastq from memory;
        ; older targets use a shuffle with a folded memory operand.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    111 entry:
    112   %ld = load <4 x i32>, <4 x i32>* %ptr
    113   %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
    114   ret <4 x i32> %ret
    115 }
    116 
    117 define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtable readonly ssp {
    118 ; SSE-LABEL: load_splat_8i32_4i32_01010101:
    119 ; SSE:       # %bb.0: # %entry
    120 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    121 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    122 ; SSE-NEXT:    retq
    123 ;
    124 ; AVX-LABEL: load_splat_8i32_4i32_01010101:
    125 ; AVX:       # %bb.0: # %entry
    126 ; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
    127 ; AVX-NEXT:    retq
        ; Load a <4 x i32> and widen it to <8 x i32> by repeating elements 0,1.
        ; Every AVX configuration is expected to emit a single vbroadcastsd
        ; from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    128 entry:
    129   %ld = load <4 x i32>, <4 x i32>* %ptr
    130   %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    131   ret <8 x i32> %ret
    132 }
    133 
    134 define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtable readonly ssp {
    135 ; SSE-LABEL: load_splat_8i32_8i32_01010101:
    136 ; SSE:       # %bb.0: # %entry
    137 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    138 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    139 ; SSE-NEXT:    retq
    140 ;
    141 ; AVX1-LABEL: load_splat_8i32_8i32_01010101:
    142 ; AVX1:       # %bb.0: # %entry
    143 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
    144 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    145 ; AVX1-NEXT:    retq
    146 ;
    147 ; AVX2-LABEL: load_splat_8i32_8i32_01010101:
    148 ; AVX2:       # %bb.0: # %entry
    149 ; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
    150 ; AVX2-NEXT:    retq
    151 ;
    152 ; AVX512-LABEL: load_splat_8i32_8i32_01010101:
    153 ; AVX512:       # %bb.0: # %entry
    154 ; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
    155 ; AVX512-NEXT:    retq
        ; Load a full <8 x i32> and repeat elements 0,1 across all eight lanes.
        ; AVX2 and AVX512 are expected to emit one vbroadcastsd from memory; the
        ; AVX1 checks record vmovddup followed by vinsertf128.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    156 entry:
    157   %ld = load <8 x i32>, <8 x i32>* %ptr
    158   %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    159   ret <8 x i32> %ret
    160 }
    161 
    162 define <8 x i16> @load_splat_8i16_8i16_01010101(<8 x i16>* %ptr) nounwind uwtable readonly ssp {
    163 ; SSE-LABEL: load_splat_8i16_8i16_01010101:
    164 ; SSE:       # %bb.0: # %entry
    165 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
    166 ; SSE-NEXT:    retq
    167 ;
    168 ; AVX1-LABEL: load_splat_8i16_8i16_01010101:
    169 ; AVX1:       # %bb.0: # %entry
    170 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
    171 ; AVX1-NEXT:    retq
    172 ;
    173 ; AVX2-LABEL: load_splat_8i16_8i16_01010101:
    174 ; AVX2:       # %bb.0: # %entry
    175 ; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
    176 ; AVX2-NEXT:    retq
    177 ;
    178 ; AVX512-LABEL: load_splat_8i16_8i16_01010101:
    179 ; AVX512:       # %bb.0: # %entry
    180 ; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0
    181 ; AVX512-NEXT:    retq
        ; Load a <8 x i16> and repeat elements 0,1 (the low 32 bits) across the
        ; vector. The checks expect this to be treated as a 32-bit element splat
        ; (pshufd/vpermilps [0,0,0,0] or vbroadcastss from memory).
        ; Marked readonly (not readnone) because the body loads through %ptr.
    182 entry:
    183   %ld = load <8 x i16>, <8 x i16>* %ptr
    184   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    185   ret <8 x i16> %ret
    186 }
    187 
    188 define <8 x i16> @load_splat_8i16_8i16_01230123(<8 x i16>* %ptr) nounwind uwtable readonly ssp {
    189 ; SSE-LABEL: load_splat_8i16_8i16_01230123:
    190 ; SSE:       # %bb.0: # %entry
    191 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    192 ; SSE-NEXT:    retq
    193 ;
    194 ; AVX1-LABEL: load_splat_8i16_8i16_01230123:
    195 ; AVX1:       # %bb.0: # %entry
    196 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
    197 ; AVX1-NEXT:    retq
    198 ;
    199 ; AVX2-LABEL: load_splat_8i16_8i16_01230123:
    200 ; AVX2:       # %bb.0: # %entry
    201 ; AVX2-NEXT:    vpbroadcastq (%rdi), %xmm0
    202 ; AVX2-NEXT:    retq
    203 ;
    204 ; AVX512-LABEL: load_splat_8i16_8i16_01230123:
    205 ; AVX512:       # %bb.0: # %entry
    206 ; AVX512-NEXT:    vpbroadcastq (%rdi), %xmm0
    207 ; AVX512-NEXT:    retq
        ; Load a <8 x i16> and repeat elements 0..3 (the low 64 bits) twice. The
        ; checks expect a 64-bit splat, which is vpbroadcastq from memory on
        ; AVX2 and AVX512.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    208 entry:
    209   %ld = load <8 x i16>, <8 x i16>* %ptr
    210   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    211   ret <8 x i16> %ret
    212 }
    213 
    214 define <16 x i16> @load_splat_16i16_8i16_0101010101010101(<8 x i16>* %ptr) nounwind uwtable readonly ssp {
    215 ; SSE-LABEL: load_splat_16i16_8i16_0101010101010101:
    216 ; SSE:       # %bb.0: # %entry
    217 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
    218 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    219 ; SSE-NEXT:    retq
    220 ;
    221 ; AVX1-LABEL: load_splat_16i16_8i16_0101010101010101:
    222 ; AVX1:       # %bb.0: # %entry
    223 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
    224 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    225 ; AVX1-NEXT:    retq
    226 ;
    227 ; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
    228 ; AVX2:       # %bb.0: # %entry
    229 ; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
    230 ; AVX2-NEXT:    retq
    231 ;
    232 ; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
    233 ; AVX512:       # %bb.0: # %entry
    234 ; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
    235 ; AVX512-NEXT:    retq
        ; Load a <8 x i16> and widen it to <16 x i16> by repeating elements 0,1
        ; (the low 32 bits). AVX2 and AVX512 are expected to emit one 256-bit
        ; vbroadcastss from memory; AVX1 splats in xmm and then inserts the
        ; upper half.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    236 entry:
    237   %ld = load <8 x i16>, <8 x i16>* %ptr
    238   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    239   ret <16 x i16> %ret
    240 }
    241 
    242 define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounwind uwtable readonly ssp {
    243 ; SSE-LABEL: load_splat_16i16_8i16_0123012301230123:
    244 ; SSE:       # %bb.0: # %entry
    245 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    246 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    247 ; SSE-NEXT:    retq
    248 ;
    249 ; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
    250 ; AVX:       # %bb.0: # %entry
    251 ; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
    252 ; AVX-NEXT:    retq
        ; Load a <8 x i16> and widen it to <16 x i16> by repeating elements 0..3
        ; (the low 64 bits). Every AVX configuration is expected to emit a single
        ; vbroadcastsd from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    253 entry:
    254   %ld = load <8 x i16>, <8 x i16>* %ptr
    255   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    256   ret <16 x i16> %ret
    257 }
    258 
    259 define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nounwind uwtable readonly ssp {
    260 ; SSE-LABEL: load_splat_16i16_16i16_0101010101010101:
    261 ; SSE:       # %bb.0: # %entry
    262 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
    263 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    264 ; SSE-NEXT:    retq
    265 ;
    266 ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
    267 ; AVX1:       # %bb.0: # %entry
    268 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
    269 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    270 ; AVX1-NEXT:    retq
    271 ;
    272 ; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
    273 ; AVX2:       # %bb.0: # %entry
    274 ; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
    275 ; AVX2-NEXT:    retq
    276 ;
    277 ; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
    278 ; AVX512:       # %bb.0: # %entry
    279 ; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
    280 ; AVX512-NEXT:    retq
        ; Load a full <16 x i16> and repeat elements 0,1 (the low 32 bits) across
        ; all sixteen lanes. AVX2 and AVX512 are expected to emit one 256-bit
        ; vbroadcastss from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    281 entry:
    282   %ld = load <16 x i16>, <16 x i16>* %ptr
    283   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    284   ret <16 x i16> %ret
    285 }
    286 
    287 define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nounwind uwtable readonly ssp {
    288 ; SSE-LABEL: load_splat_16i16_16i16_0123012301230123:
    289 ; SSE:       # %bb.0: # %entry
    290 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    291 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    292 ; SSE-NEXT:    retq
    293 ;
    294 ; AVX-LABEL: load_splat_16i16_16i16_0123012301230123:
    295 ; AVX:       # %bb.0: # %entry
    296 ; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
    297 ; AVX-NEXT:    retq
        ; Load a full <16 x i16> and repeat elements 0..3 (the low 64 bits) across
        ; the result. Every AVX configuration is expected to emit a single
        ; vbroadcastsd from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    298 entry:
    299   %ld = load <16 x i16>, <16 x i16>* %ptr
    300   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    301   ret <16 x i16> %ret
    302 }
    303 
    304 define <16 x i8> @load_splat_16i8_16i8_0101010101010101(<16 x i8>* %ptr) nounwind uwtable readonly ssp {
    305 ; SSE-LABEL: load_splat_16i8_16i8_0101010101010101:
    306 ; SSE:       # %bb.0: # %entry
    307 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
    308 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    309 ; SSE-NEXT:    retq
    310 ;
    311 ; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101:
    312 ; AVX1:       # %bb.0: # %entry
    313 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
    314 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    315 ; AVX1-NEXT:    retq
    316 ;
    317 ; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101:
    318 ; AVX2:       # %bb.0: # %entry
    319 ; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
    320 ; AVX2-NEXT:    retq
    321 ;
    322 ; AVX512-LABEL: load_splat_16i8_16i8_0101010101010101:
    323 ; AVX512:       # %bb.0: # %entry
    324 ; AVX512-NEXT:    vpbroadcastw (%rdi), %xmm0
    325 ; AVX512-NEXT:    retq
        ; Load a <16 x i8> and repeat bytes 0,1 (the low 16 bits) across the
        ; vector. AVX2 and AVX512 are expected to use vpbroadcastw from memory;
        ; older targets need a pshuflw + pshufd pair.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    326 entry:
    327   %ld = load <16 x i8>, <16 x i8>* %ptr
    328   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    329   ret <16 x i8> %ret
    330 }
    331 
    332 define <16 x i8> @load_splat_16i8_16i8_0123012301230123(<16 x i8>* %ptr) nounwind uwtable readonly ssp {
    333 ; SSE-LABEL: load_splat_16i8_16i8_0123012301230123:
    334 ; SSE:       # %bb.0: # %entry
    335 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
    336 ; SSE-NEXT:    retq
    337 ;
    338 ; AVX1-LABEL: load_splat_16i8_16i8_0123012301230123:
    339 ; AVX1:       # %bb.0: # %entry
    340 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
    341 ; AVX1-NEXT:    retq
    342 ;
    343 ; AVX2-LABEL: load_splat_16i8_16i8_0123012301230123:
    344 ; AVX2:       # %bb.0: # %entry
    345 ; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
    346 ; AVX2-NEXT:    retq
    347 ;
    348 ; AVX512-LABEL: load_splat_16i8_16i8_0123012301230123:
    349 ; AVX512:       # %bb.0: # %entry
    350 ; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0
    351 ; AVX512-NEXT:    retq
        ; Load a <16 x i8> and repeat bytes 0..3 (the low 32 bits) four times.
        ; The checks expect a 32-bit element splat, which is vbroadcastss from
        ; memory on AVX2 and AVX512.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    352 entry:
    353   %ld = load <16 x i8>, <16 x i8>* %ptr
    354   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    355   ret <16 x i8> %ret
    356 }
    357 
    358 define <16 x i8> @load_splat_16i8_16i8_0123456701234567(<16 x i8>* %ptr) nounwind uwtable readonly ssp {
    359 ; SSE-LABEL: load_splat_16i8_16i8_0123456701234567:
    360 ; SSE:       # %bb.0: # %entry
    361 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    362 ; SSE-NEXT:    retq
    363 ;
    364 ; AVX1-LABEL: load_splat_16i8_16i8_0123456701234567:
    365 ; AVX1:       # %bb.0: # %entry
    366 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
    367 ; AVX1-NEXT:    retq
    368 ;
    369 ; AVX2-LABEL: load_splat_16i8_16i8_0123456701234567:
    370 ; AVX2:       # %bb.0: # %entry
    371 ; AVX2-NEXT:    vpbroadcastq (%rdi), %xmm0
    372 ; AVX2-NEXT:    retq
    373 ;
    374 ; AVX512-LABEL: load_splat_16i8_16i8_0123456701234567:
    375 ; AVX512:       # %bb.0: # %entry
    376 ; AVX512-NEXT:    vpbroadcastq (%rdi), %xmm0
    377 ; AVX512-NEXT:    retq
        ; Load a <16 x i8> and repeat bytes 0..7 (the low 64 bits) twice. The
        ; checks expect a 64-bit splat, which is vpbroadcastq from memory on
        ; AVX2 and AVX512.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    378 entry:
    379   %ld = load <16 x i8>, <16 x i8>* %ptr
    380   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    381   ret <16 x i8> %ret
    382 }
    383 
    384 define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(<16 x i8>* %ptr) nounwind uwtable readonly ssp {
    385 ; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
    386 ; SSE:       # %bb.0: # %entry
    387 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
    388 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    389 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    390 ; SSE-NEXT:    retq
    391 ;
    392 ; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
    393 ; AVX1:       # %bb.0: # %entry
    394 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
    395 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    396 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    397 ; AVX1-NEXT:    retq
    398 ;
    399 ; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
    400 ; AVX2:       # %bb.0: # %entry
    401 ; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
    402 ; AVX2-NEXT:    retq
    403 ;
    404 ; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
    405 ; AVX512:       # %bb.0: # %entry
    406 ; AVX512-NEXT:    vpbroadcastw (%rdi), %ymm0
    407 ; AVX512-NEXT:    retq
        ; Load a <16 x i8> and widen it to <32 x i8> by repeating bytes 0,1 (the
        ; low 16 bits). AVX2 and AVX512 are expected to emit one 256-bit
        ; vpbroadcastw from memory; AVX1 splats in xmm and inserts the upper half.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    408 entry:
    409   %ld = load <16 x i8>, <16 x i8>* %ptr
    410   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    411   ret <32 x i8> %ret
    412 }
    413 
    414 define <32 x i8> @load_splat_32i8_16i8_01230123012301230123012301230123(<16 x i8>* %ptr) nounwind uwtable readonly ssp {
    415 ; SSE-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
    416 ; SSE:       # %bb.0: # %entry
    417 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
    418 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    419 ; SSE-NEXT:    retq
    420 ;
    421 ; AVX1-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
    422 ; AVX1:       # %bb.0: # %entry
    423 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
    424 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    425 ; AVX1-NEXT:    retq
    426 ;
    427 ; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
    428 ; AVX2:       # %bb.0: # %entry
    429 ; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
    430 ; AVX2-NEXT:    retq
    431 ;
    432 ; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
    433 ; AVX512:       # %bb.0: # %entry
    434 ; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
    435 ; AVX512-NEXT:    retq
        ; Load a <16 x i8> and widen it to <32 x i8> by repeating bytes 0..3 (the
        ; low 32 bits). AVX2 and AVX512 are expected to emit one 256-bit
        ; vbroadcastss from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    436 entry:
    437   %ld = load <16 x i8>, <16 x i8>* %ptr
    438   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    439   ret <32 x i8> %ret
    440 }
    441 
    442 define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8>* %ptr) nounwind uwtable readonly ssp {
    443 ; SSE-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
    444 ; SSE:       # %bb.0: # %entry
    445 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    446 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    447 ; SSE-NEXT:    retq
    448 ;
    449 ; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
    450 ; AVX:       # %bb.0: # %entry
    451 ; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
    452 ; AVX-NEXT:    retq
        ; Load a <16 x i8> and widen it to <32 x i8> by repeating bytes 0..7 (the
        ; low 64 bits). Every AVX configuration is expected to emit a single
        ; vbroadcastsd from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    453 entry:
    454   %ld = load <16 x i8>, <16 x i8>* %ptr
    455   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    456   ret <32 x i8> %ret
    457 }
    458 
    459 define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8>* %ptr) nounwind uwtable readonly ssp {
    460 ; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
    461 ; SSE:       # %bb.0: # %entry
    462 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
    463 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    464 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    465 ; SSE-NEXT:    retq
    466 ;
    467 ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
    468 ; AVX1:       # %bb.0: # %entry
    469 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
    470 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    471 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    472 ; AVX1-NEXT:    retq
    473 ;
    474 ; AVX2-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
    475 ; AVX2:       # %bb.0: # %entry
    476 ; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
    477 ; AVX2-NEXT:    retq
    478 ;
    479 ; AVX512-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
    480 ; AVX512:       # %bb.0: # %entry
    481 ; AVX512-NEXT:    vpbroadcastw (%rdi), %ymm0
    482 ; AVX512-NEXT:    retq
        ; Load a full <32 x i8> and repeat bytes 0,1 (the low 16 bits) across all
        ; thirty-two lanes. AVX2 and AVX512 are expected to emit one 256-bit
        ; vpbroadcastw from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    483 entry:
    484   %ld = load <32 x i8>, <32 x i8>* %ptr
    485   %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    486   ret <32 x i8> %ret
    487 }
    488 
    489 define <32 x i8> @load_splat_32i8_32i8_01230123012301230123012301230123(<32 x i8>* %ptr) nounwind uwtable readonly ssp {
    490 ; SSE-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
    491 ; SSE:       # %bb.0: # %entry
    492 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
    493 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    494 ; SSE-NEXT:    retq
    495 ;
    496 ; AVX-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
    497 ; AVX:       # %bb.0: # %entry
    498 ; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
    499 ; AVX-NEXT:    retq
        ; Load a full <32 x i8> and repeat bytes 0..3 (the low 32 bits) across the
        ; result. Every AVX configuration is expected to emit a single 256-bit
        ; vbroadcastss from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    500 entry:
    501   %ld = load <32 x i8>, <32 x i8>* %ptr
    502   %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    503   ret <32 x i8> %ret
    504 }
    505 
    506 define <32 x i8> @load_splat_32i8_32i8_01234567012345670123456701234567(<32 x i8>* %ptr) nounwind uwtable readonly ssp {
    507 ; SSE-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567:
    508 ; SSE:       # %bb.0: # %entry
    509 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
    510 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    511 ; SSE-NEXT:    retq
    512 ;
    513 ; AVX-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567:
    514 ; AVX:       # %bb.0: # %entry
    515 ; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
    516 ; AVX-NEXT:    retq
        ; Load a full <32 x i8> and repeat bytes 0..7 (the low 64 bits) across the
        ; result. Every AVX configuration is expected to emit a single
        ; vbroadcastsd from memory.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    517 entry:
    518   %ld = load <32 x i8>, <32 x i8>* %ptr
    519   %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    520   ret <32 x i8> %ret
    521 }
    522 
    523 define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtable readonly ssp {
    524 ; SSE-LABEL: load_splat_4f32_8f32_0000:
    525 ; SSE:       # %bb.0: # %entry
    526 ; SSE-NEXT:    movaps (%rdi), %xmm0
    527 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
    528 ; SSE-NEXT:    retq
    529 ;
    530 ; AVX-LABEL: load_splat_4f32_8f32_0000:
    531 ; AVX:       # %bb.0: # %entry
    532 ; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
    533 ; AVX-NEXT:    retq
        ; Narrowing case: load a <8 x float> but return only a <4 x float> splat
        ; of element 0 (all-zero shuffle mask). The AVX checks expect one
        ; vbroadcastss from memory instead of a 256-bit load.
        ; Marked readonly (not readnone) because the body loads through %ptr.
    534 entry:
    535   %ld = load <8 x float>, <8 x float>* %ptr
    536   %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer
    537   ret <4 x float> %ret
    538 }
    539 
    540 define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind uwtable readonly ssp {
    541 ; SSE2-LABEL: load_splat_8f32_16f32_89898989:
    542 ; SSE2:       # %bb.0: # %entry
    543 ; SSE2-NEXT:    movaps 32(%rdi), %xmm0
    544 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
    545 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    546 ; SSE2-NEXT:    retq
    547 ;
    548 ; SSE42-LABEL: load_splat_8f32_16f32_89898989:
    549 ; SSE42:       # %bb.0: # %entry
    550 ; SSE42-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
    551 ; SSE42-NEXT:    movapd %xmm0, %xmm1
    552 ; SSE42-NEXT:    retq
    553 ;
    554 ; AVX-LABEL: load_splat_8f32_16f32_89898989:
    555 ; AVX:       # %bb.0: # %entry
    556 ; AVX-NEXT:    vbroadcastsd 32(%rdi), %ymm0
    557 ; AVX-NEXT:    retq
        ; Splat from a non-zero offset: load a <16 x float> and repeat elements
        ; 8,9, which sit 32 bytes into the source. The AVX checks expect the
        ; offset to fold into the address of one vbroadcastsd (32(%rdi)).
        ; Marked readonly (not readnone) because the body loads through %ptr.
    558 entry:
    559   %ld = load <16 x float>, <16 x float>* %ptr
    560   %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 8, i32 9, i32 8, i32 9, i32 8, i32 9>
    561   ret <8 x float> %ret
    562 }
    563 
    564 ; PR34394
    565 define <4 x i32> @load_splat_4i32_2i32_0101(<2 x i32>* %vp) {
    566 ; SSE-LABEL: load_splat_4i32_2i32_0101:
    567 ; SSE:       # %bb.0:
    568 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    569 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    570 ; SSE-NEXT:    retq
    571 ;
    572 ; AVX1-LABEL: load_splat_4i32_2i32_0101:
    573 ; AVX1:       # %bb.0:
    574 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    575 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
    576 ; AVX1-NEXT:    retq
    577 ;
    578 ; AVX2-LABEL: load_splat_4i32_2i32_0101:
    579 ; AVX2:       # %bb.0:
    580 ; AVX2-NEXT:    vpbroadcastq (%rdi), %xmm0
    581 ; AVX2-NEXT:    retq
    582 ;
    583 ; AVX512-LABEL: load_splat_4i32_2i32_0101:
    584 ; AVX512:       # %bb.0:
    585 ; AVX512-NEXT:    vpbroadcastq (%rdi), %xmm0
    586 ; AVX512-NEXT:    retq
        ; The source is only a 64-bit <2 x i32>; the shuffle repeats the pair to
        ; fill a <4 x i32>. AVX2 and AVX512 are expected to emit vpbroadcastq
        ; from memory; older targets load 64 bits and then shuffle.
    587   %pair = load <2 x i32>, <2 x i32>* %vp
    588   %splat = shufflevector <2 x i32> %pair, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
    589   ret <4 x i32> %splat
    590 }
    591 
    592 define <8 x i32> @load_splat_8i32_2i32_0101(<2 x i32>* %vp) {
    593 ; SSE-LABEL: load_splat_8i32_2i32_0101:
    594 ; SSE:       # %bb.0:
    595 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    596 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    597 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    598 ; SSE-NEXT:    retq
    599 ;
    600 ; AVX1-LABEL: load_splat_8i32_2i32_0101:
    601 ; AVX1:       # %bb.0:
    602 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    603 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
    604 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    605 ; AVX1-NEXT:    retq
    606 ;
    607 ; AVX2-LABEL: load_splat_8i32_2i32_0101:
    608 ; AVX2:       # %bb.0:
    609 ; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
    610 ; AVX2-NEXT:    retq
    611 ;
    612 ; AVX512-LABEL: load_splat_8i32_2i32_0101:
    613 ; AVX512:       # %bb.0:
    614 ; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
    615 ; AVX512-NEXT:    retq
        ; Repeat a 64-bit <2 x i32> load four times to fill a <8 x i32>. AVX2
        ; and AVX512 are expected to emit one vbroadcastsd from memory; AVX1
        ; builds the splat in xmm and inserts it into the upper half.
    616   %pair = load <2 x i32>, <2 x i32>* %vp
    617   %splat = shufflevector <2 x i32> %pair, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    618   ret <8 x i32> %splat
    619 }
    620 
    621 define <16 x i32> @load_splat_16i32_2i32_0101(<2 x i32>* %vp) {
    622 ; SSE-LABEL: load_splat_16i32_2i32_0101:
    623 ; SSE:       # %bb.0:
    624 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    625 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    626 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    627 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    628 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    629 ; SSE-NEXT:    retq
    630 ;
    631 ; AVX1-LABEL: load_splat_16i32_2i32_0101:
    632 ; AVX1:       # %bb.0:
    633 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    634 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
    635 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    636 ; AVX1-NEXT:    vmovaps %ymm0, %ymm1
    637 ; AVX1-NEXT:    retq
    638 ;
    639 ; AVX2-LABEL: load_splat_16i32_2i32_0101:
    640 ; AVX2:       # %bb.0:
    641 ; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
    642 ; AVX2-NEXT:    vmovaps %ymm0, %ymm1
    643 ; AVX2-NEXT:    retq
    644 ;
    645 ; AVX512-LABEL: load_splat_16i32_2i32_0101:
    646 ; AVX512:       # %bb.0:
    647 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
    648 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
    649 ; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
    650 ; AVX512-NEXT:    retq
        ; Repeat a 64-bit <2 x i32> load eight times to fill a <16 x i32>.
        ; The AVX512 checks currently record a vpmovzxdq + vpermd sequence with
        ; a constant index vector, not a single broadcast from memory.
    651   %pair = load <2 x i32>, <2 x i32>* %vp
    652   %splat = shufflevector <2 x i32> %pair, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
    653   ret <16 x i32> %splat
    654 }
    654 }
    655