; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; Tests for SSE2 and below, without SSE3+.

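; Insert a scalar double into the low element of a vector load: movlpd/movsd on SSE, a blend on AVX.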
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movapd (%ecx), %xmm0
; X86-SSE-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test1:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovapd (%ecx), %xmm0
; X86-AVX-NEXT:    vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X86-AVX-NEXT:    vmovapd %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movapd (%rsi), %xmm1
; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X64-SSE-NEXT:    movapd %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}

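; Insert a scalar double into the high element of a vector load: movhpd on x86, movlhps on x86-64.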
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movapd (%ecx), %xmm0
; X86-SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test2:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovapd (%ecx), %xmm0
; X86-AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-AVX-NEXT:    vmovapd %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test2:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}


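; Interleave the low two elements of two vector loads (unpcklps).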
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; X86-SSE-LABEL: test3:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test3:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test3:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test3:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %B  ; <<4 x float>> [#uses=2]
  %tmp3 = load <4 x float>, <4 x float>* %A  ; <<4 x float>> [#uses=2]
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0  ; <float> [#uses=1]
  %tmp7 = extractelement <4 x float> %tmp, i32 0  ; <float> [#uses=1]
  %tmp8 = extractelement <4 x float> %tmp3, i32 1  ; <float> [#uses=1]
  %tmp9 = extractelement <4 x float> %tmp, i32 1  ; <float> [#uses=1]
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp13, <4 x float>* %res
  ret void
}

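; A single-source shuffle whose odd lanes come from the undef operand lowers to one shufps/vpermilps.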
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; X86-SSE-LABEL: test4:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test4:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test4:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test4:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp5, <4 x float>* %res
  ret void
}

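; Interleave a scalar load with zero at byte and then word granularity (punpcklbw/punpcklwd; AVX uses pmovzxbw).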
define <4 x i32> @test5(i8** %ptr) nounwind {
; X86-SSE-LABEL: test5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl (%eax), %eax
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test5:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl (%eax), %eax
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movq (%rdi), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-AVX-NEXT:    retq
  %tmp = load i8*, i8** %ptr  ; <i8*> [#uses=1]
  %tmp.upgrd.1 = bitcast i8* %tmp to float*  ; <float*> [#uses=1]
  %tmp.upgrd.2 = load float, float* %tmp.upgrd.1  ; <float> [#uses=1]
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0  ; <<4 x float>> [#uses=1]
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1  ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3  ; <<4 x float>> [#uses=1]
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>  ; <<16 x i8>> [#uses=1]
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >  ; <<16 x i8>> [#uses=1]
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>  ; <<8 x i16>> [#uses=1]
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >  ; <<8 x i16>> [#uses=1]
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>  ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %tmp36
}

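; An identity shuffle of a load folds away to a plain copy.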
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; X86-SSE-LABEL: test6:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test6:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test6:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test6:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp1 = load <4 x float>, <4 x float>* %A  ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, <4 x float>* %res
  ret void
}

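; A splat of zero stored to a null address folds to xorps + movaps.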
define void @test7() nounwind {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, 0
; AVX-NEXT:    ret{{[l|q]}}
  bitcast <4 x i32> zeroinitializer to <4 x float>  ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer  ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, <4 x float>* null
  ret void
}

@x = external global [4 x i32]

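; Four consecutive scalar loads from @x merge into a single unaligned vector load.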
define <2 x i64> @test8() nounwind {
; X86-SSE-LABEL: test8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups x, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups x, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups {{.*}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups {{.*}}(%rip), %xmm0
; X64-AVX-NEXT:    retq
  %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0)  ; <i32> [#uses=1]
  %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1)  ; <i32> [#uses=1]
  %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2)  ; <i32> [#uses=1]
  %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3)  ; <i32> [#uses=1]
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0  ; <<4 x i32>> [#uses=1]
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1  ; <<4 x i32>> [#uses=1]
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2  ; <<4 x i32>> [#uses=1]
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3  ; <<4 x i32>> [#uses=1]
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>  ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp16
}

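; Build a <4 x float> from four scalar arguments: loaded straight off the stack on x86, assembled with unpcklps/movlhps or insertps on x86-64.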
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test9:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test9:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test9:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

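; Same as test9, without the leading dummy argument.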
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test10:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test10:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test10:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test10:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

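; Build a <2 x double> from two scalar arguments (movlhps on x86-64).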
define <2 x double> @test11(double %a, double %b) nounwind {
; X86-SSE-LABEL: test11:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test11:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test11:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test11:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <2 x double> undef, double %a, i32 0  ; <<2 x double>> [#uses=1]
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1  ; <<2 x double>> [#uses=1]
  ret <2 x double> %tmp7
}

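; Shuffles of a load against a constant splat and against zero, feeding an add.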
define void @test12() nounwind {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd 0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    addps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps 0, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, 0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, 0
; AVX512-NEXT:    ret{{[l|q]}}
  %tmp1 = load <4 x float>, <4 x float>* null  ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, <4 x float>* null
  ret void
}

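; Shuffle <1,4,1,5> of two loads lowers to shufps plus a permute.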
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; X86-SSE-LABEL: test13:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test13:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test13:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdx), %xmm0
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test13:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdx), %xmm0
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <4 x float>, <4 x float>* %B  ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, <4 x float>* %C  ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
}

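; Concatenate the low halves of an add and a sub (movlhps).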
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test14:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm1
; X86-SSE-NEXT:    movaps (%eax), %xmm2
; X86-SSE-NEXT:    movaps %xmm2, %xmm0
; X86-SSE-NEXT:    addps %xmm1, %xmm0
; X86-SSE-NEXT:    subps %xmm1, %xmm2
; X86-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test14:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps (%eax), %xmm1
; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test14:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movaps (%rdi), %xmm2
; X64-SSE-NEXT:    movaps %xmm2, %xmm0
; X64-SSE-NEXT:    addps %xmm1, %xmm0
; X64-SSE-NEXT:    subps %xmm1, %xmm2
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test14:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm1
; X64-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %y  ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, <4 x float>* %x  ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp  ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp  ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

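; Concatenate the high halves of two loads (unpckhpd).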
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test15:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test15:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test15:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test15:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-AVX-NEXT:    retq
entry:
  %tmp = load <4 x float>, <4 x float>* %y  ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, <4 x float>* %x  ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900
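; Extract the even elements of a <4 x double> load: unpcklpd of two halves on SSE, extractf128 + movlhps on AVX.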
define <2 x double> @test16(<4 x double>* nocapture %srcA, <2 x double>* nocapture %dst) {
; X86-SSE-LABEL: test16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps 96(%eax), %ymm0
; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps 96(%rdi), %ymm0
; X64-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>, <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
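; A shuffle of an undef build_vector with a constant folds to a constant store.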
define fastcc void @test17() nounwind {
; X86-SSE-LABEL: test17:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: test17:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: test17:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-SSE-LABEL: test17:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-SSE-NEXT:    movaps %xmm0, (%rax)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: test17:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: test17:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX512-NEXT:    retq
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float>* undef
  ret void
}

; PR9210
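; fptrunc of <4 x double> lowers to cvtpd2ps: two conversions on SSE, one ymm conversion on AVX.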
define <4 x float> @f(<4 x double>) nounwind {
; SSE-LABEL: f:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    cvtpd2ps %xmm1, %xmm1
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    ret{{[l|q]}}
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}

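; Zeroing the upper half of a <2 x i64> selects movq.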
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; SSE-LABEL: test_insert_64_zext:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_insert_64_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

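; Clearing the low 32 bits of a vector, written as an i128 bitcast and mask.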
define <4 x i32> @PR19721(<4 x i32> %i) {
; X86-SSE-LABEL: PR19721:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: PR19721:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: PR19721:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq %xmm0, %rax
; X64-SSE-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-SSE-NEXT:    andq %rax, %rcx
; X64-SSE-NEXT:    movq %rcx, %xmm1
; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: PR19721:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovq %xmm0, %rax
; X64-AVX1-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX1-NEXT:    andq %rax, %rcx
; X64-AVX1-NEXT:    vmovq %rcx, %xmm1
; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: PR19721:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovq %xmm0, %rax
; X64-AVX512-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX512-NEXT:    andq %rax, %rcx
; X64-AVX512-NEXT:    vmovq %rcx, %xmm1
; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX512-NEXT:    retq
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

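; SSE2 has no 32-bit vector multiply, so the mul expands to pmuludq + shuffles; the AVX targets use pmulld.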
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: test_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}
    744