Home | History | Annotate | Download | only in X86
      1 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
      2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
      3 
      4 @g16 = external global i16
      5 
      6 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
        ; Inserting an i32 scalar into lane 1 of a <4 x i32> is expected to select
        ; a single pinsrd $1; the checks below take the scalar from the stack slot
        ; on the i686 run and from %edi on the x86-64 run.
      7         %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
      8         ret <4 x i32> %tmp1
      9 ; X32-LABEL: pinsrd_1:
     10 ; X32:    pinsrd $1, 4(%esp), %xmm0
     11 
     12 ; X64-LABEL: pinsrd_1:
     13 ; X64:    pinsrd $1, %edi, %xmm0
     14 }
     15 
     16 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
        ; Byte-sized counterpart of pinsrd_1: inserting an i8 into lane 1 of a
        ; <16 x i8> is expected to select pinsrb $1 with the same operand sources.
     17         %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
     18         ret <16 x i8> %tmp1
     19 ; X32-LABEL: pinsrb_1:
     20 ; X32:    pinsrb $1, 4(%esp), %xmm0
     21 
     22 ; X64-LABEL: pinsrb_1:
     23 ; X64:    pinsrb $1, %edi, %xmm0
     24 }
     25 
     26 
     27 define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
        ; A 32-bit load placed in lane 0 of an otherwise zero vector and passed to
        ; the pmovsxbd intrinsic is expected to fold into pmovsxbd with a memory
        ; operand. The function-name checks use the -LABEL form (as pinsrd_1 does)
        ; so the body checks cannot match inside another function's output.
     28 entry:
     29 	%0 = load i32* %p, align 4
     30 	%1 = insertelement <4 x i32> undef, i32 %0, i32 0
     31 	%2 = insertelement <4 x i32> %1, i32 0, i32 1
     32 	%3 = insertelement <4 x i32> %2, i32 0, i32 2
     33 	%4 = insertelement <4 x i32> %3, i32 0, i32 3
     34 	%5 = bitcast <4 x i32> %4 to <16 x i8>
     35 	%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
     36 	%7 = bitcast <4 x i32> %6 to <2 x i64>
     37 	ret <2 x i64> %7
     38 
     39 ; X32-LABEL: pmovsxbd_1:
     40 ; X32:   movl      4(%esp), %eax
     41 ; X32:   pmovsxbd   (%eax), %xmm0
     42 
     43 ; X64-LABEL: pmovsxbd_1:
     44 ; X64:   pmovsxbd   (%rdi), %xmm0
     45 }
     46 
     47 define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
        ; A 64-bit load inserted into lane 0 of a zero <2 x i64>, reinterpreted as
        ; <8 x i16> and passed to the pmovsxwd intrinsic, is expected to fold into
        ; pmovsxwd with a memory operand. Function-name checks use the -LABEL form
        ; so each run's body checks stay scoped to this function.
     48 entry:
     49 	%0 = load i64* %p		; <i64> [#uses=1]
     50 	%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0		; <<2 x i64>> [#uses=1]
     51 	%1 = bitcast <2 x i64> %tmp2 to <8 x i16>		; <<8 x i16>> [#uses=1]
     52 	%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone		; <<4 x i32>> [#uses=1]
     53 	%3 = bitcast <4 x i32> %2 to <2 x i64>		; <<2 x i64>> [#uses=1]
     54 	ret <2 x i64> %3
     55 
     56 ; X32-LABEL: pmovsxwd_1:
     57 ; X32:   movl 4(%esp), %eax
     58 ; X32:   pmovsxwd (%eax), %xmm0
     59 
     60 ; X64-LABEL: pmovsxwd_1:
     61 ; X64:   pmovsxwd (%rdi), %xmm0
     62 }
     63 
     64 
     65 
     66 
     67 define <2 x i64> @pmovzxbq_1() nounwind {
        ; A 16-bit load from the external global @g16, placed in lane 0 of an undef
        ; <8 x i16> and passed to the pmovzxbq intrinsic, is expected to become a
        ; load of the global's address followed by pmovzxbq with a memory operand.
        ; Function-name checks use the -LABEL form so each run's body checks stay
        ; scoped to this function.
     68 entry:
     69 	%0 = load i16* @g16, align 2		; <i16> [#uses=1]
     70 	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
     71 	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
     72 	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
     73 	ret <2 x i64> %3
     74 
     75 ; X32-LABEL: pmovzxbq_1:
     76 ; X32:   movl L_g16$non_lazy_ptr, %eax
     77 ; X32:   pmovzxbq (%eax), %xmm0
     78 
     79 ; X64-LABEL: pmovzxbq_1:
     80 ; X64:   movq _g16@GOTPCREL(%rip), %rax
     81 ; X64:   pmovzxbq (%rax), %xmm0
     82 }
     83 
     84 declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
     85 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
     86 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
     87 
     88 
     89 
     90 
     91 define i32 @extractps_1(<4 x float> %v) nounwind {
        ; Extracting float lane 3 and bitcasting it to i32 is expected to select
        ; extractps $3 writing straight into %eax. Function-name checks use the
        ; -LABEL form so the body checks stay scoped to this function.
     92   %s = extractelement <4 x float> %v, i32 3
     93   %i = bitcast float %s to i32
     94   ret i32 %i
     95 
     96 ; X32-LABEL: extractps_1:
     97 ; X32:    extractps $3, %xmm0, %eax
     98 
     99 ; X64-LABEL: extractps_1:
    100 ; X64:    extractps $3, %xmm0, %eax
    101 }
    102 define i32 @extractps_2(<4 x float> %v) nounwind {
        ; Same result as extractps_1 but with the bitcast applied to the whole
        ; vector before the extract; extractps $3 into %eax is still expected.
        ; Function-name checks use the -LABEL form so the body checks stay scoped
        ; to this function.
    103   %t = bitcast <4 x float> %v to <4 x i32>
    104   %s = extractelement <4 x i32> %t, i32 3
    105   ret i32 %s
    106 
    107 ; X32-LABEL: extractps_2:
    108 ; X32:    extractps $3, %xmm0, %eax
    109 
    110 ; X64-LABEL: extractps_2:
    111 ; X64:    extractps $3, %xmm0, %eax
    112 }
    113 
    114 
    115 ; The non-store form of extractps puts its result into a GPR.
    116 ; This makes it suitable for an extract from a <4 x float> that
    117 ; is bitcast to i32, but unsuitable for much of anything else.
    118 
    119 define float @ext_1(<4 x float> %v) nounwind {
        ; Float lane 3 is extracted and used in a scalar fadd, so the value must
        ; stay in an XMM register: pshufd $3 followed by addss from the constant
        ; pool is expected rather than extractps. The constant-pool label number is
        ; matched with a regex because it depends on this function's position in
        ; the file. Function-name checks use the -LABEL form so the body checks
        ; stay scoped to this function.
    120   %s = extractelement <4 x float> %v, i32 3
    121   %t = fadd float %s, 1.0
    122   ret float %t
    123 
    124 ; X32-LABEL: ext_1:
    125 ; X32:    pshufd $3, %xmm0, %xmm0
    126 ; X32:    addss LCPI{{[0-9]+}}_0, %xmm0
    127 
    128 ; X64-LABEL: ext_1:
    129 ; X64:    pshufd $3, %xmm0, %xmm0
    130 ; X64:    addss LCPI{{[0-9]+}}_0(%rip), %xmm0
    131 }
    132 define float @ext_2(<4 x float> %v) nounwind {
        ; Returning float lane 3 directly is expected to use pshufd $3 to move the
        ; lane within the XMM register. Function-name checks use the -LABEL form
        ; so the body checks stay scoped to this function.
    133   %s = extractelement <4 x float> %v, i32 3
    134   ret float %s
    135 
    136 ; X32-LABEL: ext_2:
    137 ; X32:    pshufd $3, %xmm0, %xmm0
    138 
    139 ; X64-LABEL: ext_2:
    140 ; X64:    pshufd $3, %xmm0, %xmm0
    141 }
    142 define i32 @ext_3(<4 x i32> %v) nounwind {
        ; Extracting i32 lane 3 from an integer vector is expected to select
        ; pextrd $3 into %eax. Function-name checks use the -LABEL form so the
        ; body checks stay scoped to this function.
    143   %i = extractelement <4 x i32> %v, i32 3
    144   ret i32 %i
    145 
    146 ; X32-LABEL: ext_3:
    147 ; X32:    pextrd $3, %xmm0, %eax
    148 
    149 ; X64-LABEL: ext_3:
    150 ; X64:    pextrd $3, %xmm0, %eax
    151 }
    152 
    153 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
        ; A direct call to the insertps intrinsic with immediate 1 is expected to
        ; emit insertps $1 on the two incoming XMM arguments. Function-name checks
        ; use the -LABEL form so the body checks stay scoped to this function.
    154         %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
    155         ret <4 x float> %tmp1
    156 ; X32-LABEL: insertps_1:
    157 ; X32:    insertps  $1, %xmm1, %xmm0
    158 
    159 ; X64-LABEL: insertps_1:
    160 ; X64:    insertps  $1, %xmm1, %xmm0
    161 }
    162 
    163 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
    164 
    165 define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
        ; Inserting a scalar float into lane 0 is expected to select insertps $0;
        ; the scalar comes from the stack on the i686 run and from %xmm1 on the
        ; x86-64 run. Function-name checks use the -LABEL form so the body checks
        ; stay scoped to this function.
    166         %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
    167         ret <4 x float> %tmp1
    168 ; X32-LABEL: insertps_2:
    169 ; X32:    insertps  $0, 4(%esp), %xmm0
    170 
    171 ; X64-LABEL: insertps_2:
    172 ; X64:    insertps  $0, %xmm1, %xmm0
    173 }
    174 
    175 define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
        ; Lane 0 of %t2 is extracted and inserted into lane 0 of %t1; the pair is
        ; expected to collapse into one register-to-register insertps $0.
        ; Function-name checks use the -LABEL form so the body checks stay scoped
        ; to this function.
    176         %tmp2 = extractelement <4 x float> %t2, i32 0
    177         %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
    178         ret <4 x float> %tmp1
    179 ; X32-LABEL: insertps_3:
    180 ; X32:    insertps  $0, %xmm1, %xmm0
    181 
    182 ; X64-LABEL: insertps_3:
    183 ; X64:    insertps  $0, %xmm1, %xmm0
    184 }
    185 
    186 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
        ; The ptestz intrinsic is expected to lower to ptest followed by sete to
        ; materialise the result. Function-name checks use the -LABEL form so the
        ; body checks stay scoped to this function.
    187         %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
    188         ret i32 %tmp1
    189 ; X32-LABEL: ptestz_1:
    190 ; X32:    ptest %xmm1, %xmm0
    191 ; X32:    sete %al
    192 
    193 ; X64-LABEL: ptestz_1:
    194 ; X64:    ptest %xmm1, %xmm0
    195 ; X64:    sete %al
    196 }
    197 
    198 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
        ; Despite the function name, this exercises the ptestc intrinsic, which is
        ; expected to lower to ptest followed by an sbbl on %eax. Function-name
        ; checks use the -LABEL form so the body checks stay scoped to this
        ; function.
    199         %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
    200         ret i32 %tmp1
    201 ; X32-LABEL: ptestz_2:
    202 ; X32:    ptest %xmm1, %xmm0
    203 ; X32:    sbbl %eax
    204 
    205 ; X64-LABEL: ptestz_2:
    206 ; X64:    ptest %xmm1, %xmm0
    207 ; X64:    sbbl %eax
    208 }
    209 
    210 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
        ; Despite the function name, this exercises the ptestnzc intrinsic, which
        ; is expected to lower to ptest followed by seta. Function-name checks use
        ; the -LABEL form so the body checks stay scoped to this function.
    211         %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
    212         ret i32 %tmp1
    213 ; X32-LABEL: ptestz_3:
    214 ; X32:    ptest %xmm1, %xmm0
    215 ; X32:    seta %al
    216 
    217 ; X64-LABEL: ptestz_3:
    218 ; X64:    ptest %xmm1, %xmm0
    219 ; X64:    seta %al
    220 }
    221 
    222 
    223 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
    224 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
    225 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
    226 
    227 ; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
    228 ; pointless.
    229 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
        ; Adds the two lanes of %A and %B element-wise and rebuilds a <2 x float>
        ; from the scalar sums. The checks require an insertps $16 and forbid any
        ; insertps $0 before or after it, up to the return.
    230 entry:
    231   %tmp7 = extractelement <2 x float> %A, i32 0
    232   %tmp5 = extractelement <2 x float> %A, i32 1
    233   %tmp3 = extractelement <2 x float> %B, i32 0
    234   %tmp1 = extractelement <2 x float> %B, i32 1
    235   %add.r = fadd float %tmp7, %tmp3
    236   %add.i = fadd float %tmp5, %tmp1
    237   %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
    238   %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
    239   ret <2 x float> %tmp9
    240 ; X32-LABEL: buildvector:
    241 ; X32-NOT: insertps $0
    242 ; X32: insertps $16
    243 ; X32-NOT: insertps $0
    244 ; X32: ret
    245 ; X64-LABEL: buildvector:
    246 ; X64-NOT: insertps $0
    247 ; X64: insertps $16
    248 ; X64-NOT: insertps $0
    249 ; X64: ret
    250 }
    251 
    252 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
        ; The shuffle keeps lanes 0-2 of %a and places lane 0 of the loaded vector
        ; in lane 3. Expected: one insertps $48, with no movss or shufps before it.
    253 entry:
    254   %0 = load <4 x float>* %pb, align 16
    255   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    256   ret <4 x float> %vecinit6
    257 ; CHECK-LABEL: insertps_from_shufflevector_1:
    258 ; CHECK-NOT: movss
    259 ; CHECK-NOT: shufps
    260 ; CHECK: insertps    $48,
    261 ; CHECK: ret
    262 }
    263 
    264 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
        ; The shuffle replaces lane 2 of %a with lane 1 of %b (mask index 5).
        ; Expected: insertps $96 with no shufps before it.
    265 entry:
    266   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
    267   ret <4 x float> %vecinit6
    268 ; CHECK-LABEL: insertps_from_shufflevector_2:
    269 ; CHECK-NOT: shufps
    270 ; CHECK: insertps    $96,
    271 ; CHECK: ret
    272 }
    273 
    274 ; For loading an i32 from memory into an xmm register we use pinsrd
    275 ; instead of insertps
    276 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
        ; Integer version of insertps_from_shufflevector_1: lane 0 of the loaded
        ; vector goes into lane 3 of %a. Expected: pinsrd $3, with no movss or
        ; shufps before it.
    277 entry:
    278   %0 = load <4 x i32>* %pb, align 16
    279   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    280   ret <4 x i32> %vecinit6
    281 ; CHECK-LABEL: pinsrd_from_shufflevector_i32:
    282 ; CHECK-NOT: movss
    283 ; CHECK-NOT: shufps
    284 ; CHECK: pinsrd  $3,
    285 ; CHECK: ret
    286 }
    287 
    288 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
        ; Register-to-register integer shuffle: lane 1 of %a is replaced with lane
        ; 3 of %b (mask index 7). Expected: insertps $208 with no shufps or movaps
        ; before it.
    289 entry:
    290   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
    291   ret <4 x i32> %vecinit6
    292 ; CHECK-LABEL: insertps_from_shufflevector_i32_2:
    293 ; CHECK-NOT: shufps
    294 ; CHECK-NOT: movaps
    295 ; CHECK: insertps    $208,
    296 ; CHECK: ret
    297 }
    298 
    299 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
        ; A scalar float load is put in lane 0 of an undef vector and then shuffled
        ; into lane 1 of %a. Expected: insertps $16 with no movss or shufps before
        ; it.
    300 ; CHECK-LABEL: insertps_from_load_ins_elt_undef:
    301 ; CHECK-NOT: movss
    302 ; CHECK-NOT: shufps
    303 ; CHECK: insertps    $16,
    304 ; CHECK: ret
    305   %1 = load float* %b, align 4
    306   %2 = insertelement <4 x float> undef, float %1, i32 0
    307   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
    308   ret <4 x float> %result
    309 }
    310 
    311 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
        ; Integer version of the test above, targeting lane 2 of %a. Expected:
        ; insertps $32 with no shufps before it. The directive that would forbid
        ; movd is intentionally disabled (its prefix is mangled) until the TODO
        ; below is resolved.
    312 ; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
    313 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
    314 ;; aCHECK-NOT: movd
    315 ; CHECK-NOT: shufps
    316 ; CHECK: insertps    $32,
    317 ; CHECK: ret
    318   %1 = load i32* %b, align 4
    319   %2 = insertelement <4 x i32> undef, i32 %1, i32 0
    320   %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
    321   ret <4 x i32> %result
    322 }
    323 
    324 ;;;;;; Shuffles optimizable with a single insertps instruction
    325 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], x[1], x[2], 0.0} element by element. Expected: a single
        ; insertps $8 (only the zero-mask bit for lane 3 set), with no pextrd or
        ; punpckldq before it.
    326 ; CHECK-LABEL: shuf_XYZ0:
    327 ; CHECK-NOT: pextrd
    328 ; CHECK-NOT: punpckldq
    329 ; CHECK: insertps    $8
    330 ; CHECK: ret
    331   %vecext = extractelement <4 x float> %x, i32 0
    332   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    333   %vecext1 = extractelement <4 x float> %x, i32 1
    334   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    335   %vecext3 = extractelement <4 x float> %x, i32 2
    336   %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
    337   %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
    338   ret <4 x float> %vecinit5
    339 }
    340 
    341 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], x[1], 0.0, 0.0}. Expected: a single insertps $12 (zero-mask
        ; bits for lanes 2 and 3), with no pextrd or punpckldq before it.
    342 ; CHECK-LABEL: shuf_XY00:
    343 ; CHECK-NOT: pextrd
    344 ; CHECK-NOT: punpckldq
    345 ; CHECK: insertps    $12
    346 ; CHECK: ret
    347   %vecext = extractelement <4 x float> %x, i32 0
    348   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    349   %vecext1 = extractelement <4 x float> %x, i32 1
    350   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    351   %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
    352   %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
    353   ret <4 x float> %vecinit4
    354 }
    355 
    356 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], x[1], x[1], 0.0}: lane 1 is duplicated into lane 2 and lane
        ; 3 is zeroed. Expected: a single insertps $104, with no pextrd or
        ; punpckldq before it.
    357 ; CHECK-LABEL: shuf_XYY0:
    358 ; CHECK-NOT: pextrd
    359 ; CHECK-NOT: punpckldq
    360 ; CHECK: insertps    $104
    361 ; CHECK: ret
    362   %vecext = extractelement <4 x float> %x, i32 0
    363   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    364   %vecext1 = extractelement <4 x float> %x, i32 1
    365   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    366   %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
    367   %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
    368   ret <4 x float> %vecinit5
    369 }
    370 
    371 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], x[1], x[3], 0.0}: lane 3 is copied into lane 2 and lane 3
        ; is zeroed. Expected: a single insertps $232.
    372 ; CHECK-LABEL: shuf_XYW0:
    373 ; CHECK: insertps    $232
    374 ; CHECK: ret
    375   %vecext = extractelement <4 x float> %x, i32 0
    376   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    377   %vecext1 = extractelement <4 x float> %x, i32 1
    378   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    379   %vecext2 = extractelement <4 x float> %x, i32 3
    380   %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
    381   %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
    382   ret <4 x float> %vecinit4
    383 }
    384 
    385 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[3], 0.0, 0.0, x[3]}: lane 3 is copied into lane 0 and lanes 1
        ; and 2 are zeroed. Expected: a single insertps $198, with no pextrd or
        ; punpckldq before it.
    386 ; CHECK-LABEL: shuf_W00W:
    387 ; CHECK-NOT: pextrd
    388 ; CHECK-NOT: punpckldq
    389 ; CHECK: insertps    $198
    390 ; CHECK: ret
    391   %vecext = extractelement <4 x float> %x, i32 3
    392   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    393   %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    394   %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
    395   %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
    396   ret <4 x float> %vecinit4
    397 }
    398 
    399 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], 0.0, 0.0, a[0]}; the last lane comes from the second
        ; argument through a shufflevector. The checks expect an insertps $48
        ; with no movaps or shufps before it.
    400 ; CHECK-LABEL: shuf_X00A:
    401 ; CHECK-NOT: movaps
    402 ; CHECK-NOT: shufps
    403 ; CHECK: insertps    $48
    404 ; CHECK: ret
    405   %vecext = extractelement <4 x float> %x, i32 0
    406   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    407   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    408   %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
    409   %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    410   ret <4 x float> %vecinit4
    411 }
    412 
    413 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], 0.0, 0.0, x[0]}; same shape as shuf_X00A but the final
        ; shufflevector takes its second operand from %x itself. The checks expect
        ; an insertps $48 with no movaps or shufps before it.
    414 ; CHECK-LABEL: shuf_X00X:
    415 ; CHECK-NOT: movaps
    416 ; CHECK-NOT: shufps
    417 ; CHECK: insertps    $48
    418 ; CHECK: ret
    419   %vecext = extractelement <4 x float> %x, i32 0
    420   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    421   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    422   %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
    423   %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    424   ret <4 x float> %vecinit4
    425 }
    426 
    427 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
        ; Builds {x[0], 0.0, x[1], a[2]} via two chained shufflevectors. The checks
        ; allow exactly one shufps, then forbid movhlps and any further shufps
        ; before the expected insertps $176.
    428 ; CHECK-LABEL: shuf_X0YC:
    429 ; CHECK: shufps
    430 ; CHECK-NOT: movhlps
    431 ; CHECK-NOT: shufps
    432 ; CHECK: insertps    $176
    433 ; CHECK: ret
    434   %vecext = extractelement <4 x float> %x, i32 0
    435   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    436   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    437   %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
    438   %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
    439   ret <4 x float> %vecinit5
    440 }
    441 
    442 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_XYZ0: builds {x[0], x[1], x[2], 0}. Expected:
        ; insertps $8, with no pextrd or punpckldq before it.
    443 ; CHECK-LABEL: i32_shuf_XYZ0:
    444 ; CHECK-NOT: pextrd
    445 ; CHECK-NOT: punpckldq
    446 ; CHECK: insertps    $8
    447 ; CHECK: ret
    448   %vecext = extractelement <4 x i32> %x, i32 0
    449   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    450   %vecext1 = extractelement <4 x i32> %x, i32 1
    451   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    452   %vecext3 = extractelement <4 x i32> %x, i32 2
    453   %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
    454   %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
    455   ret <4 x i32> %vecinit5
    456 }
    457 
    458 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_XY00: builds {x[0], x[1], 0, 0}. Expected:
        ; insertps $12, with no pextrd or punpckldq before it.
    459 ; CHECK-LABEL: i32_shuf_XY00:
    460 ; CHECK-NOT: pextrd
    461 ; CHECK-NOT: punpckldq
    462 ; CHECK: insertps    $12
    463 ; CHECK: ret
    464   %vecext = extractelement <4 x i32> %x, i32 0
    465   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    466   %vecext1 = extractelement <4 x i32> %x, i32 1
    467   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    468   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
    469   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
    470   ret <4 x i32> %vecinit4
    471 }
    472 
    473 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_XYY0: builds {x[0], x[1], x[1], 0}. Expected:
        ; insertps $104, with no pextrd or punpckldq before it.
    474 ; CHECK-LABEL: i32_shuf_XYY0:
    475 ; CHECK-NOT: pextrd
    476 ; CHECK-NOT: punpckldq
    477 ; CHECK: insertps    $104
    478 ; CHECK: ret
    479   %vecext = extractelement <4 x i32> %x, i32 0
    480   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    481   %vecext1 = extractelement <4 x i32> %x, i32 1
    482   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    483   %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
    484   %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
    485   ret <4 x i32> %vecinit5
    486 }
    487 
    488 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_XYW0: builds {x[0], x[1], x[3], 0}. Expected:
        ; insertps $232, with no pextrd or punpckldq before it.
    489 ; CHECK-LABEL: i32_shuf_XYW0:
    490 ; CHECK-NOT: pextrd
    491 ; CHECK-NOT: punpckldq
    492 ; CHECK: insertps    $232
    493 ; CHECK: ret
    494   %vecext = extractelement <4 x i32> %x, i32 0
    495   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    496   %vecext1 = extractelement <4 x i32> %x, i32 1
    497   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    498   %vecext2 = extractelement <4 x i32> %x, i32 3
    499   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
    500   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
    501   ret <4 x i32> %vecinit4
    502 }
    503 
    504 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_W00W: builds {x[3], 0, 0, x[3]}. Expected:
        ; insertps $198, with no pextrd or punpckldq before it.
    505 ; CHECK-LABEL: i32_shuf_W00W:
    506 ; CHECK-NOT: pextrd
    507 ; CHECK-NOT: punpckldq
    508 ; CHECK: insertps    $198
    509 ; CHECK: ret
    510   %vecext = extractelement <4 x i32> %x, i32 3
    511   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    512   %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    513   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
    514   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
    515   ret <4 x i32> %vecinit4
    516 }
    517 
    518 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_X00A: builds {x[0], 0, 0, a[0]}. The checks
        ; expect an insertps $48 with no movaps or shufps before it.
    519 ; CHECK-LABEL: i32_shuf_X00A:
    520 ; CHECK-NOT: movaps
    521 ; CHECK-NOT: shufps
    522 ; CHECK: insertps    $48
    523 ; CHECK: ret
    524   %vecext = extractelement <4 x i32> %x, i32 0
    525   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    526   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    527   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
    528   %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    529   ret <4 x i32> %vecinit4
    530 }
    531 
    532 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_X00X: builds {x[0], 0, 0, x[0]}, with the final
        ; shufflevector reading its second operand from %x. The checks expect an
        ; insertps $48 with no movaps or shufps before it.
    533 ; CHECK-LABEL: i32_shuf_X00X:
    534 ; CHECK-NOT: movaps
    535 ; CHECK-NOT: shufps
    536 ; CHECK: insertps    $48
    537 ; CHECK: ret
    538   %vecext = extractelement <4 x i32> %x, i32 0
    539   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    540   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    541   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
    542   %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    543   ret <4 x i32> %vecinit4
    544 }
    545 
    546 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
        ; Integer version of shuf_X0YC: builds {x[0], 0, x[1], a[2]}. The checks
        ; allow exactly one shufps, then forbid movhlps and any further shufps
        ; before the expected insertps $176.
    547 ; CHECK-LABEL: i32_shuf_X0YC:
    548 ; CHECK: shufps
    549 ; CHECK-NOT: movhlps
    550 ; CHECK-NOT: shufps
    551 ; CHECK: insertps    $176
    552 ; CHECK: ret
    553   %vecext = extractelement <4 x i32> %x, i32 0
    554   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    555   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    556   %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
    557   %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
    558   ret <4 x i32> %vecinit5
    559 }
    560 
    561 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
    562 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
        ; Builds {x[0], x[1], x[2], 0.0} and then selects, per lane, between that
        ; vector and the original %x using an ordered less-than compare. The
        ; checks pin the exact consecutive sequence: a movaps copy, insertps $8 on
        ; the copy, maxps, ret.
    563 ; CHECK-LABEL: test_insertps_no_undef:
    564 ; CHECK: movaps  %xmm0, %xmm1
    565 ; CHECK-NEXT: insertps        $8, %xmm1, %xmm1
    566 ; CHECK-NEXT: maxps   %xmm1, %xmm0
    567 ; CHECK-NEXT: ret
    568   %vecext = extractelement <4 x float> %x, i32 0
    569   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    570   %vecext1 = extractelement <4 x float> %x, i32 1
    571   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    572   %vecext3 = extractelement <4 x float> %x, i32 2
    573   %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
    574   %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
    575   %mask = fcmp olt <4 x float> %vecinit5, %x
    576   %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
    577   ret <4 x float> %res
    578 }
    579 
    580 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
        ; A select over <8 x i16> driven by an <8 x i1> mask is expected to be
        ; lowered with a byte-granular variable blend (the pattern below matches
        ; any mnemonic containing "blendvb"). The label check carries the trailing
        ; colon, like every other label check in this file, so it anchors on the
        ; function's label line.
    581 ; CHECK-LABEL: blendvb_fallback:
    582 ; CHECK: blendvb
    583 ; CHECK: ret
    584   %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
    585   ret <8 x i16> %ret
    586 }
    587 
    588 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
        ; A full vector load feeding the insertps intrinsic (immediate 48). The
        ; checks forbid any mov other than the i686 argument load and require the
        ; insertps $48 to be immediately followed by the return.
    589 ; CHECK-LABEL: insertps_from_vector_load:
    590 ; On X32, account for the argument's move to registers
    591 ; X32: movl    4(%esp), %eax
    592 ; CHECK-NOT: mov
    593 ; CHECK: insertps    $48
    594 ; CHECK-NEXT: ret
    595   %1 = load <4 x float>* %pb, align 16
    596   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
    597   ret <4 x float> %2
    598 }
    599 
    600 ;; Use a non-zero CountS for insertps
    601 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
        ; Same as insertps_from_vector_load but with immediate 96. The check
        ; requires the insertps memory operand to carry a displacement of 4 from
        ; the pointer register, and the return to follow immediately.
    602 ; CHECK-LABEL: insertps_from_vector_load_offset:
    603 ; On X32, account for the argument's move to registers
    604 ; X32: movl    4(%esp), %eax
    605 ; CHECK-NOT: mov
    606 ;; Try to match a bit more of the instr, since we need the load's offset.
    607 ; CHECK: insertps    $96, 4(%{{...}}), %
    608 ; CHECK-NEXT: ret
    609   %1 = load <4 x float>* %pb, align 16
    610   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
    611   ret <4 x float> %2
    612 }
    613 
    614 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
        ; The vector is loaded from %pb indexed by %index and fed to the insertps
        ; intrinsic with immediate 192. The check requires a base+index memory
        ; operand with a displacement of 12, and the return to follow immediately.
    615 ; CHECK-LABEL: insertps_from_vector_load_offset_2:
    616 ; On X32, account for the argument's move to registers
    617 ; X32: movl    4(%esp), %eax
    618 ; X32: movl    8(%esp), %ecx
    619 ; CHECK-NOT: mov
    620 ;; Try to match a bit more of the instr, since we need the load's offset.
    621 ; CHECK: insertps    $192, 12(%{{...}},%{{...}}), %
    622 ; CHECK-NEXT: ret
    623   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
    624   %2 = load <4 x float>* %1, align 16
    625   %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
    626   ret <4 x float> %3
    627 }
    628 
    629 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
        ; A scalar float loaded from %fb[%index] is splatted to all four lanes and
        ; used as the source of the insertps intrinsic (immediate 48). The checks
        ; forbid any mov beyond the i686 argument loads and require the
        ; insertps $48 to be immediately followed by the return.
    630 ; CHECK-LABEL: insertps_from_broadcast_loadf32:
    631 ; On X32, account for the arguments' move to registers
    632 ; X32: movl    8(%esp), %eax
    633 ; X32: movl    4(%esp), %ecx
    634 ; CHECK-NOT: mov
    635 ; CHECK: insertps    $48
    636 ; CHECK-NEXT: ret
    637   %1 = getelementptr inbounds float* %fb, i64 %index
    638   %2 = load float* %1, align 4
    639   %3 = insertelement <4 x float> undef, float %2, i32 0
    640   %4 = insertelement <4 x float> %3, float %2, i32 1
    641   %5 = insertelement <4 x float> %4, float %2, i32 2
    642   %6 = insertelement <4 x float> %5, float %2, i32 3
    643   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
    644   ret <4 x float> %7
    645 }
    646 
    647 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
        ; Lane 0 of a loaded vector is splatted to all four lanes and used as the
        ; source of the insertps intrinsic (immediate 48). The checks forbid any
        ; mov beyond the i686 argument load and require the insertps $48 to be
        ; immediately followed by the return.
    648 ; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
    649 ; On X32, account for the arguments' move to registers
    650 ; X32: movl    4(%esp), %{{...}}
    651 ; CHECK-NOT: mov
    652 ; CHECK: insertps    $48
    653 ; CHECK-NEXT: ret
    654   %1 = load <4 x float>* %b, align 4
    655   %2 = extractelement <4 x float> %1, i32 0
    656   %3 = insertelement <4 x float> undef, float %2, i32 0
    657   %4 = insertelement <4 x float> %3, float %2, i32 1
    658   %5 = insertelement <4 x float> %4, float %2, i32 2
    659   %6 = insertelement <4 x float> %5, float %2, i32 3
    660   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
    661   ret <4 x float> %7
    662 }
    663 
    664 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
    665 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
        ; One splatted scalar load feeds four insertps intrinsic calls (immediate
        ; 48 each) whose results are summed pairwise. The checks expect a single
        ; movss for the load, no further mov, four insertps $48, three addps, and
        ; then the return.
    666 ; CHECK-LABEL: insertps_from_broadcast_multiple_use:
    667 ; On X32, account for the arguments' move to registers
    668 ; X32: movl    8(%esp), %eax
    669 ; X32: movl    4(%esp), %ecx
    670 ; CHECK: movss
    671 ; CHECK-NOT: mov
    672 ; CHECK: insertps    $48
    673 ; CHECK: insertps    $48
    674 ; CHECK: insertps    $48
    675 ; CHECK: insertps    $48
    676 ; CHECK: addps
    677 ; CHECK: addps
    678 ; CHECK: addps
    679 ; CHECK-NEXT: ret
    680   %1 = getelementptr inbounds float* %fb, i64 %index
    681   %2 = load float* %1, align 4
    682   %3 = insertelement <4 x float> undef, float %2, i32 0
    683   %4 = insertelement <4 x float> %3, float %2, i32 1
    684   %5 = insertelement <4 x float> %4, float %2, i32 2
    685   %6 = insertelement <4 x float> %5, float %2, i32 3
    686   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
    687   %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
    688   %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
    689   %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
    690   %11 = fadd <4 x float> %7, %8
    691   %12 = fadd <4 x float> %9, %10
    692   %13 = fadd <4 x float> %11, %12
    693   ret <4 x float> %13
    694 }
    695 
    696 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
        ; The shuffle mask mixes a loaded scalar (index 4), an undef lane, lane 0
        ; of %a and an undef lane of the second operand (index 7). The checks
        ; expect an insertps $32 whose source operand is %xmm0, with no shufps
        ; before it.
    697 ; CHECK-LABEL: insertps_with_undefs:
    698 ; CHECK-NOT: shufps
    699 ; CHECK: insertps    $32, %xmm0
    700 ; CHECK: ret
    701   %1 = load float* %b, align 4
    702   %2 = insertelement <4 x float> undef, float %1, i32 0
    703   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
    704   ret <4 x float> %result
    705 }
    706 
    707 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
    708 ; the destination index to change the load, instead of the source index.
    709 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
        ; Regression test: the shuffle takes lanes 0 and 2 of %a (indices 4 and 6)
        ; and puts lane 2 of the loaded vector in lane 3. The checks expect an
        ; insertps $48 before the return.
    710 ; CHECK-LABEL: pr20087:
    711 ; CHECK: insertps  $48
    712 ; CHECK: ret
    713   %load = load <4 x float> *%ptr
    714   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
    715   ret <4 x float> %ret
    716 }
    717