Home | History | Annotate | Download | only in X86
      1 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O0 < %s | FileCheck %s
      2 
      3 ; Check that at -O0, the backend doesn't attempt to canonicalize a vector load
      4 ; used by an INSERTPS into a scalar load plus scalar_to_vector.
      5 ;
      6 ; In order to fold a load into the memory operand of an INSERTPSrm, the backend
      7 ; tries to canonicalize a vector load that feeds an INSERTPS node into a
      8 ; scalar load plus scalar_to_vector. This would allow ISel to match the
      9 ; INSERTPSrm variant rather than a load plus INSERTPSrr.
     10 ;
     11 ; However, ISel can only select an INSERTPSrm if folding a load into the operand
     12 ; of an insertps is considered to be profitable.
     13 ;
     14 ; In the example below:
     15 ;
     16 ; __m128 test(__m128 a, __m128 *b) {
     17 ;   __m128 c = _mm_insert_ps(a, *b, 1 << 6);
     18 ;   return c;
     19 ; }
     20 ;
     21 ; At -O0, the backend would attempt to canonicalize the load from 'b' into
     22 ; a scalar load in the hope of matching an INSERTPSrm.
     23 ; However, ISel would fail to recognize an INSERTPSrm since load folding is
     24 ; always considered unprofitable at -O0. This would leave the insertps mask
     25 ; in an invalid state.
     26 ;
     27 ; The problem with the canonicalization rule performed by the backend is that
     28 ; it assumes ISel to always be able to match an INSERTPSrm. This assumption is
     29 ; not always correct at -O0. In this example, FastISel fails to lower the
     30 ; arguments needed by the entry block. This is enough to enable the DAGCombiner
     31 ; and eventually trigger the canonicalization on the INSERTPS node.
     32 ;
     33 ; This test checks that the vector load feeding the insertps is not
     34 ; canonicalized into a scalar load plus scalar_to_vector (a movss).
     35 
     36 define <4 x float> @test(<4 x float> %a, <4 x float>* %b) { ; insertps of a vector loaded from %b into %a, compiled at -O0
     37 ; CHECK-LABEL: test:
     38 ; CHECK: movaps (%rdi), [[REG:%[a-z0-9]+]]
     39 ; CHECK-NOT: movss
     40 ; CHECK: insertps $64, [[REG]],
     41 ; CHECK: ret
     42 entry:
     43   %0 = load <4 x float>, <4 x float>* %b, align 16 ; full 16-byte-aligned vector load; expected to stay a movaps, not become a scalar movss
     44   %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %0, i32 64) ; i32 64 == 1 << 6, the immediate used in the C example in the header
     45   %2 = alloca <4 x float>, align 16 ; stack slot that receives the insertps result
     46   store <4 x float> %1, <4 x float>* %2, align 16
     47   %3 = load <4 x float>, <4 x float>* %2, align 16 ; NOTE(review): store/reload presumably mirrors unoptimized front-end output for the local 'c' -- confirm
     48   ret <4 x float> %3
     49 }
     50 
     51 
     52 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) ; SSE4.1 insertps intrinsic: two <4 x float> operands plus an i32 immediate
     53