; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a, int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate two multiply instructions,
; one for the a[LOC][...] accesses and one for a[LOC+20][...]. visitMUL
; in DAGCombiner.cpp should combine the instructions so as to avoid the
; extra multiply.
;
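; A sketch of the address arithmetic (LOC = lll + 5, and each
; [100 x i32] row is 400 bytes):
;
;   &a[LOC][LOC]   = a + 400*(lll+5) + 4*(lll+5)  = a + 400*lll + 4*lll + 2020
;   &a[LOC][20]    = a + 400*(lll+5) + 4*20       = a + 400*lll + 2080
;   &a[LOC+20][20] = a + 400*(lll+25) + 4*20      = a + 400*lll + 10080
;
; All three addresses share the single 400*lll product, so one imull plus
; folded displacements (2020, 2080, 10080) suffices.
;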
; Output looks roughly like this:
;
;	movl	8(%esp), %eax
;	movl	12(%esp), %ecx
;	imull	$400, %ecx, %edx        # imm = 0x190
;	leal	(%edx,%eax), %esi
;	movl	$11, 2020(%esi,%ecx,4)
;	movl	$22, 2080(%edx,%eax)
;	movl	$33, 10080(%edx,%eax)

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
; CHECK-NEXT:    leal (%eax,%edx), %esi
; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
; CHECK-NEXT:    movl $22, 2080(%eax,%edx)
; CHECK-NEXT:    movl $33, 10080(%eax,%edx)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    retl
entry:
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:                              # @testCombineMultiplies_splat
; # %bb.0:                                 # %entry
; 	movdqa	.LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
; 	paddd	%xmm0, %xmm1
; 	movdqa	.LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
; 	pshufd	$245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
; 	pmuludq	%xmm2, %xmm0
; 	pshufd	$232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
; 	pmuludq	%xmm2, %xmm3
; 	pshufd	$232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
; 	punpckldq	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; 	movdqa	.LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
; 	paddd	%xmm0, %xmm2
; 	paddd	.LCPI1_3, %xmm0
; 	movdqa	%xmm2, v2
; 	movdqa	%xmm0, v3
; 	movdqa	%xmm1, x
; 	retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of
; two pmuludq instructions), followed by two adds. Without this
; optimization, we'd do two adds followed by two multiplies (i.e., four
; pmuludq instructions).
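;
; A sketch of the rewrite (constants distributed through the multiply):
;
;   (v1 + {11,11,11,11}) * {22,22,22,22} = v1*22 + {242,242,242,242}
;   (v1 + {33,33,33,33}) * {22,22,22,22} = v1*22 + {726,726,726,726}
;
; Both products share the single v1*22 multiply, which SSE2 lowers to two
; pmuludq instructions plus shuffles (there is no 32-bit pmulld before
; SSE4.1); the remaining work is two paddd instructions whose constants
; (the 242 splat, and presumably a 726 splat in the .LCPI pool) are folded.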

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm3
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splat vector case. This is very similar
; to the previous test case, except for the vector values.
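;
; A sketch of the same fold, now lane-wise:
;
;   (v1 + {11,22,33,44}) * {22,33,44,55} = v1*{22,33,44,55} + {242,726,1452,2420}
;   (v1 + {33,44,55,66}) * {22,33,44,55} = v1*{22,33,44,55} + {726,1452,2420,3630}
;
; The single vector multiply is still shared, and the folded constant
; [242,726,1452,2420] appears directly in the CHECK lines below.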

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm3, %xmm2
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}