; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a,int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp
; should combine the instructions in such a way to avoid the extra
; multiply.
;
; Output looks roughly like this:
;
;   movl  8(%esp), %eax
;   movl  12(%esp), %ecx
;   imull $400, %ecx, %edx          # imm = 0x190
;   leal  (%edx,%eax), %esi
;   movl  $11, 2020(%esi,%ecx,4)
;   movl  $22, 2080(%edx,%eax)
;   movl  $33, 10080(%edx,%eax)

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
; CHECK-NEXT:    leal (%eax,%edx), %esi
; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
; CHECK-NEXT:    movl $22, 2080(%eax,%edx)
; CHECK-NEXT:    movl $33, 10080(%eax,%edx)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    retl
entry:
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:              # @testCombineMultiplies_splat
; # %bb.0:                                  # %entry
;   movdqa    .LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
;   paddd     %xmm0, %xmm1
;   movdqa    .LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
;   pshufd    $245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
;   pmuludq   %xmm2, %xmm0
;   pshufd    $232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
;   pmuludq   %xmm2, %xmm3
;   pshufd    $232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
;   punpckldq %xmm2, %xmm0            # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
;   movdqa    .LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
;   paddd     %xmm0, %xmm2
;   paddd     .LCPI1_3, %xmm0
;   movdqa    %xmm2, v2
;   movdqa    %xmm0, v3
;   movdqa    %xmm1, x
;   retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
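;
; For reference, the constant folding implied by the IR below (these values are
; derived from the IR, not taken from the generated assembly): the combine
; rewrites (v1 + C1) * C2 as v1*C2 + C1*C2, so
;   (v1 + {11,11,11,11}) * {22,22,22,22} = 22*v1 + {242,242,242,242}
;   (v1 + {33,33,33,33}) * {22,22,22,22} = 22*v1 + {726,726,726,726}
; which is why the checks show one pair of pmuludq instructions, an add of the
; 242 splat, and an add of a second constant-pool value (expected to be the
; 726 splat, elided by the {{\.LCPI.*}} pattern).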

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm3
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm3, %xmm2
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}
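
; For reference, the non-splat constants fold the same way (again derived from
; the IR above rather than from the checks): (v1 + C1) * C2 becomes v1*C2 + C1*C2,
; giving
;   {11,22,33,44} * {22,33,44,55} = {242,726,1452,2420}   (the constant checked above)
;   {33,44,55,66} * {22,33,44,55} = {726,1452,2420,3630}  (the constant-pool operand
;                                                          elided by {{\.LCPI.*}})
; so a single vector multiply of %v1 by {22,33,44,55} followed by two constant
; adds is expected, instead of two separate multiplies.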