; RUN: llc < %s -mattr=sse2 -mtriple=i386-unknown-linux-gnu | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a, int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][...] and one for a[LOC+20][...]. visitMUL in
; DAGCombiner.cpp should combine the instructions in such a way to
; avoid the extra multiply: (mul (add x, c1), c2) is rewritten as
; (add (mul x, c2), c1*c2), so every address computation reuses the
; one imull of lll by 400, with the constants folded into the
; displacements (2020, 2080, and 10080 below).
;
; Output looks roughly like this:
;
;	movl	8(%esp), %eax
;	movl	12(%esp), %ecx
;	imull	$400, %ecx, %edx        # imm = 0x190
;	leal	(%edx,%eax), %esi
;	movl	$11, 2020(%esi,%ecx,4)
;	movl	$22, 2080(%edx,%eax)
;	movl	$33, 10080(%edx,%eax)
;
; CHECK-LABEL: testCombineMultiplies
; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190
; CHECK-NEXT: leal ([[MUL]],[[ARG2:%[a-z]+]]), [[LEA:%[a-z]+]]
; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4)
; CHECK-NEXT: movl $22, {{[0-9]+}}([[MUL]],[[ARG2]])
; CHECK-NEXT: movl $33, {{[0-9]+}}([[MUL]],[[ARG2]])
; CHECK: retl
;

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) {
entry:
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:            # @testCombineMultiplies_splat
; # BB#0:                                 # %entry
;	movdqa	.LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
;	paddd	%xmm0, %xmm1
;	movdqa	.LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
;	pshufd	$245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
;	pmuludq	%xmm2, %xmm0
;	pshufd	$232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
;	pmuludq	%xmm2, %xmm3
;	pshufd	$232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
;	punpckldq	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
;	movdqa	.LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
;	paddd	%xmm0, %xmm2
;	paddd	.LCPI1_3, %xmm0
;	movdqa	%xmm2, v2
;	movdqa	%xmm0, v3
;	movdqa	%xmm1, x
;	retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
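;
; The constant-pool vectors referenced below fall out of the same
; (add x, c1) * c2 --> (x * c2) + (c1 * c2) fold, applied per lane.
; As a worked check of the arithmetic (not taken from actual output):
;
;   (v1 + {11,11,11,11}) * {22,22,22,22} == 22*v1 + {242,242,242,242}  (.LCPI1_2)
;   (v1 + {33,33,33,33}) * {22,22,22,22} == 22*v1 + {726,726,726,726}  (.LCPI1_3, matching the [[C726]] capture)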
;
; CHECK-LABEL: testCombineMultiplies_splat
; CHECK: movdqa .LCPI1_0, [[C11:%xmm[0-9]]]
; CHECK-NEXT: paddd %xmm0, [[C11]]
; CHECK-NEXT: movdqa .LCPI1_1, [[C22:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T4:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T4]], [[T5:%xmm[0-9]]]
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
; CHECK-NEXT: movdqa .LCPI1_2, [[C242:%xmm[0-9]]]
; CHECK-NEXT: paddd [[T6]], [[C242]]
; CHECK-NEXT: paddd .LCPI1_3, [[C726:%xmm[0-9]]]
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: movdqa [[C726]], v3
; CHECK-NEXT: movdqa [[C11]], x
; CHECK-NEXT: retl

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) {
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splat vector case. This is very similar
; to the previous test case, except for the vector values.
;
; CHECK-LABEL: testCombineMultiplies_non_splat
; CHECK: movdqa .LCPI2_0, [[C11:%xmm[0-9]]]
; CHECK-NEXT: paddd %xmm0, [[C11]]
; CHECK-NEXT: movdqa .LCPI2_1, [[C22:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]]
; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]]
; CHECK-NEXT: pshufd $245, [[C22]], [[T7:%xmm[0-9]]]
; CHECK-NEXT: pmuludq [[T1]], [[T7]]
; CHECK-NEXT: pshufd $232, [[T7]], [[T5:%xmm[0-9]]]
; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]]
; CHECK-NEXT: movdqa .LCPI2_2, [[C242:%xmm[0-9]]]
; CHECK-NEXT: paddd [[T6]], [[C242]]
; CHECK-NEXT: paddd .LCPI2_3, [[C726:%xmm[0-9]]]
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: movdqa [[C726]], v3
; CHECK-NEXT: movdqa [[C11]], x
; CHECK-NEXT: retl

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) {
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}
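
; As in the splat test, the constant pools expected for the non-splat case
; follow from applying the fold lane by lane. As a worked check of the
; arithmetic (not taken from actual compiler output):
;
;   {11,22,33,44} * {22,33,44,55} == {242,  726, 1452, 2420}   (.LCPI2_2)
;   {33,44,55,66} * {22,33,44,55} == {726, 1452, 2420, 3630}   (.LCPI2_3)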