; Tests for SSE2 and below, without SSE3+.
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s

; test1: element 0 of the result comes from the scalar %B (mask index 2) and
; element 1 from the loaded vector, so the low half is expected to be
; replaced with a single movlpd from the stack.
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
  %tmp3 = load <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void

; CHECK: test1:
; CHECK: movl 8(%esp), %eax
; CHECK-NEXT: movapd (%eax), %xmm0
; CHECK-NEXT: movlpd 12(%esp), %xmm0
; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: ret
}

; test2: mirror image of test1. Element 0 is kept from the loaded vector and
; element 1 comes from the scalar %B, so a movhpd is expected instead.
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
  %tmp3 = load <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void

; CHECK: test2:
; CHECK: movl 8(%esp), %eax
; CHECK-NEXT: movapd (%eax), %xmm0
; CHECK-NEXT: movhpd 12(%esp), %xmm0
; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: ret
}

; test3: rebuilds the vector <A[0], B[0], A[1], B[1]> one element at a time
; with extractelement/insertelement; the chain is expected to become an
; unpcklps.
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
  %tmp = load <4 x float>* %B
  %tmp3 = load <4 x float>* %A
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0
  %tmp7 = extractelement <4 x float> %tmp, i32 0
  %tmp8 = extractelement <4 x float> %tmp3, i32 1
  %tmp9 = extractelement <4 x float> %tmp, i32 1
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
  store <4 x float> %tmp13, <4 x float>* %res
  ret void

; CHECK: test3:
; CHECK: unpcklps
}

; test4: mask indices 6 and 7 select from the undef second operand, so only
; result elements 0 and 2 are defined; a single pshufd is expected.
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >
  store <4 x float> %tmp5, <4 x float>* %res
  ret void

; CHECK: test4:
; CHECK: pshufd $50, %xmm0, %xmm0
}

; test5: loads one float, zero-fills the remaining lanes, then interleaves
; the bytes with zero and the resulting words with zero. Expected to use
; pxor for the zero vector followed by punpcklbw and punpcklwd.
define <4 x i32> @test5(i8** %ptr) nounwind {
; CHECK: test5:
; CHECK: pxor
; CHECK: punpcklbw
; CHECK: punpcklwd

  %tmp = load i8** %ptr
  %tmp.upgrd.1 = bitcast i8* %tmp to float*
  %tmp.upgrd.2 = load float* %tmp.upgrd.1
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>
  ret <4 x i32> %tmp36
}

; test6: only result element 0 is defined (indices 5, 6 and 7 select from the
; undef operand), so the shuffle is expected to reduce to a plain aligned
; load and store.
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
  %tmp1 = load <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >
  store <4 x float> %tmp2, <4 x float>* %res
  ret void

; CHECK: test6:
; CHECK: movaps (%eax), %xmm0
; CHECK: movaps %xmm0, (%eax)
}

; test7: splat of element 0 of an all-zero vector, stored to address 0.
; The two unnamed instructions below are %1 and %2.
define void @test7() nounwind {
  bitcast <4 x i32> zeroinitializer to <4 x float>
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
  store <4 x float> %2, <4 x float>* null
  ret void

; CHECK: test7:
; CHECK: pxor %xmm0, %xmm0
; CHECK: movaps %xmm0, 0
}

@x = external global [4 x i32]

; test8: four scalar loads of the consecutive elements of @x, assembled into
; one vector; expected to be merged into a single unaligned vector load.
define <2 x i64> @test8() nounwind {
  %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0)
  %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1)
  %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2)
  %tmp7 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3)
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>
  ret <2 x i64> %tmp16

; CHECK: test8:
; CHECK: movups (%eax), %xmm0
}

; test9: vector built from four float arguments that follow an i32 dummy
; argument; expected to be one unaligned load from 8(%esp).
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
  %tmp = insertelement <4 x float> undef, float %a, i32 0
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3
  ret <4 x float> %tmp13

; CHECK: test9:
; CHECK: movups 8(%esp), %xmm0
}

; test10: same as test9 without the leading dummy argument; expected to be
; one aligned load from 4(%esp).
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
  %tmp = insertelement <4 x float> undef, float %a, i32 0
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3
  ret <4 x float> %tmp13

; CHECK: test10:
; CHECK: movaps 4(%esp), %xmm0
}

; test11: vector built from two double arguments; expected to be one aligned
; load from 4(%esp).
define <2 x double> @test11(double %a, double %b) nounwind {
  %tmp = insertelement <2 x double> undef, double %a, i32 0
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1
  ret <2 x double> %tmp7

; CHECK: test11:
; CHECK: movapd 4(%esp), %xmm0
}

; test12: adds two shuffles of the same loaded vector: its low half joined
; with two 1.0 constants, and its high half joined with two zeros. Expected
; to use movhlps and shufps.
define void @test12() nounwind {
  %tmp1 = load <4 x float>* null
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >
  %tmp4 = fadd <4 x float> %tmp2, %tmp3
  store <4 x float> %tmp4, <4 x float>* null
  ret void

; CHECK: test12:
; CHECK: movhlps
; CHECK: shufps
}

; test13: two-input shuffle with mask <1, 4, 1, 5>; expected to be lowered as
; a shufps followed by a pshufd. %A is not used by the body.
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
  %tmp3 = load <4 x float>* %B
  %tmp5 = load <4 x float>* %C
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >
  store <4 x float> %tmp11, <4 x float>* %res
  ret void

; CHECK: test13:
; CHECK: shufps $69, (%eax), %xmm0
; CHECK: pshufd $-40, %xmm0, %xmm0
}

; test14: joins the low half of (x + y) with the low half of (x - y);
; expected to end in a movlhps of the two results.
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
  %tmp = load <4 x float>* %y
  %tmp5 = load <4 x float>* %x
  %tmp9 = fadd <4 x float> %tmp5, %tmp
  %tmp21 = fsub <4 x float> %tmp5, %tmp
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >
  ret <4 x float> %tmp27

; CHECK: test14:
; CHECK: addps [[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
; CHECK: subps [[X1]], [[X2:%xmm[0-9]+]]
; CHECK: movlhps [[X2]], [[X0]]
}

; test15: joins the high half of x with the high half of y; expected to be a
; single movhlps.
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
entry:
  %tmp = load <4 x float>* %y
  %tmp3 = load <4 x float>* %x
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >
  ret <4 x float> %tmp4

; CHECK: test15:
; CHECK: movhlps %xmm1, %xmm0
}

; PR8900
; Extracts elements 0 and 2 of a <4 x double> load into a <2 x double>.
; The expectations are written ahead of the function they apply to.
; CHECK: test16:
; CHECK: unpcklpd
; CHECK: ret

define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
  %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
; No output patterns are attached to this function.
; NOTE(review): presumably it only has to compile without crashing - confirm
; against the bug report.
define fastcc void @test17() nounwind {
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float> * undef
  ret void
}

; PR9210
; fptrunc of a <4 x double> (the unnamed argument %0) to <4 x float>.
; No output patterns are attached to this function.
; NOTE(review): presumably a compile-only regression test - confirm against
; the bug report.
define <4 x float> @f(<4 x double>) nounwind {
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}