; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; These are tests for SSE3 codegen.

; RUN: llc < %s -mtriple=x86_64-apple-darwin9 --mattr=+sse3 | FileCheck %s --check-prefix=X64

; Test for v8i16 lowering where we extract the first element of the vector and
; place it in the second element of the result.

define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
; X64-LABEL: t0:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movl $1, %eax
; X64-NEXT:    movd %eax, %xmm0
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  %tmp3 = load <8 x i16>, <8 x i16>* %old
  %tmp6 = shufflevector <8 x i16> %tmp3,
                        <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
                        <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  store <8 x i16> %tmp6, <8 x i16>* %dest
  ret void
}

define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; X64-LABEL: t1:
; X64:       ## BB#0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
; X64-NEXT:    movaps %xmm0, %xmm1
; X64-NEXT:    andnps (%rsi), %xmm1
; X64-NEXT:    andps (%rdi), %xmm0
; X64-NEXT:    orps %xmm1, %xmm0
; X64-NEXT:    retq
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
  ret <8 x i16> %tmp3
}

define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t2:
; X64:       ## BB#0:
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; X64-NEXT:    pand %xmm2, %xmm0
; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7]
; X64-NEXT:    pandn %xmm1, %xmm2
; X64-NEXT:    por %xmm2, %xmm0
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
  ret <8 x i16> %tmp
}

define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t3:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
  ret <8 x i16> %tmp
}

define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t4:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
  ret <8 x i16> %tmp
}

define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t5:
; X64:       ## BB#0:
; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
  ret <8 x i16> %tmp
}

define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t6:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
  ret <8 x i16> %tmp
}

define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t7:
; X64:       ## BB#0:
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; X64-NEXT:    retq
  %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
  ret <8 x i16> %tmp
}

define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
; X64-LABEL: t8:
; X64:       ## BB#0:
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
  %tmp = load <2 x i64>, <2 x i64>* %A
  %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
  %tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
  %tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1
  %tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2
  %tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3
  %tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4
  %tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5
  %tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6
  %tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7
  %tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1
  %tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2
  %tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3
  %tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4
  %tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5
  %tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6
  %tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7
  %tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
  store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
  ret void
}

define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X64-LABEL: t9:
; X64:       ## BB#0:
; X64-NEXT:    movapd (%rdi), %xmm0
; X64-NEXT:    movhpd (%rsi), %xmm0
; X64-NEXT:    movapd %xmm0, (%rdi)
; X64-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %r
  %tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
  %tmp.upgrd.4 = load double, double* %tmp.upgrd.3
  %tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0
  %tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1
  %tmp6 = bitcast <2 x double> %tmp5 to <4 x float>
  %tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0
  %tmp7 = extractelement <4 x float> %tmp, i32 1
  %tmp8 = extractelement <4 x float> %tmp6, i32 0
  %tmp9 = extractelement <4 x float> %tmp6, i32 1
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
  store <4 x float> %tmp13, <4 x float>* %r
  ret void
}

; FIXME: This testcase produces icky code. It can be made much better!
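; t10 truncates each i32 element of @g1 to i16 (the shuffle picks the low
; i16 of each pair on this little-endian target) and stores the packed
; <4 x i16> result to @g2.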
; PR2585

@g1 = external constant <4 x i32>
@g2 = external constant <4 x i16>

define void @t10() nounwind {
; X64-LABEL: t10:
; X64:       ## BB#0:
; X64-NEXT:    movq _g1@{{.*}}(%rip), %rax
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq _g2@{{.*}}(%rip), %rax
; X64-NEXT:    movq %xmm0, (%rax)
; X64-NEXT:    retq
  load <4 x i32>, <4 x i32>* @g1, align 16
  bitcast <4 x i32> %1 to <8 x i16>
  shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
  bitcast <8 x i16> %3 to <2 x i64>
  extractelement <2 x i64> %4, i32 0
  bitcast i64 %5 to <4 x i16>
  store <4 x i16> %6, <4 x i16>* @g2, align 8
  ret void
}

; Pack various elements via shuffles.
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t11:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    psrld $16, %xmm0
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
entry:
  %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  ret <8 x i16> %tmp7
}

define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t12:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
; X64-NEXT:    retq
entry:
  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef, i32 undef >
  ret <8 x i16> %tmp9
}

define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t13:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
; X64-NEXT:    retq
entry:
  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef, i32 undef >
  ret <8 x i16> %tmp9
}

define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t14:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    psrlq $16, %xmm0
; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef >
  ret <8 x i16> %tmp9
}

; FIXME: t15 is worse off from disabling of scheduler 2-address hack.
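; The mask places %T0 elements 7 and 2 in lanes 2-3 and %T1 element 0
; (index 8) in lane 4; all other lanes are undef.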
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-LABEL: t15:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
entry:
  %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef, i32 undef >
  ret <8 x i16> %tmp8
}

; Test for Yonah, where we convert a shuffle to pextrw and pinsrw.
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X64-LABEL: t16:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
  ret <16 x i8> %tmp9
}

; rdar://8520311
define <4 x i32> @t17() nounwind {
; X64-LABEL: t17:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movaps (%rax), %xmm0
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
entry:
  %tmp1 = load <4 x float>, <4 x float>* undef, align 16
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %tmp3 = load <4 x float>, <4 x float>* undef, align 16
  %tmp4 = shufflevector <4 x float> %tmp2, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %tmp5 = bitcast <4 x float> %tmp3 to <4 x i32>
  %tmp6 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %tmp7 = and <4 x i32> %tmp6, <i32 undef, i32 undef, i32 -1, i32 0>
  ret <4 x i32> %tmp7
}