; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vaddpd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
 ret <4 x double> %3
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-LABEL: test_broadcast_2i64_4i64:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
 ret <4 x i64> %3
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vaddps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
 ret <8 x float> %3
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-LABEL: test_broadcast_4i32_8i32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ret <8 x i32> %3
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vpaddw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
 ret <16 x i16> %3
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vpaddb {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
 ret <32 x i8> %3
}

define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovapd (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vaddpd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovapd %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovapd %xmm1, (%rsi)
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double>* %p0
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
 store <2 x double> %1, <2 x double>* %p1
 ret <4 x double> %3
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovdqa (%ecx), %xmm1
; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovdqa %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa (%rdi), %xmm1
; X64-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64>* %p0
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
 store <2 x i64> %1, <2 x i64>* %p1
 ret <4 x i64> %3
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vaddps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float>* %p0
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
 store <4 x float> %1, <4 x float>* %p1
 ret <8 x float> %3
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovdqa (%ecx), %xmm1
; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovdqa %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa (%rdi), %xmm1
; X64-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32>* %p0
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 store <4 x i32> %1, <4 x i32>* %p1
 ret <8 x i32> %3
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovdqa (%ecx), %xmm1
; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vpaddw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovdqa %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa (%rdi), %xmm1
; X64-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p0
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
 store <8 x i16> %1, <8 x i16>* %p1
 ret <16 x i16> %3
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovdqa (%ecx), %xmm1
; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vpaddb {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    vmovdqa %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa (%rdi), %xmm1
; X64-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p0
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
 store <16 x i8> %1, <16 x i8>* %p1
 ret <32 x i8> %3
}

define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X32-LABEL: PR29088:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vmovaps %ymm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: PR29088:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %ymm1, (%rsi)
; X64-NEXT:    retq
 %ld = load <4 x i32>, <4 x i32>* %p0
 store <8 x float> zeroinitializer, <8 x float>* %p1
 %shuf = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %shuf
}