; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64-AVX512 --check-prefix=X64-AVX512VL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQVL

;
; 128-bit Subvector Broadcast to 256-bit
;

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
  ret <4 x double> %3
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %3
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
  ret <8 x float> %3
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %3
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
  ret <16 x i16> %3
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
  ret <32 x i8> %3
}

;
; 128-bit Subvector Broadcast to 512-bit
;

define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = fadd <8 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>
  ret <8 x double> %3
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = add <8 x i64> %2, <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>
  ret <8 x i64> %3
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %3 = fadd <16 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
  ret <16 x float> %3
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %3 = add <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i32> %3
}

define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512VL:       ## %bb.0:
; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BWVL:       ## %bb.0:
; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT:    vpaddw {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT:    retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQVL:       ## %bb.0:
; X64-AVX512DQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = add <32 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31, i16 32>
  ret <32 x i16> %3
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512VL:       ## %bb.0:
; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BWVL:       ## %bb.0:
; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT:    vpaddb {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT:    retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQVL:       ## %bb.0:
; X64-AVX512DQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512DQVL-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = add <64 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63, i8 64>
  ret <64 x i8> %3
}

define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512-LABEL: PR29088:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %ld = load <4 x i32>, <4 x i32>* %p0
  store <8 x float> zeroinitializer, <8 x float>* %p1
  %shuf = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %shuf
}