; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2

; Tests for lowering of 256-bit lane-crossing shuffles: cases that should (and
; should not) select vperm2f128/vperm2i128, plus blend/insert alternatives.

define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v32i8_2323:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovaps (%rsi), %ymm1
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases we must not select vperm2f128

define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
;; TODO: When building for optsize we should use vperm2f128.

define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.

define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1:       ## BB#0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2:       ## BB#0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases

define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}