//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                        !if(Is2Addr,
                            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                        [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
                        Sched<[sched]>;
    let mayLoad = 1 in
    def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
                        !if(Is2Addr,
                            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                        [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
                        Sched<[sched.Folded, ReadAfterLd]>;
  }
}
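
// A note on the Is2Addr bit used by the multiclasses above and below: when an
// instantiation leaves Is2Addr at 1 (the destructive SSE encoding, with $src1
// tied to $dst), the !strconcat calls produce a two-operand string such as
// "addss\t{$src2, $dst|$dst, $src2}"; the AVX instantiations pass Is2Addr = 0
// and get the three-operand "addss\t{$src2, $src1, $dst|$dst, $src1, $src2}"
// form instead. "addss" is only an illustrative opcode string; the actual
// instantiations of these multiclasses appear later in this file.
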
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
              d>,
              Sched<[sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
              Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
              Sched<[sched.Folded, ReadAfterLd]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
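// When one of the zero/all-ones pseudos in this section is not folded into a
// constant-pool load, it is materialized without reading any source: the
// all-zeros pseudos become an xorps/vxorps of a register with itself, and the
// all-ones pseudos are typically expanded to a pcmpeqd of a register with
// itself (e.g. V_SETALLONES assigned %xmm0 becomes "pcmpeqd %xmm0, %xmm0").
// This description is illustrative; the exact expansions live in X86InstrInfo.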
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                         [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                   Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}
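
// The ".s" aliases defined by sse12_move above select the operand-swapped
// MRMDestReg encoding (the <Name>rr_REV defs); GNU as historically accepts
// this suffix to request the alternate encoding, which is why the aliases are
// kept (compare the "Reversed version with .s suffix for GAS compatibility"
// aliases for the packed moves further down).
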
// Loading from memory automatically zeroes the upper bits.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr, Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (mem_pat addr:$src))], d>,
                   Sched<[WriteFLoad]>;
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;

  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

let Predicates = [UseSSE1] in {
  let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
  }

  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
}

let Predicates = [UseSSE2] in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
}

// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
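// Background for these aliases (illustrative): the two-byte (C5) VEX prefix
// can only encode an inverted VEX.R bit, while VEX.B requires the three-byte
// (C4) form. By swapping to the store-form encoding, a high register
// (xmm8-xmm15) lands in the ModRM.reg field, which VEX.R covers, so the
// assembler can still use the shorter prefix; the VR128L/VR128H operand
// classes restrict the aliases to exactly that situation.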
def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
  let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
  let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating point load/store in case we
  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
  // is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
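
// For reference, movaps/movups omit the 0x66 operand-size prefix that the
// movdqa/movdqu encodings carry - e.g. "movaps (%rax), %xmm0" encodes as
// 0F 28 00 (3 bytes) versus 66 0F 6F 00 (4 bytes) for "movdqa (%rax), %xmm0" -
// which is the one byte referred to above. The execution-domain pass converts
// the selection back to the integer forms when that is profitable.
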
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special-cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                    "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
                         (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, we're only loading 64 bits.
  def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
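
// Note on MOVH: movhpd loads 64 bits into the high half of $dst while leaving
// the low half untouched, which is exactly what unpcklpd of $dst with the
// loaded scalar produces ({ $src1[0], load }); that is why X86Unpckl serves as
// the pattern node here rather than a dedicated movhp node.
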
let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                              (bc_v2f64 (v4f32 VR128:$src))),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

// TODO: This is largely to trick fastisel into ignoring the pattern.
def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
                          (X86Unpckh node:$src1, node:$src2), [{
  return N->getOperand(0) == N->getOperand(1);
}]>;

let Predicates = [UseSSE2] in {
  // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
  // movhlps for sse2 without changing a bunch of tests.
  def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
            (MOVHLPSrr VR128:$src, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, X86FoldableSchedWrite sched> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0 in {
    def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
               [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
               Sched<[sched]>;
    let mayLoad = 1 in
    def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
               [(set RC:$dst, (DstTy (sint_to_fp
                               (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
               Sched<[sched.Folded]>;
  }
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm,
                          X86FoldableSchedWrite sched> {
  let hasSideEffects = 0, Predicates = [UseAVX] in {
    def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
                !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
                Sched<[sched]>;
    let mayLoad = 1 in
    def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                (ins DstRC:$src1, x86memop:$src),
                !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
                Sched<[sched.Folded, ReadAfterLd]>;
  } // hasSideEffects = 0
}

let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                              "cvttss2si\t{$src, $dst|$dst, $src}",
                              WriteCvtSS2I>,
                              XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                              "cvttsd2si\t{$src, $dst|$dst, $src}",
                              WriteCvtSD2I>,
                              XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
                                WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
                                WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;

let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}

defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                             "cvttss2si\t{$src, $dst|$dst, $src}",
                             WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                               "cvttss2si\t{$src, $dst|$dst, $src}",
                               WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                             "cvttsd2si\t{$src, $dst|$dst, $src}",
                             WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                               "cvttsd2si\t{$src, $dst|$dst, $src}",
                               WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                            WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                              WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                            WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                              WriteCvtI2SD>, XD, REX_W;
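
// As the comment above notes, only the memory forms need the {l}/{q} suffix:
// in AT&T syntax "cvtsi2ssl (%rax), %xmm0" converts a 32-bit integer and
// "cvtsi2ssq (%rax), %xmm0" a 64-bit one, whereas with a register source
// ("cvtsi2ss %eax, %xmm0" vs. "cvtsi2ss %rax, %xmm0") the GPR width already
// disambiguates the operation. (Illustrative examples only.)
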
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                          string asm, X86FoldableSchedWrite sched> {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (Int SrcRC:$src))]>,
                  Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (Int mem_cpat:$src))]>,
                  Sched<[sched.Folded]>;
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, X86FoldableSchedWrite sched,
                                bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
    def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                    !if(Is2Addr,
                        !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                        !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                    []>, Sched<[sched]>;
    let mayLoad = 1 in
    def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                    (ins DstRC:$src1, x86memop:$src2),
                    !if(Is2Addr,
                        !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                        !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                    []>, Sched<[sched.Folded, ReadAfterLd]>;
  }
}

let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                   int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                   WriteCvtSD2I>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                   int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                   WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;


let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
  defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
  defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
  defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
  }
} // isCodeGenOnly = 1

/// SSE 1 Only

// Aliases for intrinsics
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                   ssmem, sse_load_f32, "cvttss2si",
                                   WriteCvtSS2I>, XS, VEX;
  defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                     int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                     "cvttss2si", WriteCvtSS2I>,
                                     XS, VEX, VEX_W;
  defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                   sdmem, sse_load_f64, "cvttsd2si",
                                   WriteCvtSS2I>, XD, VEX;
  defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                     int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                     "cvttsd2si", WriteCvtSS2I>,
                                     XD, VEX, VEX_W;
  }
  defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                  ssmem, sse_load_f32, "cvttss2si",
                                  WriteCvtSS2I>, XS;
  defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                    int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                    "cvttss2si", WriteCvtSS2I>, XS, REX_W;
  defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                  sdmem, sse_load_f64, "cvttsd2si",
                                  WriteCvtSD2I>, XD;
  defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                    int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                    "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_LIG;
  defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                    ssmem, sse_load_f32, "cvtss2si",
                                    WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I>, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;

let Predicates = [UseAVX] in {
  def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
  def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
  def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
  def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
  def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
  def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
  def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
  def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
}

def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}

def : Pat<(f32 (fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded]>;

let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss
                          VR128:$src1, sse_load_f64:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                         VR128:$src1, sse_load_f64:$src2))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>,
                    Requires<[UseAVX, OptForSize]>;
}

def : Pat<(f64 (fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
          Requires<[UseAVX, OptForSpeed]>;

def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>;
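
// The extloadf32 selections above (AVX) and below (SSE) differ by optimization
// mode: with OptForSize the f32 load is folded into cvtss2sd, while with
// OptForSpeed the value is first loaded with movss (which writes the whole XMM
// register) and then converted register-to-register. cvtss2sd only writes the
// low bits of its destination, so the unfolded sequence is presumably
// preferred to keep the resulting partial-update false dependency cheap to
// break; compare the FIXME about false dependencies near sse12_cvt_sint above.
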
1282 def : Pat<(fpextend (loadf32 addr:$src)), 1283 (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>; 1284 def : Pat<(extloadf32 addr:$src), 1285 (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; 1286 1287 let isCodeGenOnly = 1, hasSideEffects = 0 in { 1288 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, 1289 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1290 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1291 []>, XS, VEX_4V, VEX_WIG, 1292 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>; 1293 let mayLoad = 1 in 1294 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, 1295 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1296 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1297 []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>, 1298 Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>; 1299 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix 1300 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, 1301 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1302 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1303 []>, XS, Requires<[UseSSE2]>, 1304 Sched<[WriteCvtSS2SD]>; 1305 let mayLoad = 1 in 1306 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, 1307 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1308 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1309 []>, XS, Requires<[UseSSE2]>, 1310 Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>; 1311 } 1312 } // isCodeGenOnly = 1 1313 1314 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and 1315 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary 1316 // vmovs{s,d} instructions 1317 let Predicates = [UseAVX] in { 1318 def : Pat<(v4f32 (X86Movss 1319 (v4f32 VR128:$dst), 1320 (v4f32 (scalar_to_vector 1321 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1322 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1323 1324 def : Pat<(v2f64 (X86Movsd 1325 (v2f64 VR128:$dst), 1326 (v2f64 (scalar_to_vector 1327 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1328 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1329 1330 def : Pat<(v4f32 (X86Movss 1331 (v4f32 VR128:$dst), 1332 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), 1333 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1334 1335 def : Pat<(v4f32 (X86Movss 1336 (v4f32 VR128:$dst), 1337 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), 1338 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1339 1340 def : Pat<(v4f32 (X86Movss 1341 (v4f32 VR128:$dst), 1342 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), 1343 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1344 1345 def : Pat<(v4f32 (X86Movss 1346 (v4f32 VR128:$dst), 1347 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), 1348 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1349 1350 def : Pat<(v2f64 (X86Movsd 1351 (v2f64 VR128:$dst), 1352 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), 1353 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1354 1355 def : Pat<(v2f64 (X86Movsd 1356 (v2f64 VR128:$dst), 1357 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), 1358 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1359 1360 def : Pat<(v2f64 (X86Movsd 1361 (v2f64 VR128:$dst), 1362 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), 1363 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1364 1365 def : Pat<(v2f64 (X86Movsd 1366 (v2f64 VR128:$dst), 1367 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), 1368 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1369 } // Predicates = [UseAVX] 1370 1371 let Predicates = [UseSSE2] in { 1372 def : Pat<(v4f32 (X86Movss 
1373 (v4f32 VR128:$dst), 1374 (v4f32 (scalar_to_vector 1375 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1376 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1377 1378 def : Pat<(v2f64 (X86Movsd 1379 (v2f64 VR128:$dst), 1380 (v2f64 (scalar_to_vector 1381 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1382 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1383 1384 def : Pat<(v2f64 (X86Movsd 1385 (v2f64 VR128:$dst), 1386 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), 1387 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1388 1389 def : Pat<(v2f64 (X86Movsd 1390 (v2f64 VR128:$dst), 1391 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), 1392 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1393 1394 def : Pat<(v2f64 (X86Movsd 1395 (v2f64 VR128:$dst), 1396 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), 1397 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1398 1399 def : Pat<(v2f64 (X86Movsd 1400 (v2f64 VR128:$dst), 1401 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), 1402 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1403 } // Predicates = [UseSSE2] 1404 1405 let Predicates = [UseSSE1] in { 1406 def : Pat<(v4f32 (X86Movss 1407 (v4f32 VR128:$dst), 1408 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), 1409 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1410 1411 def : Pat<(v4f32 (X86Movss 1412 (v4f32 VR128:$dst), 1413 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), 1414 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1415 1416 def : Pat<(v4f32 (X86Movss 1417 (v4f32 VR128:$dst), 1418 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), 1419 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1420 1421 def : Pat<(v4f32 (X86Movss 1422 (v4f32 VR128:$dst), 1423 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), 1424 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1425 } // Predicates = [UseSSE1] 1426 1427 let Predicates = [HasAVX, NoVLX] in { 1428 // Convert packed single/double fp to doubleword 1429 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1430 "cvtps2dq\t{$src, $dst|$dst, $src}", 1431 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1432 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 1433 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1434 "cvtps2dq\t{$src, $dst|$dst, $src}", 1435 [(set VR128:$dst, 1436 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>, 1437 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1438 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1439 "cvtps2dq\t{$src, $dst|$dst, $src}", 1440 [(set VR256:$dst, 1441 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>, 1442 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1443 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1444 "cvtps2dq\t{$src, $dst|$dst, $src}", 1445 [(set VR256:$dst, 1446 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>, 1447 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1448 } 1449 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1450 "cvtps2dq\t{$src, $dst|$dst, $src}", 1451 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1452 Sched<[WriteCvtPS2I]>; 1453 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1454 "cvtps2dq\t{$src, $dst|$dst, $src}", 1455 [(set VR128:$dst, 1456 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, 1457 Sched<[WriteCvtPS2ILd]>; 1458 1459 1460 // Convert Packed Double FP to Packed DW Integers 1461 let Predicates = [HasAVX, NoVLX] in { 1462 
// The assembler can recognize rr 256-bit instructions by seeing a ymm 1463 // register, but the same isn't true when using memory operands instead. 1464 // Provide other assembly rr and rm forms to address this explicitly. 1465 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1466 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1467 [(set VR128:$dst, 1468 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1469 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1470 1471 // XMM only 1472 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1473 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; 1474 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1475 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", 1476 [(set VR128:$dst, 1477 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, 1478 Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1479 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1480 (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; 1481 1482 // YMM only 1483 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1484 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1485 [(set VR128:$dst, 1486 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, 1487 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1488 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1489 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 1490 [(set VR128:$dst, 1491 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, 1492 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1493 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1494 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; 1495 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1496 (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; 1497 } 1498 1499 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1500 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1501 [(set VR128:$dst, 1502 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, 1503 Sched<[WriteCvtPD2ILd]>; 1504 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1505 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1506 [(set VR128:$dst, 1507 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1508 Sched<[WriteCvtPD2I]>; 1509 1510 // Convert with truncation packed single/double fp to doubleword 1511 // SSE2 packed instructions with XS prefix 1512 let Predicates = [HasAVX, NoVLX] in { 1513 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1514 "cvttps2dq\t{$src, $dst|$dst, $src}", 1515 [(set VR128:$dst, 1516 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, 1517 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 1518 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1519 "cvttps2dq\t{$src, $dst|$dst, $src}", 1520 [(set VR128:$dst, 1521 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>, 1522 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1523 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1524 "cvttps2dq\t{$src, $dst|$dst, $src}", 1525 [(set VR256:$dst, 1526 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>, 1527 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1528 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1529 "cvttps2dq\t{$src, $dst|$dst, $src}", 1530 [(set VR256:$dst, 1531 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>, 1532 VEX, VEX_L, 1533 Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1534 } 1535 1536 let Predicates = [HasAVX, NoVLX] in { 1537 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), 1538 (VCVTTPS2DQrr VR128:$src)>; 1539 def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), 
1540 (VCVTTPS2DQrm addr:$src)>; 1541 def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), 1542 (VCVTTPS2DQYrr VR256:$src)>; 1543 def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), 1544 (VCVTTPS2DQYrm addr:$src)>; 1545 } 1546 1547 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1548 "cvttps2dq\t{$src, $dst|$dst, $src}", 1549 [(set VR128:$dst, 1550 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, 1551 Sched<[WriteCvtPS2I]>; 1552 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1553 "cvttps2dq\t{$src, $dst|$dst, $src}", 1554 [(set VR128:$dst, 1555 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>, 1556 Sched<[WriteCvtPS2ILd]>; 1557 1558 let Predicates = [UseSSE2] in { 1559 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), 1560 (CVTTPS2DQrr VR128:$src)>; 1561 def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), 1562 (CVTTPS2DQrm addr:$src)>; 1563 } 1564 1565 let Predicates = [HasAVX, NoVLX] in 1566 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1567 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1568 [(set VR128:$dst, 1569 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, 1570 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1571 1572 // The assembler can recognize rr 256-bit instructions by seeing a ymm 1573 // register, but the same isn't true when using memory operands instead. 1574 // Provide other assembly rr and rm forms to address this explicitly. 1575 1576 // XMM only 1577 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1578 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; 1579 1580 let Predicates = [HasAVX, NoVLX] in 1581 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1582 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", 1583 [(set VR128:$dst, 1584 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>, 1585 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1586 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1587 (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; 1588 1589 // YMM only 1590 let Predicates = [HasAVX, NoVLX] in { 1591 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1592 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1593 [(set VR128:$dst, 1594 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>, 1595 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1596 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1597 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 1598 [(set VR128:$dst, 1599 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>, 1600 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1601 } 1602 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1603 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; 1604 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1605 (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; 1606 1607 let Predicates = [HasAVX, NoVLX] in { 1608 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), 1609 (VCVTTPD2DQYrr VR256:$src)>; 1610 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), 1611 (VCVTTPD2DQYrm addr:$src)>; 1612 } 1613 1614 let Predicates = [HasAVX, NoVLX] in { 1615 def : Pat<(X86vzmovl (v2i64 (bitconvert 1616 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), 1617 (VCVTPD2DQrr VR128:$src)>; 1618 def : Pat<(X86vzmovl (v2i64 (bitconvert 1619 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), 1620 (VCVTPD2DQrm addr:$src)>; 1621 def : Pat<(X86vzmovl (v2i64 (bitconvert 1622 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), 1623 (VCVTTPD2DQrr VR128:$src)>; 1624 def : Pat<(X86vzmovl (v2i64 (bitconvert 1625 (v4i32 (X86cvttp2si (loadv2f64 
addr:$src)))))), 1626 (VCVTTPD2DQrm addr:$src)>; 1627 } // Predicates = [HasAVX, NoVLX] 1628 1629 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1630 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1631 [(set VR128:$dst, 1632 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, 1633 Sched<[WriteCvtPD2I]>; 1634 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 1635 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1636 [(set VR128:$dst, 1637 (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>, 1638 Sched<[WriteCvtPD2ILd]>; 1639 1640 let Predicates = [UseSSE2] in { 1641 def : Pat<(X86vzmovl (v2i64 (bitconvert 1642 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), 1643 (CVTPD2DQrr VR128:$src)>; 1644 def : Pat<(X86vzmovl (v2i64 (bitconvert 1645 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))), 1646 (CVTPD2DQrm addr:$src)>; 1647 def : Pat<(X86vzmovl (v2i64 (bitconvert 1648 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), 1649 (CVTTPD2DQrr VR128:$src)>; 1650 def : Pat<(X86vzmovl (v2i64 (bitconvert 1651 (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))), 1652 (CVTTPD2DQrm addr:$src)>; 1653 } // Predicates = [UseSSE2] 1654 1655 // Convert packed single to packed double 1656 let Predicates = [HasAVX, NoVLX] in { 1657 // SSE2 instructions without OpSize prefix 1658 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1659 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1660 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, 1661 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG; 1662 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1663 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1664 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1665 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG; 1666 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1667 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1668 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>, 1669 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG; 1670 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 1671 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1672 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, 1673 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG; 1674 } 1675 1676 let Predicates = [UseSSE2] in { 1677 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1678 "cvtps2pd\t{$src, $dst|$dst, $src}", 1679 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, 1680 PS, Sched<[WriteCvtPS2PD]>; 1681 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1682 "cvtps2pd\t{$src, $dst|$dst, $src}", 1683 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1684 PS, Sched<[WriteCvtPS2PD.Folded]>; 1685 } 1686 1687 // Convert Packed DW Integers to Packed Double FP 1688 let Predicates = [HasAVX, NoVLX] in { 1689 let hasSideEffects = 0, mayLoad = 1 in 1690 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1691 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1692 [(set VR128:$dst, 1693 (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, 1694 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; 1695 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1696 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1697 [(set VR128:$dst, 1698 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, 1699 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG; 1700 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 1701 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1702 [(set VR256:$dst, 1703 (v4f64 
(sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, 1704 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, 1705 VEX_WIG; 1706 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1707 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1708 [(set VR256:$dst, 1709 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, 1710 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; 1711 } 1712 1713 let hasSideEffects = 0, mayLoad = 1 in 1714 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1715 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1716 [(set VR128:$dst, 1717 (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, 1718 Sched<[WriteCvtI2PDLd]>; 1719 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1720 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1721 [(set VR128:$dst, 1722 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, 1723 Sched<[WriteCvtI2PD]>; 1724 1725 // AVX register conversion intrinsics 1726 let Predicates = [HasAVX, NoVLX] in { 1727 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 1728 (VCVTDQ2PDrm addr:$src)>; 1729 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), 1730 (VCVTDQ2PDrm addr:$src)>; 1731 } // Predicates = [HasAVX, NoVLX] 1732 1733 // SSE2 register conversion intrinsics 1734 let Predicates = [UseSSE2] in { 1735 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 1736 (CVTDQ2PDrm addr:$src)>; 1737 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), 1738 (CVTDQ2PDrm addr:$src)>; 1739 } // Predicates = [UseSSE2] 1740 1741 // Convert packed double to packed single 1742 // The assembler can recognize rr 256-bit instructions by seeing a ymm 1743 // register, but the same isn't true when using memory operands instead. 1744 // Provide other assembly rr and rm forms to address this explicitly. 
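// For example (illustrative): "vcvtpd2ps (%rax), %xmm0" does not say whether
// the memory source is 128 or 256 bits wide, because the destination is an
// xmm register in both cases. The explicit "vcvtpd2psx"/"vcvtpd2psy"
// spellings and the aliases below resolve that ambiguity.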
1745 let Predicates = [HasAVX, NoVLX] in 1746 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1747 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1748 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, 1749 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; 1750 1751 // XMM only 1752 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 1753 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; 1754 let Predicates = [HasAVX, NoVLX] in 1755 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1756 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", 1757 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>, 1758 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; 1759 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 1760 (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">; 1761 1762 // YMM only 1763 let Predicates = [HasAVX, NoVLX] in { 1764 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1765 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1766 [(set VR128:$dst, (fpround VR256:$src))]>, 1767 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; 1768 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1769 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", 1770 [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>, 1771 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; 1772 } 1773 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1774 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; 1775 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1776 (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">; 1777 1778 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1779 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1780 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, 1781 Sched<[WriteCvtPD2PS]>; 1782 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1783 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1784 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>, 1785 Sched<[WriteCvtPD2PS.Folded]>; 1786 1787 // AVX 256-bit register conversion intrinsics 1788 // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below 1789 // whenever possible to avoid declaring two versions of each one. 
1790 1791 let Predicates = [HasAVX, NoVLX] in { 1792 // Match fpround and fpextend for 128/256-bit conversions 1793 def : Pat<(X86vzmovl (v2f64 (bitconvert 1794 (v4f32 (X86vfpround (v2f64 VR128:$src)))))), 1795 (VCVTPD2PSrr VR128:$src)>; 1796 def : Pat<(X86vzmovl (v2f64 (bitconvert 1797 (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), 1798 (VCVTPD2PSrm addr:$src)>; 1799 } 1800 1801 let Predicates = [UseSSE2] in { 1802 // Match fpround and fpextend for 128 conversions 1803 def : Pat<(X86vzmovl (v2f64 (bitconvert 1804 (v4f32 (X86vfpround (v2f64 VR128:$src)))))), 1805 (CVTPD2PSrr VR128:$src)>; 1806 def : Pat<(X86vzmovl (v2f64 (bitconvert 1807 (v4f32 (X86vfpround (memopv2f64 addr:$src)))))), 1808 (CVTPD2PSrm addr:$src)>; 1809 } 1810 1811 //===----------------------------------------------------------------------===// 1812 // SSE 1 & 2 - Compare Instructions 1813 //===----------------------------------------------------------------------===// 1814 1815 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions 1816 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 1817 Operand CC, SDNode OpNode, ValueType VT, 1818 PatFrag ld_frag, string asm, string asm_alt, 1819 X86FoldableSchedWrite sched> { 1820 let isCommutable = 1 in 1821 def rr : SIi8<0xC2, MRMSrcReg, 1822 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, 1823 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>, 1824 Sched<[sched]>; 1825 def rm : SIi8<0xC2, MRMSrcMem, 1826 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, 1827 [(set RC:$dst, (OpNode (VT RC:$src1), 1828 (ld_frag addr:$src2), imm:$cc))]>, 1829 Sched<[sched.Folded, ReadAfterLd]>; 1830 1831 // Accept explicit immediate argument form instead of comparison code. 1832 let isAsmParserOnly = 1, hasSideEffects = 0 in { 1833 def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), 1834 (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>, 1835 Sched<[sched]>, NotMemoryFoldable; 1836 let mayLoad = 1 in 1837 def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), 1838 (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>, 1839 Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; 1840 } 1841 } 1842 1843 let ExeDomain = SSEPackedSingle in 1844 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, 1845 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1846 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1847 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; 1848 let ExeDomain = SSEPackedDouble in 1849 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, 1850 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1851 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1852 SchedWriteFCmpSizes.PD.Scl>, 1853 XD, VEX_4V, VEX_LIG, VEX_WIG; 1854 1855 let Constraints = "$src1 = $dst" in { 1856 let ExeDomain = SSEPackedSingle in 1857 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, 1858 "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", 1859 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1860 SchedWriteFCmpSizes.PS.Scl>, XS; 1861 let ExeDomain = SSEPackedDouble in 1862 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, 1863 "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", 1864 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1865 SchedWriteFCmpSizes.PD.Scl>, XD; 1866 } 1867 1868 multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, 1869 Intrinsic Int, string asm, X86FoldableSchedWrite sched, 1870 ComplexPattern mem_cpat> { 1871 def rr_Int : SIi8<0xC2, 
MRMSrcReg, (outs VR128:$dst), 1872 (ins VR128:$src1, VR128:$src, CC:$cc), asm, 1873 [(set VR128:$dst, (Int VR128:$src1, 1874 VR128:$src, imm:$cc))]>, 1875 Sched<[sched]>; 1876 let mayLoad = 1 in 1877 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), 1878 (ins VR128:$src1, memop:$src, CC:$cc), asm, 1879 [(set VR128:$dst, (Int VR128:$src1, 1880 mem_cpat:$src, imm:$cc))]>, 1881 Sched<[sched.Folded, ReadAfterLd]>; 1882 } 1883 1884 let isCodeGenOnly = 1 in { 1885 // Aliases to match intrinsics which expect XMM operand(s). 1886 let ExeDomain = SSEPackedSingle in 1887 defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, 1888 "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", 1889 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V; 1890 let ExeDomain = SSEPackedDouble in 1891 defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, 1892 "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", 1893 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, 1894 XD, VEX_4V; 1895 let Constraints = "$src1 = $dst" in { 1896 let ExeDomain = SSEPackedSingle in 1897 defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, 1898 "cmp${cc}ss\t{$src, $dst|$dst, $src}", 1899 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; 1900 let ExeDomain = SSEPackedDouble in 1901 defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, 1902 "cmp${cc}sd\t{$src, $dst|$dst, $src}", 1903 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; 1904 } 1905 } 1906 1907 1908 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS 1909 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, 1910 ValueType vt, X86MemOperand x86memop, 1911 PatFrag ld_frag, string OpcodeStr, 1912 X86FoldableSchedWrite sched> { 1913 let hasSideEffects = 0 in { 1914 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1915 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1916 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1917 Sched<[sched]>; 1918 let mayLoad = 1 in 1919 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 1920 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1921 [(set EFLAGS, (OpNode (vt RC:$src1), 1922 (ld_frag addr:$src2)))]>, 1923 Sched<[sched.Folded, ReadAfterLd]>; 1924 } 1925 } 1926 1927 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp 1928 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, 1929 ValueType vt, Operand memop, 1930 ComplexPattern mem_cpat, string OpcodeStr, 1931 X86FoldableSchedWrite sched> { 1932 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1933 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1934 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1935 Sched<[sched]>; 1936 let mayLoad = 1 in 1937 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), 1938 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1939 [(set EFLAGS, (OpNode (vt RC:$src1), 1940 mem_cpat:$src2))]>, 1941 Sched<[sched.Folded, ReadAfterLd]>; 1942 } 1943 1944 let Defs = [EFLAGS] in { 1945 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 1946 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1947 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 1948 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1949 let Pattern = []<dag> in { 1950 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, 1951 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1952 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, 
loadf64, 1953 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1954 } 1955 1956 let isCodeGenOnly = 1 in { 1957 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1958 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG; 1959 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1960 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG; 1961 1962 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1963 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG; 1964 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1965 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG; 1966 } 1967 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 1968 "ucomiss", WriteFCom>, PS; 1969 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 1970 "ucomisd", WriteFCom>, PD; 1971 1972 let Pattern = []<dag> in { 1973 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, 1974 "comiss", WriteFCom>, PS; 1975 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, 1976 "comisd", WriteFCom>, PD; 1977 } 1978 1979 let isCodeGenOnly = 1 in { 1980 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1981 sse_load_f32, "ucomiss", WriteFCom>, PS; 1982 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1983 sse_load_f64, "ucomisd", WriteFCom>, PD; 1984 1985 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1986 sse_load_f32, "comiss", WriteFCom>, PS; 1987 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1988 sse_load_f64, "comisd", WriteFCom>, PD; 1989 } 1990 } // Defs = [EFLAGS] 1991 1992 // sse12_cmp_packed - sse 1 & 2 compare packed instructions 1993 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, 1994 Operand CC, ValueType VT, string asm, 1995 string asm_alt, X86FoldableSchedWrite sched, 1996 Domain d, PatFrag ld_frag> { 1997 let isCommutable = 1 in 1998 def rri : PIi8<0xC2, MRMSrcReg, 1999 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, 2000 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>, 2001 Sched<[sched]>; 2002 def rmi : PIi8<0xC2, MRMSrcMem, 2003 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, 2004 [(set RC:$dst, 2005 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, 2006 Sched<[sched.Folded, ReadAfterLd]>; 2007 2008 // Accept explicit immediate argument form instead of comparison code. 
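// For example (illustrative): "cmpps $2, %xmm1, %xmm0" assembles to the same
// encoding as "cmpleps %xmm1, %xmm0"; the *_alt defs below exist only so that
// the assembler also accepts the raw-immediate spelling.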
2009 let isAsmParserOnly = 1, hasSideEffects = 0 in { 2010 def rri_alt : PIi8<0xC2, MRMSrcReg, 2011 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), 2012 asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable; 2013 let mayLoad = 1 in 2014 def rmi_alt : PIi8<0xC2, MRMSrcMem, 2015 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), 2016 asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>, 2017 NotMemoryFoldable; 2018 } 2019 } 2020 2021 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32, 2022 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2023 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2024 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; 2025 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64, 2026 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2027 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2028 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; 2029 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32, 2030 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2031 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2032 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG; 2033 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64, 2034 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2035 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 2036 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG; 2037 let Constraints = "$src1 = $dst" in { 2038 defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32, 2039 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", 2040 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2041 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; 2042 defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64, 2043 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", 2044 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 2045 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; 2046 } 2047 2048 def CommutableCMPCC : PatLeaf<(imm), [{ 2049 uint64_t Imm = N->getZExtValue() & 0x7; 2050 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); 2051 }]>; 2052 2053 // Patterns to select compares with loads in first operand. 
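// Swapping the operands is only safe for predicates that are symmetric in
// their operands: 0x00 (EQ), 0x03 (UNORD), 0x04 (NEQ) and 0x07 (ORD), which
// is exactly what the CommutableCMPCC leaf above checks. With the operands
// swapped, the loaded value becomes the second source and can be folded into
// the memory form of the compare.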
2054 let Predicates = [HasAVX] in { 2055 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, 2056 CommutableCMPCC:$cc)), 2057 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; 2058 2059 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, 2060 CommutableCMPCC:$cc)), 2061 (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; 2062 2063 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, 2064 CommutableCMPCC:$cc)), 2065 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 2066 2067 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, 2068 CommutableCMPCC:$cc)), 2069 (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; 2070 2071 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 2072 CommutableCMPCC:$cc)), 2073 (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; 2074 2075 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2076 CommutableCMPCC:$cc)), 2077 (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; 2078 } 2079 2080 let Predicates = [UseSSE2] in { 2081 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, 2082 CommutableCMPCC:$cc)), 2083 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 2084 2085 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 2086 CommutableCMPCC:$cc)), 2087 (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; 2088 } 2089 2090 let Predicates = [UseSSE1] in { 2091 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, 2092 CommutableCMPCC:$cc)), 2093 (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; 2094 2095 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2096 CommutableCMPCC:$cc)), 2097 (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; 2098 } 2099 2100 //===----------------------------------------------------------------------===// 2101 // SSE 1 & 2 - Shuffle Instructions 2102 //===----------------------------------------------------------------------===// 2103 2104 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions 2105 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, 2106 ValueType vt, string asm, PatFrag mem_frag, 2107 X86FoldableSchedWrite sched, Domain d> { 2108 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), 2109 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, 2110 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), 2111 (i8 imm:$src3))))], d>, 2112 Sched<[sched.Folded, ReadAfterLd]>; 2113 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), 2114 (ins RC:$src1, RC:$src2, u8imm:$src3), asm, 2115 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, 2116 (i8 imm:$src3))))], d>, 2117 Sched<[sched]>; 2118 } 2119 2120 let Predicates = [HasAVX, NoVLX] in { 2121 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2122 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2123 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, 2124 PS, VEX_4V, VEX_WIG; 2125 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, 2126 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2127 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, 2128 PS, VEX_4V, VEX_L, VEX_WIG; 2129 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2130 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2131 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, 2132 PD, VEX_4V, VEX_WIG; 2133 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, 2134 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2135 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, 2136 PD, VEX_4V, VEX_L, VEX_WIG; 2137 } 2138 let Constraints = "$src1 = $dst" in { 2139 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2140 
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2141 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2142 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2143 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2144 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2145 } 2146 2147 //===----------------------------------------------------------------------===// 2148 // SSE 1 & 2 - Unpack FP Instructions 2149 //===----------------------------------------------------------------------===// 2150 2151 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2152 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2153 PatFrag mem_frag, RegisterClass RC, 2154 X86MemOperand x86memop, string asm, 2155 X86FoldableSchedWrite sched, Domain d, 2156 bit IsCommutable = 0> { 2157 let isCommutable = IsCommutable in 2158 def rr : PI<opc, MRMSrcReg, 2159 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2160 asm, [(set RC:$dst, 2161 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2162 Sched<[sched]>; 2163 def rm : PI<opc, MRMSrcMem, 2164 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2165 asm, [(set RC:$dst, 2166 (vt (OpNode RC:$src1, 2167 (mem_frag addr:$src2))))], d>, 2168 Sched<[sched.Folded, ReadAfterLd]>; 2169 } 2170 2171 let Predicates = [HasAVX, NoVLX] in { 2172 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, 2173 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2174 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2175 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, 2176 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2177 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; 2178 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, 2179 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2180 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2181 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, 2182 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2183 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; 2184 2185 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, 2186 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2187 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2188 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, 2189 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2190 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2191 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, 2192 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2193 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2194 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, 2195 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2196 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2197 }// Predicates = [HasAVX, NoVLX] 2198 2199 let Constraints = "$src1 = $dst" in { 2200 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, 2201 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2202 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2203 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, 2204 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2205 
SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2206 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, 2207 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2208 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2209 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, 2210 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2211 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2212 } // Constraints = "$src1 = $dst" 2213 2214 let Predicates = [HasAVX1Only] in { 2215 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), 2216 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2217 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2218 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2219 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), 2220 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2221 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2222 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2223 2224 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2225 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2226 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2227 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2228 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2229 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2230 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2231 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2232 } 2233 2234 //===----------------------------------------------------------------------===// 2235 // SSE 1 & 2 - Extract Floating-Point Sign mask 2236 //===----------------------------------------------------------------------===// 2237 2238 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave 2239 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, 2240 string asm, Domain d> { 2241 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), 2242 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 2243 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>, 2244 Sched<[WriteFMOVMSK]>; 2245 } 2246 2247 let Predicates = [HasAVX] in { 2248 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2249 SSEPackedSingle>, PS, VEX, VEX_WIG; 2250 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2251 SSEPackedDouble>, PD, VEX, VEX_WIG; 2252 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", 2253 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG; 2254 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", 2255 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG; 2256 } 2257 2258 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2259 SSEPackedSingle>, PS; 2260 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2261 SSEPackedDouble>, PD; 2262 2263 //===---------------------------------------------------------------------===// 2264 // SSE2 - Packed Integer Logical Instructions 2265 //===---------------------------------------------------------------------===// 2266 2267 let ExeDomain = SSEPackedInt in { // SSE integer instructions 2268 2269 /// PDI_binop_rm - Simple SSE2 binary operator. 
2270 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2271 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2272 X86MemOperand x86memop, X86FoldableSchedWrite sched, 2273 bit IsCommutable, bit Is2Addr> { 2274 let isCommutable = IsCommutable in 2275 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2276 (ins RC:$src1, RC:$src2), 2277 !if(Is2Addr, 2278 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2279 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2280 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 2281 Sched<[sched]>; 2282 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2283 (ins RC:$src1, x86memop:$src2), 2284 !if(Is2Addr, 2285 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2286 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2287 [(set RC:$dst, (OpVT (OpNode RC:$src1, 2288 (bitconvert (memop_frag addr:$src2)))))]>, 2289 Sched<[sched.Folded, ReadAfterLd]>; 2290 } 2291 } // ExeDomain = SSEPackedInt 2292 2293 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, 2294 ValueType OpVT128, ValueType OpVT256, 2295 X86SchedWriteWidths sched, bit IsCommutable, 2296 Predicate prd> { 2297 let Predicates = [HasAVX, prd] in 2298 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2299 VR128, loadv2i64, i128mem, sched.XMM, 2300 IsCommutable, 0>, VEX_4V, VEX_WIG; 2301 2302 let Constraints = "$src1 = $dst" in 2303 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2304 memopv2i64, i128mem, sched.XMM, IsCommutable, 1>; 2305 2306 let Predicates = [HasAVX2, prd] in 2307 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2308 OpVT256, VR256, loadv4i64, i256mem, sched.YMM, 2309 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; 2310 } 2311 2312 // These are ordered here for pattern ordering requirements with the fp versions 2313 2314 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2315 SchedWriteVecLogic, 1, NoVLX>; 2316 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2317 SchedWriteVecLogic, 1, NoVLX>; 2318 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2319 SchedWriteVecLogic, 1, NoVLX>; 2320 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2321 SchedWriteVecLogic, 0, NoVLX>; 2322 2323 //===----------------------------------------------------------------------===// 2324 // SSE 1 & 2 - Logical Instructions 2325 //===----------------------------------------------------------------------===// 2326 2327 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2328 /// 2329 /// There are no patterns here because isel prefers integer versions for SSE2 2330 /// and later. There are SSE1 v4f32 patterns later. 
2331 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, 2332 SDNode OpNode, X86SchedWriteWidths sched> { 2333 let Predicates = [HasAVX, NoVLX] in { 2334 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, 2335 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, 2336 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2337 2338 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, 2339 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, 2340 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2341 2342 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2343 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2344 [], [], 0>, PS, VEX_4V, VEX_WIG; 2345 2346 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2347 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2348 [], [], 0>, PD, VEX_4V, VEX_WIG; 2349 } 2350 2351 let Constraints = "$src1 = $dst" in { 2352 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2353 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2354 [], []>, PS; 2355 2356 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2357 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2358 [], []>, PD; 2359 } 2360 } 2361 2362 defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>; 2363 defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>; 2364 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>; 2365 let isCommutable = 0 in 2366 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; 2367 2368 // If only AVX1 is supported, we need to handle integer operations with 2369 // floating point instructions since the integer versions aren't available. 2370 let Predicates = [HasAVX1Only] in { 2371 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), 2372 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2373 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), 2374 (VORPSYrr VR256:$src1, VR256:$src2)>; 2375 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), 2376 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2377 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), 2378 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2379 2380 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), 2381 (VANDPSYrm VR256:$src1, addr:$src2)>; 2382 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), 2383 (VORPSYrm VR256:$src1, addr:$src2)>; 2384 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), 2385 (VXORPSYrm VR256:$src1, addr:$src2)>; 2386 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), 2387 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2388 } 2389 2390 let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { 2391 // Use packed logical operations for scalar ops. 
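// There are no scalar forms of ANDPS/ANDPD and friends, so the f32/f64 value
// is copied into an XMM register, the 128-bit packed operation is applied,
// and the result is copied back to the scalar register class; the upper
// elements of the vector result are irrelevant to the scalar computation.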
2392 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), 2393 (COPY_TO_REGCLASS 2394 (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2395 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2396 FR64)>; 2397 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), 2398 (COPY_TO_REGCLASS 2399 (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2400 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2401 FR64)>; 2402 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), 2403 (COPY_TO_REGCLASS 2404 (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2405 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2406 FR64)>; 2407 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), 2408 (COPY_TO_REGCLASS 2409 (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2410 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2411 FR64)>; 2412 2413 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), 2414 (COPY_TO_REGCLASS 2415 (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2416 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2417 FR32)>; 2418 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), 2419 (COPY_TO_REGCLASS 2420 (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2421 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2422 FR32)>; 2423 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), 2424 (COPY_TO_REGCLASS 2425 (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2426 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2427 FR32)>; 2428 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), 2429 (COPY_TO_REGCLASS 2430 (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2431 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2432 FR32)>; 2433 } 2434 2435 let Predicates = [UseSSE1] in { 2436 // Use packed logical operations for scalar ops. 2437 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), 2438 (COPY_TO_REGCLASS 2439 (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2440 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2441 FR32)>; 2442 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), 2443 (COPY_TO_REGCLASS 2444 (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2445 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2446 FR32)>; 2447 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), 2448 (COPY_TO_REGCLASS 2449 (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2450 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2451 FR32)>; 2452 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), 2453 (COPY_TO_REGCLASS 2454 (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), 2455 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), 2456 FR32)>; 2457 } 2458 2459 let Predicates = [UseSSE2] in { 2460 // Use packed logical operations for scalar ops. 
2461 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), 2462 (COPY_TO_REGCLASS 2463 (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2464 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2465 FR64)>; 2466 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), 2467 (COPY_TO_REGCLASS 2468 (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2469 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2470 FR64)>; 2471 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), 2472 (COPY_TO_REGCLASS 2473 (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2474 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2475 FR64)>; 2476 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), 2477 (COPY_TO_REGCLASS 2478 (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), 2479 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), 2480 FR64)>; 2481 } 2482 2483 // Patterns for packed operations when we don't have integer type available. 2484 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), 2485 (ANDPSrr VR128:$src1, VR128:$src2)>; 2486 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)), 2487 (ORPSrr VR128:$src1, VR128:$src2)>; 2488 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)), 2489 (XORPSrr VR128:$src1, VR128:$src2)>; 2490 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)), 2491 (ANDNPSrr VR128:$src1, VR128:$src2)>; 2492 2493 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), 2494 (ANDPSrm VR128:$src1, addr:$src2)>; 2495 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)), 2496 (ORPSrm VR128:$src1, addr:$src2)>; 2497 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)), 2498 (XORPSrm VR128:$src1, addr:$src2)>; 2499 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), 2500 (ANDNPSrm VR128:$src1, addr:$src2)>; 2501 2502 //===----------------------------------------------------------------------===// 2503 // SSE 1 & 2 - Arithmetic Instructions 2504 //===----------------------------------------------------------------------===// 2505 2506 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and 2507 /// vector forms. 2508 /// 2509 /// In addition, we also have a special variant of the scalar form here to 2510 /// represent the associated intrinsic operation. This form is unlike the 2511 /// plain scalar form, in that it takes an entire vector (instead of a scalar) 2512 /// and leaves the top elements unmodified (therefore these cannot be commuted). 2513 /// 2514 /// These three forms can each be reg+reg or reg+mem. 
2515 /// 2516 2517 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 2518 /// classes below 2519 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 2520 SDNode OpNode, X86SchedWriteSizes sched> { 2521 let Predicates = [HasAVX, NoVLX] in { 2522 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2523 VR128, v4f32, f128mem, loadv4f32, 2524 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG; 2525 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2526 VR128, v2f64, f128mem, loadv2f64, 2527 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG; 2528 2529 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 2530 OpNode, VR256, v8f32, f256mem, loadv8f32, 2531 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2532 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 2533 OpNode, VR256, v4f64, f256mem, loadv4f64, 2534 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2535 } 2536 2537 let Constraints = "$src1 = $dst" in { 2538 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2539 v4f32, f128mem, memopv4f32, SSEPackedSingle, 2540 sched.PS.XMM>, PS; 2541 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2542 v2f64, f128mem, memopv2f64, SSEPackedDouble, 2543 sched.PD.XMM>, PD; 2544 } 2545 } 2546 2547 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2548 X86SchedWriteSizes sched> { 2549 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2550 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, 2551 XS, VEX_4V, VEX_LIG, VEX_WIG; 2552 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2553 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, 2554 XD, VEX_4V, VEX_LIG, VEX_WIG; 2555 2556 let Constraints = "$src1 = $dst" in { 2557 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2558 OpNode, FR32, f32mem, SSEPackedSingle, 2559 sched.PS.Scl>, XS; 2560 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2561 OpNode, FR64, f64mem, SSEPackedDouble, 2562 sched.PD.Scl>, XD; 2563 } 2564 } 2565 2566 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 2567 SDPatternOperator OpNode, 2568 X86SchedWriteSizes sched> { 2569 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2570 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2571 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; 2572 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2573 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2574 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; 2575 2576 let Constraints = "$src1 = $dst" in { 2577 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2578 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2579 SSEPackedSingle, sched.PS.Scl>, XS; 2580 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2581 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2582 SSEPackedDouble, sched.PD.Scl>, XD; 2583 } 2584 } 2585 2586 // Binary Arithmetic instructions 2587 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>, 2588 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>, 2589 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>; 2590 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>, 2591 basic_sse12_fp_binop_s<0x59, "mul", fmul, 
SchedWriteFMulSizes>, 2592 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>; 2593 let isCommutable = 0 in { 2594 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>, 2595 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>, 2596 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>; 2597 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>, 2598 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>, 2599 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; 2600 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2601 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2602 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; 2603 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2604 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2605 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; 2606 } 2607 2608 let isCodeGenOnly = 1 in { 2609 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>, 2610 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>; 2611 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>, 2612 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>; 2613 } 2614 2615 // Patterns used to select SSE scalar fp arithmetic instructions from 2616 // either: 2617 // 2618 // (1) a scalar fp operation followed by a blend 2619 // 2620 // The effect is that the backend no longer emits unnecessary vector 2621 // insert instructions immediately after SSE scalar fp instructions 2622 // like addss or mulss. 2623 // 2624 // For example, given the following code: 2625 // __m128 foo(__m128 A, __m128 B) { 2626 // A[0] += B[0]; 2627 // return A; 2628 // } 2629 // 2630 // Previously we generated: 2631 // addss %xmm0, %xmm1 2632 // movss %xmm1, %xmm0 2633 // 2634 // We now generate: 2635 // addss %xmm1, %xmm0 2636 // 2637 // (2) a vector packed single/double fp operation followed by a vector insert 2638 // 2639 // The effect is that the backend converts the packed fp instruction 2640 // followed by a vector insert into a single SSE scalar fp instruction. 2641 // 2642 // For example, given the following code: 2643 // __m128 foo(__m128 A, __m128 B) { 2644 // __m128 C = A + B; 2645 // return (__m128) {c[0], a[1], a[2], a[3]}; 2646 // } 2647 // 2648 // Previously we generated: 2649 // addps %xmm0, %xmm1 2650 // movss %xmm1, %xmm0 2651 // 2652 // We now generate: 2653 // addss %xmm1, %xmm0 2654 2655 // TODO: Some canonicalization in lowering would simplify the number of 2656 // patterns we have to try to match. 2657 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, 2658 ValueType VT, ValueType EltTy, 2659 RegisterClass RC, Predicate BasePredicate> { 2660 let Predicates = [BasePredicate] in { 2661 // extracted scalar math op with insert via movss/movsd 2662 def : Pat<(VT (Move (VT VR128:$dst), 2663 (VT (scalar_to_vector 2664 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2665 RC:$src))))), 2666 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2667 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2668 } 2669 2670 // Repeat for AVX versions of the instructions. 
2671 let Predicates = [UseAVX] in { 2672 // extracted scalar math op with insert via movss/movsd 2673 def : Pat<(VT (Move (VT VR128:$dst), 2674 (VT (scalar_to_vector 2675 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2676 RC:$src))))), 2677 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2678 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2679 } 2680 } 2681 2682 defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2683 defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2684 defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2685 defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>; 2686 2687 defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2688 defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2689 defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2690 defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; 2691 2692 /// Unop Arithmetic 2693 /// In addition, we also have a special variant of the scalar form here to 2694 /// represent the associated intrinsic operation. This form is unlike the 2695 /// plain scalar form, in that it takes an entire vector (instead of a 2696 /// scalar) and leaves the top elements undefined. 2697 /// 2698 /// And, we have a special variant form for a full-vector intrinsic form. 2699 2700 /// sse_fp_unop_s - SSE1 unops in scalar form 2701 /// For the non-AVX defs, we need $src1 to be tied to $dst because 2702 /// the HW instructions are 2 operand / destructive. 2703 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2704 ValueType ScalarVT, X86MemOperand x86memop, 2705 Operand intmemop, SDNode OpNode, Domain d, 2706 X86FoldableSchedWrite sched, Predicate target> { 2707 let hasSideEffects = 0 in { 2708 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2709 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2710 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2711 Requires<[target]>; 2712 let mayLoad = 1 in 2713 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2714 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2715 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2716 Sched<[sched.Folded, ReadAfterLd]>, 2717 Requires<[target, OptForSize]>; 2718 2719 let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in { 2720 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2721 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2722 Sched<[sched]>; 2723 let mayLoad = 1 in 2724 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2725 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2726 Sched<[sched.Folded, ReadAfterLd]>; 2727 } 2728 } 2729 2730 } 2731 2732 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, 2733 ComplexPattern int_cpat, Intrinsic Intr, 2734 Predicate target, string Suffix> { 2735 let Predicates = [target] in { 2736 // These are unary operations, but they are modeled as having 2 source operands 2737 // because the high elements of the destination are unchanged in SSE. 2738 def : Pat<(Intr VR128:$src), 2739 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2740 } 2741 // We don't want to fold scalar loads into these instructions unless 2742 // optimizing for size. 
This is because the folded instruction will have a 2743 // partial register update, while the unfolded sequence will not, e.g. 2744 // movss mem, %xmm0 2745 // rcpss %xmm0, %xmm0 2746 // which has a clobber before the rcp, vs. 2747 // rcpss mem, %xmm0 2748 let Predicates = [target, OptForSize] in { 2749 def : Pat<(Intr int_cpat:$src2), 2750 (!cast<Instruction>(NAME#m_Int) 2751 (vt (IMPLICIT_DEF)), addr:$src2)>; 2752 } 2753 } 2754 2755 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, 2756 Intrinsic Intr, Predicate target> { 2757 let Predicates = [target] in { 2758 def : Pat<(Intr VR128:$src), 2759 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2760 VR128:$src)>; 2761 } 2762 let Predicates = [target, OptForSize] in { 2763 def : Pat<(Intr int_cpat:$src2), 2764 (!cast<Instruction>(NAME#m_Int) 2765 (vt (IMPLICIT_DEF)), addr:$src2)>; 2766 } 2767 } 2768 2769 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2770 ValueType ScalarVT, X86MemOperand x86memop, 2771 Operand intmemop, SDNode OpNode, Domain d, 2772 X86FoldableSchedWrite sched, Predicate target> { 2773 let hasSideEffects = 0 in { 2774 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2775 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2776 [], d>, Sched<[sched]>; 2777 let mayLoad = 1 in 2778 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2779 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2780 [], d>, Sched<[sched.Folded, ReadAfterLd]>; 2781 let isCodeGenOnly = 1, ExeDomain = d in { 2782 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2783 (ins VR128:$src1, VR128:$src2), 2784 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2785 []>, Sched<[sched]>; 2786 let mayLoad = 1 in 2787 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2788 (ins VR128:$src1, intmemop:$src2), 2789 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2790 []>, Sched<[sched.Folded, ReadAfterLd]>; 2791 } 2792 } 2793 2794 // We don't want to fold scalar loads into these instructions unless 2795 // optimizing for size. This is because the folded instruction will have a 2796 // partial register update, while the unfolded sequence will not, e.g. 2797 // vmovss mem, %xmm0 2798 // vrcpss %xmm0, %xmm0, %xmm0 2799 // which has a clobber before the rcp, vs. 2800 // vrcpss mem, %xmm0, %xmm0 2801 // TODO: In theory, we could fold the load, and avoid the stall caused by 2802 // the partial register store, either in BreakFalseDeps or with smarter RA. 2803 let Predicates = [target] in { 2804 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2805 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2806 } 2807 let Predicates = [target, OptForSize] in { 2808 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2809 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2810 addr:$src)>; 2811 } 2812 } 2813 2814 /// sse1_fp_unop_p - SSE1 unops in packed form. 
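/// These operate on every lane of the vector independently; e.g. SQRTPS
/// computes, roughly, dst[i] = sqrt(src[i]) for i = 0..3.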
2815 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 2816 X86SchedWriteWidths sched, list<Predicate> prds> { 2817 let Predicates = prds in { 2818 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2819 !strconcat("v", OpcodeStr, 2820 "ps\t{$src, $dst|$dst, $src}"), 2821 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2822 VEX, Sched<[sched.XMM]>, VEX_WIG; 2823 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2824 !strconcat("v", OpcodeStr, 2825 "ps\t{$src, $dst|$dst, $src}"), 2826 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2827 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2828 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2829 !strconcat("v", OpcodeStr, 2830 "ps\t{$src, $dst|$dst, $src}"), 2831 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2832 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2833 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2834 !strconcat("v", OpcodeStr, 2835 "ps\t{$src, $dst|$dst, $src}"), 2836 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2837 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2838 } 2839 2840 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2841 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2842 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2843 Sched<[sched.XMM]>; 2844 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2845 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2846 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2847 Sched<[sched.XMM.Folded]>; 2848 } 2849 2850 /// sse2_fp_unop_p - SSE2 unops in vector forms. 2851 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2852 SDNode OpNode, X86SchedWriteWidths sched> { 2853 let Predicates = [HasAVX, NoVLX] in { 2854 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2855 !strconcat("v", OpcodeStr, 2856 "pd\t{$src, $dst|$dst, $src}"), 2857 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2858 VEX, Sched<[sched.XMM]>, VEX_WIG; 2859 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2860 !strconcat("v", OpcodeStr, 2861 "pd\t{$src, $dst|$dst, $src}"), 2862 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2863 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2864 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2865 !strconcat("v", OpcodeStr, 2866 "pd\t{$src, $dst|$dst, $src}"), 2867 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 2868 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2869 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2870 !strconcat("v", OpcodeStr, 2871 "pd\t{$src, $dst|$dst, $src}"), 2872 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 2873 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2874 } 2875 2876 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2877 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2878 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2879 Sched<[sched.XMM]>; 2880 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2881 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2882 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 2883 Sched<[sched.XMM.Folded]>; 2884 } 2885 2886 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, 2887 X86SchedWriteWidths sched, Predicate AVXTarget> { 2888 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2889 
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2890 UseSSE1, "SS">, XS; 2891 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2892 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2893 AVXTarget>, 2894 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 2895 } 2896 2897 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2898 X86SchedWriteWidths sched, Predicate AVXTarget> { 2899 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, 2900 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 2901 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, 2902 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 2903 XS, VEX_4V, VEX_LIG, VEX_WIG; 2904 } 2905 2906 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2907 X86SchedWriteWidths sched, Predicate AVXTarget> { 2908 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, 2909 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 2910 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, 2911 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 2912 XD, VEX_4V, VEX_LIG, VEX_WIG; 2913 } 2914 2915 // Square root. 2916 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>, 2917 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 2918 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>, 2919 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>; 2920 2921 // Reciprocal approximations. Note that these typically require refinement 2922 // in order to obtain suitable precision. 2923 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 2924 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 2925 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 2926 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 2927 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 2928 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 2929 2930 // There is no f64 version of the reciprocal approximation instructions. 2931 2932 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 2933 ValueType VT, Predicate BasePredicate> { 2934 let Predicates = [BasePredicate] in { 2935 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2936 (OpNode (extractelt VT:$src, 0))))), 2937 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2938 } 2939 2940 // Repeat for AVX versions of the instructions. 2941 let Predicates = [UseAVX] in { 2942 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2943 (OpNode (extractelt VT:$src, 0))))), 2944 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2945 } 2946 } 2947 2948 multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 2949 ValueType VT, bits<8> ImmV, 2950 Predicate BasePredicate> { 2951 let Predicates = [BasePredicate] in { 2952 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2953 (OpNode (extractelt VT:$src, 0))))), 2954 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; 2955 } 2956 2957 // Repeat for AVX versions of the instructions. 
2958 let Predicates = [UseAVX] in { 2959 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2960 (OpNode (extractelt VT:$src, 0))))), 2961 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; 2962 } 2963 } 2964 2965 defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 2966 defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 2967 2968 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 2969 SDNode Move, ValueType VT, 2970 Predicate BasePredicate> { 2971 let Predicates = [BasePredicate] in { 2972 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 2973 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2974 } 2975 2976 // Repeat for AVX versions of the instructions. 2977 let Predicates = [HasAVX] in { 2978 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 2979 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2980 } 2981 } 2982 2983 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 2984 v4f32, UseSSE1>; 2985 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 2986 v4f32, UseSSE1>; 2987 2988 2989 //===----------------------------------------------------------------------===// 2990 // SSE 1 & 2 - Non-temporal stores 2991 //===----------------------------------------------------------------------===// 2992 2993 let AddedComplexity = 400 in { // Prefer non-temporal versions 2994 let Predicates = [HasAVX, NoVLX] in { 2995 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 2996 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 2997 (ins f128mem:$dst, VR128:$src), 2998 "movntps\t{$src, $dst|$dst, $src}", 2999 [(alignednontemporalstore (v4f32 VR128:$src), 3000 addr:$dst)]>, VEX, VEX_WIG; 3001 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3002 (ins f128mem:$dst, VR128:$src), 3003 "movntpd\t{$src, $dst|$dst, $src}", 3004 [(alignednontemporalstore (v2f64 VR128:$src), 3005 addr:$dst)]>, VEX, VEX_WIG; 3006 } // SchedRW 3007 3008 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3009 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3010 (ins f256mem:$dst, VR256:$src), 3011 "movntps\t{$src, $dst|$dst, $src}", 3012 [(alignednontemporalstore (v8f32 VR256:$src), 3013 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3014 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3015 (ins f256mem:$dst, VR256:$src), 3016 "movntpd\t{$src, $dst|$dst, $src}", 3017 [(alignednontemporalstore (v4f64 VR256:$src), 3018 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3019 } // SchedRW 3020 3021 let ExeDomain = SSEPackedInt in { 3022 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3023 (ins i128mem:$dst, VR128:$src), 3024 "movntdq\t{$src, $dst|$dst, $src}", 3025 [(alignednontemporalstore (v2i64 VR128:$src), 3026 addr:$dst)]>, VEX, VEX_WIG, 3027 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3028 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3029 (ins i256mem:$dst, VR256:$src), 3030 "movntdq\t{$src, $dst|$dst, $src}", 3031 [(alignednontemporalstore (v4i64 VR256:$src), 3032 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3033 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3034 } // ExeDomain 3035 } // Predicates 3036 3037 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3038 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3039 "movntps\t{$src, $dst|$dst, $src}", 3040 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3041 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3042 "movntpd\t{$src, $dst|$dst, $src}", 3043 [(alignednontemporalstore(v2f64 VR128:$src), 
addr:$dst)]>; 3044 } // SchedRW 3045 3046 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3047 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3048 "movntdq\t{$src, $dst|$dst, $src}", 3049 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3050 3051 let SchedRW = [WriteStoreNT] in { 3052 // There is no AVX form for instructions below this point 3053 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3054 "movnti{l}\t{$src, $dst|$dst, $src}", 3055 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3056 PS, Requires<[HasSSE2]>; 3057 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3058 "movnti{q}\t{$src, $dst|$dst, $src}", 3059 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3060 PS, Requires<[HasSSE2]>; 3061 } // SchedRW = [WriteStoreNT] 3062 3063 let Predicates = [HasAVX, NoVLX] in { 3064 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3065 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3066 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3067 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3068 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3069 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3070 3071 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3072 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3073 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3074 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3075 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3076 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3077 } 3078 3079 let Predicates = [UseSSE2] in { 3080 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3081 (MOVNTDQmr addr:$dst, VR128:$src)>; 3082 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3083 (MOVNTDQmr addr:$dst, VR128:$src)>; 3084 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3085 (MOVNTDQmr addr:$dst, VR128:$src)>; 3086 } 3087 3088 } // AddedComplexity 3089 3090 //===----------------------------------------------------------------------===// 3091 // SSE 1 & 2 - Prefetch and memory fence 3092 //===----------------------------------------------------------------------===// 3093 3094 // Prefetch intrinsic. 3095 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3096 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3097 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB; 3098 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3099 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB; 3100 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3101 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB; 3102 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3103 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB; 3104 } 3105 3106 // FIXME: How should flush instruction be modeled? 3107 let SchedRW = [WriteLoad] in { 3108 // Flush cache 3109 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3110 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3111 PS, Requires<[HasSSE2]>; 3112 } 3113 3114 let SchedRW = [WriteNop] in { 3115 // Pause. This "instruction" is encoded as "rep; nop", so even though it 3116 // was introduced with SSE2, it's backward compatible. 
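// (Concretely, the encoding is an F3 prefix on NOP, i.e. the bytes F3 90,
// which older processors simply execute as a plain NOP.)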
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
             PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
             PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
             PS, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                    "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                    VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                    "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                    VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                TB, Sched<[WriteLDMXCSR]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                TB, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                         VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                         VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst,
$src}", []>, 3190 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3191 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; 3192 } 3193 3194 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3195 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3196 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3197 "movdqa\t{$src, $dst|$dst, $src}", 3198 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3199 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3200 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3201 "movdqa\t{$src, $dst|$dst, $src}", []>, 3202 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3203 VEX, VEX_L, VEX_WIG; 3204 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3205 "vmovdqu\t{$src, $dst|$dst, $src}", 3206 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3207 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3208 XS, VEX, VEX_WIG; 3209 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3210 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3211 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3212 XS, VEX, VEX_L, VEX_WIG; 3213 } 3214 3215 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3216 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3217 (ins i128mem:$dst, VR128:$src), 3218 "movdqa\t{$src, $dst|$dst, $src}", 3219 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3220 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3221 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3222 (ins i256mem:$dst, VR256:$src), 3223 "movdqa\t{$src, $dst|$dst, $src}", []>, 3224 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3225 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3226 "vmovdqu\t{$src, $dst|$dst, $src}", 3227 [(store (v2i64 VR128:$src), addr:$dst)]>, 3228 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3229 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3230 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3231 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3232 } 3233 3234 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3235 let hasSideEffects = 0 in { 3236 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3237 "movdqa\t{$src, $dst|$dst, $src}", []>; 3238 3239 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3240 "movdqu\t{$src, $dst|$dst, $src}", []>, 3241 XS, Requires<[UseSSE2]>; 3242 } 3243 3244 // For Disassembler 3245 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3246 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3247 "movdqa\t{$src, $dst|$dst, $src}", []>, 3248 FoldGenData<"MOVDQArr">; 3249 3250 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3251 "movdqu\t{$src, $dst|$dst, $src}", []>, 3252 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3253 } 3254 } // SchedRW 3255 3256 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3257 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3258 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3259 "movdqa\t{$src, $dst|$dst, $src}", 3260 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3261 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3262 "movdqu\t{$src, $dst|$dst, $src}", 3263 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3264 XS, Requires<[UseSSE2]>; 3265 } 3266 3267 let mayStore = 1, hasSideEffects = 0, 3268 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3269 def MOVDQAmr : 
PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3270 "movdqa\t{$src, $dst|$dst, $src}", 3271 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3272 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3273 "movdqu\t{$src, $dst|$dst, $src}", 3274 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3275 XS, Requires<[UseSSE2]>; 3276 } 3277 3278 } // ExeDomain = SSEPackedInt 3279 3280 // Aliases to help the assembler pick two byte VEX encodings by swapping the 3281 // operands relative to the normal instructions to use VEX.R instead of VEX.B. 3282 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", 3283 (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>; 3284 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", 3285 (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>; 3286 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", 3287 (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>; 3288 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", 3289 (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>; 3290 3291 // Reversed version with ".s" suffix for GAS compatibility. 3292 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3293 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3294 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3295 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3296 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3297 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3298 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3299 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3300 3301 // Reversed version with ".s" suffix for GAS compatibility. 3302 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3303 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3304 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3305 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3306 3307 let Predicates = [HasAVX, NoVLX] in { 3308 // Additional patterns for other integer sizes. 
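// The VMOVDQA/VMOVDQU store definitions above only carry v2i64 patterns, so
// map stores of the remaining 128-bit integer element types onto them here.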
3309 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3310 (VMOVDQAmr addr:$dst, VR128:$src)>; 3311 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3312 (VMOVDQAmr addr:$dst, VR128:$src)>; 3313 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3314 (VMOVDQAmr addr:$dst, VR128:$src)>; 3315 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3316 (VMOVDQUmr addr:$dst, VR128:$src)>; 3317 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3318 (VMOVDQUmr addr:$dst, VR128:$src)>; 3319 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3320 (VMOVDQUmr addr:$dst, VR128:$src)>; 3321 } 3322 3323 //===---------------------------------------------------------------------===// 3324 // SSE2 - Packed Integer Arithmetic Instructions 3325 //===---------------------------------------------------------------------===// 3326 3327 let ExeDomain = SSEPackedInt in { // SSE integer instructions 3328 3329 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3330 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3331 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3332 PatFrag memop_frag, X86MemOperand x86memop, 3333 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3334 let isCommutable = 1 in 3335 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3336 (ins RC:$src1, RC:$src2), 3337 !if(Is2Addr, 3338 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3339 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3340 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3341 Sched<[sched]>; 3342 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3343 (ins RC:$src1, x86memop:$src2), 3344 !if(Is2Addr, 3345 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3346 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3347 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3348 (bitconvert (memop_frag addr:$src2)))))]>, 3349 Sched<[sched.Folded, ReadAfterLd]>; 3350 } 3351 } // ExeDomain = SSEPackedInt 3352 3353 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 3354 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3355 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3356 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3357 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3358 SchedWriteVecALU, 1, NoVLX>; 3359 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3360 SchedWriteVecALU, 1, NoVLX>; 3361 defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8, 3362 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3363 defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16, 3364 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3365 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8, 3366 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3367 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16, 3368 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3369 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3370 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3371 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3372 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3373 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3374 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3375 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3376 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3377 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3378 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3379 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3380 SchedWriteVecALU, 0, NoVLX>; 
3381 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3382 SchedWriteVecALU, 0, NoVLX>; 3383 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8, 3384 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3385 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16, 3386 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3387 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, 3388 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3389 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, 3390 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3391 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3392 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3393 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3394 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3395 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3396 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3397 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3398 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3399 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 3400 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3401 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 3402 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3403 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3404 SchedWriteVecIMul, 1, NoVLX>; 3405 3406 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3407 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3408 loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, 3409 VEX_4V, VEX_WIG; 3410 3411 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3412 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3413 VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM, 3414 0>, VEX_4V, VEX_L, VEX_WIG; 3415 let Constraints = "$src1 = $dst" in 3416 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3417 memopv2i64, i128mem, SchedWriteVecIMul.XMM>; 3418 3419 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3420 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, 3421 loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>, 3422 VEX_4V, VEX_WIG; 3423 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3424 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3425 loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>, 3426 VEX_4V, VEX_L, VEX_WIG; 3427 let Constraints = "$src1 = $dst" in 3428 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3429 memopv2i64, i128mem, SchedWritePSADBW.XMM>; 3430 3431 //===---------------------------------------------------------------------===// 3432 // SSE2 - Packed Integer Logical Instructions 3433 //===---------------------------------------------------------------------===// 3434 3435 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3436 string OpcodeStr, SDNode OpNode, 3437 SDNode OpNode2, RegisterClass RC, 3438 X86FoldableSchedWrite sched, 3439 X86FoldableSchedWrite schedImm, 3440 ValueType DstVT, ValueType SrcVT, 3441 PatFrag ld_frag, bit Is2Addr = 1> { 3442 // src2 is always 128-bit 3443 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3444 (ins RC:$src1, VR128:$src2), 3445 !if(Is2Addr, 3446 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3447 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3448 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3449 Sched<[sched]>; 3450 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3451 (ins RC:$src1, i128mem:$src2), 3452 
!if(Is2Addr, 3453 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3454 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3455 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3456 (SrcVT (bitconvert (ld_frag addr:$src2))))))]>, 3457 Sched<[sched.Folded, ReadAfterLd]>; 3458 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3459 (ins RC:$src1, u8imm:$src2), 3460 !if(Is2Addr, 3461 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3462 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3463 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, 3464 Sched<[schedImm]>; 3465 } 3466 3467 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3468 string OpcodeStr, SDNode OpNode, 3469 SDNode OpNode2, ValueType DstVT128, 3470 ValueType DstVT256, ValueType SrcVT, 3471 X86SchedWriteWidths sched, 3472 X86SchedWriteWidths schedImm, Predicate prd> { 3473 let Predicates = [HasAVX, prd] in 3474 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3475 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3476 DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG; 3477 let Predicates = [HasAVX2, prd] in 3478 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3479 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3480 DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L, 3481 VEX_WIG; 3482 let Constraints = "$src1 = $dst" in 3483 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3484 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3485 memopv2i64>; 3486 } 3487 3488 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3489 SDNode OpNode, RegisterClass RC, ValueType VT, 3490 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3491 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3492 !if(Is2Addr, 3493 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3494 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3495 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>, 3496 Sched<[sched]>; 3497 } 3498 3499 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, 3500 SDNode OpNode, X86SchedWriteWidths sched> { 3501 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3502 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3503 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3504 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3505 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3506 VR256, v32i8, sched.YMM, 0>, 3507 VEX_4V, VEX_L, VEX_WIG; 3508 let Constraints = "$src1 = $dst" in 3509 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3510 sched.XMM>; 3511 } 3512 3513 let ExeDomain = SSEPackedInt in { 3514 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3515 v8i16, v16i16, v8i16, SchedWriteVecShift, 3516 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3517 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3518 v4i32, v8i32, v4i32, SchedWriteVecShift, 3519 SchedWriteVecShiftImm, NoVLX>; 3520 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3521 v2i64, v4i64, v2i64, SchedWriteVecShift, 3522 SchedWriteVecShiftImm, NoVLX>; 3523 3524 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3525 v8i16, v16i16, v8i16, SchedWriteVecShift, 3526 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3527 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 
3528 v4i32, v8i32, v4i32, SchedWriteVecShift, 3529 SchedWriteVecShiftImm, NoVLX>; 3530 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3531 v2i64, v4i64, v2i64, SchedWriteVecShift, 3532 SchedWriteVecShiftImm, NoVLX>; 3533 3534 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3535 v8i16, v16i16, v8i16, SchedWriteVecShift, 3536 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3537 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3538 v4i32, v8i32, v4i32, SchedWriteVecShift, 3539 SchedWriteVecShiftImm, NoVLX>; 3540 3541 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3542 SchedWriteShuffle>; 3543 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3544 SchedWriteShuffle>; 3545 } // ExeDomain = SSEPackedInt 3546 3547 //===---------------------------------------------------------------------===// 3548 // SSE2 - Packed Integer Comparison Instructions 3549 //===---------------------------------------------------------------------===// 3550 3551 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3552 SchedWriteVecALU, 1, TruePredicate>; 3553 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3554 SchedWriteVecALU, 1, TruePredicate>; 3555 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3556 SchedWriteVecALU, 1, TruePredicate>; 3557 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3558 SchedWriteVecALU, 0, TruePredicate>; 3559 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3560 SchedWriteVecALU, 0, TruePredicate>; 3561 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3562 SchedWriteVecALU, 0, TruePredicate>; 3563 3564 //===---------------------------------------------------------------------===// 3565 // SSE2 - Packed Integer Shuffle Instructions 3566 //===---------------------------------------------------------------------===// 3567 3568 let ExeDomain = SSEPackedInt in { 3569 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3570 SDNode OpNode, X86SchedWriteWidths sched, 3571 Predicate prd> { 3572 let Predicates = [HasAVX, prd] in { 3573 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3574 (ins VR128:$src1, u8imm:$src2), 3575 !strconcat("v", OpcodeStr, 3576 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3577 [(set VR128:$dst, 3578 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, 3579 VEX, Sched<[sched.XMM]>, VEX_WIG; 3580 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3581 (ins i128mem:$src1, u8imm:$src2), 3582 !strconcat("v", OpcodeStr, 3583 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3584 [(set VR128:$dst, 3585 (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), 3586 (i8 imm:$src2))))]>, VEX, 3587 Sched<[sched.XMM.Folded]>, VEX_WIG; 3588 } 3589 3590 let Predicates = [HasAVX2, prd] in { 3591 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3592 (ins VR256:$src1, u8imm:$src2), 3593 !strconcat("v", OpcodeStr, 3594 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3595 [(set VR256:$dst, 3596 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>, 3597 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3598 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3599 (ins i256mem:$src1, u8imm:$src2), 3600 !strconcat("v", OpcodeStr, 3601 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3602 [(set VR256:$dst, 3603 (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), 3604 (i8 imm:$src2))))]>, VEX, VEX_L, 3605 Sched<[sched.YMM.Folded]>, 
VEX_WIG; 3606 } 3607 3608 let Predicates = [UseSSE2] in { 3609 def ri : Ii8<0x70, MRMSrcReg, 3610 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3611 !strconcat(OpcodeStr, 3612 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3613 [(set VR128:$dst, 3614 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, 3615 Sched<[sched.XMM]>; 3616 def mi : Ii8<0x70, MRMSrcMem, 3617 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3618 !strconcat(OpcodeStr, 3619 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3620 [(set VR128:$dst, 3621 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), 3622 (i8 imm:$src2))))]>, 3623 Sched<[sched.XMM.Folded]>; 3624 } 3625 } 3626 } // ExeDomain = SSEPackedInt 3627 3628 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3629 SchedWriteShuffle, NoVLX>, PD; 3630 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3631 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3632 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3633 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3634 3635 //===---------------------------------------------------------------------===// 3636 // Packed Integer Pack Instructions (SSE & AVX) 3637 //===---------------------------------------------------------------------===// 3638 3639 let ExeDomain = SSEPackedInt in { 3640 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3641 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3642 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3643 PatFrag ld_frag, bit Is2Addr = 1> { 3644 def rr : PDI<opc, MRMSrcReg, 3645 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3646 !if(Is2Addr, 3647 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3648 !strconcat(OpcodeStr, 3649 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3650 [(set RC:$dst, 3651 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3652 Sched<[sched]>; 3653 def rm : PDI<opc, MRMSrcMem, 3654 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3655 !if(Is2Addr, 3656 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3657 !strconcat(OpcodeStr, 3658 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3659 [(set RC:$dst, 3660 (OutVT (OpNode (ArgVT RC:$src1), 3661 (bitconvert (ld_frag addr:$src2)))))]>, 3662 Sched<[sched.Folded, ReadAfterLd]>; 3663 } 3664 3665 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3666 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3667 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3668 PatFrag ld_frag, bit Is2Addr = 1> { 3669 def rr : SS48I<opc, MRMSrcReg, 3670 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3671 !if(Is2Addr, 3672 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3673 !strconcat(OpcodeStr, 3674 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3675 [(set RC:$dst, 3676 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3677 Sched<[sched]>; 3678 def rm : SS48I<opc, MRMSrcMem, 3679 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3680 !if(Is2Addr, 3681 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3682 !strconcat(OpcodeStr, 3683 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3684 [(set RC:$dst, 3685 (OutVT (OpNode (ArgVT RC:$src1), 3686 (bitconvert (ld_frag addr:$src2)))))]>, 3687 Sched<[sched.Folded, ReadAfterLd]>; 3688 } 3689 3690 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3691 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3692 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3693 VEX_4V, VEX_WIG; 3694 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3695 i128mem, SchedWriteShuffle.XMM, 
loadv2i64, 0>, 3696 VEX_4V, VEX_WIG; 3697 3698 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3699 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3700 VEX_4V, VEX_WIG; 3701 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3702 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3703 VEX_4V; 3704 } 3705 3706 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3707 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3708 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3709 VEX_4V, VEX_L, VEX_WIG; 3710 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3711 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3712 VEX_4V, VEX_L, VEX_WIG; 3713 3714 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3715 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3716 VEX_4V, VEX_L, VEX_WIG; 3717 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3718 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3719 VEX_4V, VEX_L; 3720 } 3721 3722 let Constraints = "$src1 = $dst" in { 3723 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3724 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3725 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3726 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3727 3728 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3729 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3730 3731 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3732 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3733 } 3734 } // ExeDomain = SSEPackedInt 3735 3736 //===---------------------------------------------------------------------===// 3737 // SSE2 - Packed Integer Unpack Instructions 3738 //===---------------------------------------------------------------------===// 3739 3740 let ExeDomain = SSEPackedInt in { 3741 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3742 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3743 X86FoldableSchedWrite sched, PatFrag ld_frag, 3744 bit Is2Addr = 1> { 3745 def rr : PDI<opc, MRMSrcReg, 3746 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3747 !if(Is2Addr, 3748 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3749 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3750 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3751 Sched<[sched]>; 3752 def rm : PDI<opc, MRMSrcMem, 3753 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3754 !if(Is2Addr, 3755 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3756 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3757 [(set RC:$dst, (vt (OpNode RC:$src1, 3758 (bitconvert (ld_frag addr:$src2)))))]>, 3759 Sched<[sched.Folded, ReadAfterLd]>; 3760 } 3761 3762 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3763 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3764 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3765 VEX_4V, VEX_WIG; 3766 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3767 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3768 VEX_4V, VEX_WIG; 3769 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3770 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3771 VEX_4V, VEX_WIG; 3772 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3773 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3774 VEX_4V, VEX_WIG; 3775 } 3776 3777 let 
Predicates = [HasAVX, NoVLX] in { 3778 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3779 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3780 VEX_4V, VEX_WIG; 3781 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3782 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3783 VEX_4V, VEX_WIG; 3784 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3785 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3786 VEX_4V, VEX_WIG; 3787 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3788 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, 3789 VEX_4V, VEX_WIG; 3790 } 3791 3792 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3793 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3794 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3795 VEX_4V, VEX_L, VEX_WIG; 3796 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3797 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3798 VEX_4V, VEX_L, VEX_WIG; 3799 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3800 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3801 VEX_4V, VEX_L, VEX_WIG; 3802 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3803 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3804 VEX_4V, VEX_L, VEX_WIG; 3805 } 3806 3807 let Predicates = [HasAVX2, NoVLX] in { 3808 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3809 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3810 VEX_4V, VEX_L, VEX_WIG; 3811 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3812 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3813 VEX_4V, VEX_L, VEX_WIG; 3814 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3815 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3816 VEX_4V, VEX_L, VEX_WIG; 3817 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3818 i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, 3819 VEX_4V, VEX_L, VEX_WIG; 3820 } 3821 3822 let Constraints = "$src1 = $dst" in { 3823 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3824 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3825 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3826 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3827 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3828 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3829 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3830 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3831 3832 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3833 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3834 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3835 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3836 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3837 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3838 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3839 i128mem, SchedWriteShuffle.XMM, memopv2i64>; 3840 } 3841 } // ExeDomain = SSEPackedInt 3842 3843 //===---------------------------------------------------------------------===// 3844 // SSE2 - Packed Integer Extract and Insert 3845 //===---------------------------------------------------------------------===// 3846 3847 let ExeDomain = SSEPackedInt in { 3848 multiclass sse2_pinsrw<bit Is2Addr = 1> { 3849 def rr : Ii8<0xC4, 
MRMSrcReg, 3850 (outs VR128:$dst), (ins VR128:$src1, 3851 GR32orGR64:$src2, u8imm:$src3), 3852 !if(Is2Addr, 3853 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3854 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3855 [(set VR128:$dst, 3856 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 3857 Sched<[WriteVecInsert]>; 3858 def rm : Ii8<0xC4, MRMSrcMem, 3859 (outs VR128:$dst), (ins VR128:$src1, 3860 i16mem:$src2, u8imm:$src3), 3861 !if(Is2Addr, 3862 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3863 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3864 [(set VR128:$dst, 3865 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3866 imm:$src3))]>, 3867 Sched<[WriteVecInsertLd, ReadAfterLd]>; 3868 } 3869 3870 // Extract 3871 let Predicates = [HasAVX, NoBWI] in 3872 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 3873 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3874 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3875 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3876 imm:$src2))]>, 3877 PD, VEX, Sched<[WriteVecExtract]>; 3878 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 3879 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3880 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3881 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3882 imm:$src2))]>, 3883 Sched<[WriteVecExtract]>; 3884 3885 // Insert 3886 let Predicates = [HasAVX, NoBWI] in 3887 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; 3888 3889 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 3890 defm PINSRW : sse2_pinsrw, PD; 3891 3892 } // ExeDomain = SSEPackedInt 3893 3894 //===---------------------------------------------------------------------===// 3895 // SSE2 - Packed Mask Creation 3896 //===---------------------------------------------------------------------===// 3897 3898 let ExeDomain = SSEPackedInt in { 3899 3900 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3901 (ins VR128:$src), 3902 "pmovmskb\t{$src, $dst|$dst, $src}", 3903 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3904 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; 3905 3906 let Predicates = [HasAVX2] in { 3907 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3908 (ins VR256:$src), 3909 "pmovmskb\t{$src, $dst|$dst, $src}", 3910 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, 3911 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; 3912 } 3913 3914 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 3915 "pmovmskb\t{$src, $dst|$dst, $src}", 3916 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3917 Sched<[WriteVecMOVMSK]>; 3918 3919 } // ExeDomain = SSEPackedInt 3920 3921 //===---------------------------------------------------------------------===// 3922 // SSE2 - Conditional Store 3923 //===---------------------------------------------------------------------===// 3924 3925 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3926 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 3927 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 3928 (ins VR128:$src, VR128:$mask), 3929 "maskmovdqu\t{$mask, $src|$src, $mask}", 3930 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, 3931 VEX, VEX_WIG; 3932 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 3933 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), 3934 (ins VR128:$src, VR128:$mask), 3935 "maskmovdqu\t{$mask, $src|$src, $mask}", 3936 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, 3937 VEX, VEX_WIG; 3938 3939 
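// Note (for reference): MASKMOVDQU/VMASKMOVDQU perform a byte-granular,
// non-temporal masked store of $src to the address held in EDI/RDI: byte i of
// $src is written only when the most significant bit of byte i of $mask is
// set. This corresponds to the SSE2 intrinsic _mm_maskmoveu_si128, e.g.
//   _mm_maskmoveu_si128(data, mask, (char *)p);  // p needs no alignment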
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 3940 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 3941 "maskmovdqu\t{$mask, $src|$src, $mask}", 3942 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; 3943 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 3944 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 3945 "maskmovdqu\t{$mask, $src|$src, $mask}", 3946 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; 3947 3948 } // ExeDomain = SSEPackedInt 3949 3950 //===---------------------------------------------------------------------===// 3951 // SSE2 - Move Doubleword/Quadword 3952 //===---------------------------------------------------------------------===// 3953 3954 //===---------------------------------------------------------------------===// 3955 // Move Int Doubleword to Packed Double Int 3956 // 3957 let ExeDomain = SSEPackedInt in { 3958 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 3959 "movd\t{$src, $dst|$dst, $src}", 3960 [(set VR128:$dst, 3961 (v4i32 (scalar_to_vector GR32:$src)))]>, 3962 VEX, Sched<[WriteVecMoveFromGpr]>; 3963 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 3964 "movd\t{$src, $dst|$dst, $src}", 3965 [(set VR128:$dst, 3966 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 3967 VEX, Sched<[WriteVecLoad]>; 3968 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 3969 "movq\t{$src, $dst|$dst, $src}", 3970 [(set VR128:$dst, 3971 (v2i64 (scalar_to_vector GR64:$src)))]>, 3972 VEX, Sched<[WriteVecMoveFromGpr]>; 3973 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 3974 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 3975 "movq\t{$src, $dst|$dst, $src}", []>, 3976 VEX, Sched<[WriteVecLoad]>; 3977 let isCodeGenOnly = 1 in 3978 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 3979 "movq\t{$src, $dst|$dst, $src}", 3980 [(set FR64:$dst, (bitconvert GR64:$src))]>, 3981 VEX, Sched<[WriteVecMoveFromGpr]>; 3982 3983 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 3984 "movd\t{$src, $dst|$dst, $src}", 3985 [(set VR128:$dst, 3986 (v4i32 (scalar_to_vector GR32:$src)))]>, 3987 Sched<[WriteVecMoveFromGpr]>; 3988 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 3989 "movd\t{$src, $dst|$dst, $src}", 3990 [(set VR128:$dst, 3991 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 3992 Sched<[WriteVecLoad]>; 3993 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 3994 "movq\t{$src, $dst|$dst, $src}", 3995 [(set VR128:$dst, 3996 (v2i64 (scalar_to_vector GR64:$src)))]>, 3997 Sched<[WriteVecMoveFromGpr]>; 3998 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 3999 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4000 "movq\t{$src, $dst|$dst, $src}", []>, 4001 Sched<[WriteVecLoad]>; 4002 let isCodeGenOnly = 1 in 4003 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4004 "movq\t{$src, $dst|$dst, $src}", 4005 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4006 Sched<[WriteVecMoveFromGpr]>; 4007 } // ExeDomain = SSEPackedInt 4008 4009 //===---------------------------------------------------------------------===// 4010 // Move Int Doubleword to Single Scalar 4011 // 4012 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4013 def VMOVDI2SSrr : VS2I<0x6E, 
MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4014 "movd\t{$src, $dst|$dst, $src}", 4015 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4016 VEX, Sched<[WriteVecMoveFromGpr]>; 4017 4018 def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), 4019 "movd\t{$src, $dst|$dst, $src}", 4020 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, 4021 VEX, Sched<[WriteVecLoad]>; 4022 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4023 "movd\t{$src, $dst|$dst, $src}", 4024 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4025 Sched<[WriteVecMoveFromGpr]>; 4026 4027 def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), 4028 "movd\t{$src, $dst|$dst, $src}", 4029 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, 4030 Sched<[WriteVecLoad]>; 4031 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4032 4033 //===---------------------------------------------------------------------===// 4034 // Move Packed Doubleword Int to Packed Double Int 4035 // 4036 let ExeDomain = SSEPackedInt in { 4037 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4038 "movd\t{$src, $dst|$dst, $src}", 4039 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4040 (iPTR 0)))]>, VEX, 4041 Sched<[WriteVecMoveToGpr]>; 4042 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), 4043 (ins i32mem:$dst, VR128:$src), 4044 "movd\t{$src, $dst|$dst, $src}", 4045 [(store (i32 (extractelt (v4i32 VR128:$src), 4046 (iPTR 0))), addr:$dst)]>, 4047 VEX, Sched<[WriteVecStore]>; 4048 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4049 "movd\t{$src, $dst|$dst, $src}", 4050 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4051 (iPTR 0)))]>, 4052 Sched<[WriteVecMoveToGpr]>; 4053 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), 4054 "movd\t{$src, $dst|$dst, $src}", 4055 [(store (i32 (extractelt (v4i32 VR128:$src), 4056 (iPTR 0))), addr:$dst)]>, 4057 Sched<[WriteVecStore]>; 4058 } // ExeDomain = SSEPackedInt 4059 4060 //===---------------------------------------------------------------------===// 4061 // Move Packed Doubleword Int first element to Doubleword Int 4062 // 4063 let ExeDomain = SSEPackedInt in { 4064 let SchedRW = [WriteVecMoveToGpr] in { 4065 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4066 "movq\t{$src, $dst|$dst, $src}", 4067 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4068 (iPTR 0)))]>, 4069 VEX; 4070 4071 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4072 "movq\t{$src, $dst|$dst, $src}", 4073 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4074 (iPTR 0)))]>; 4075 } //SchedRW 4076 4077 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4078 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), 4079 (ins i64mem:$dst, VR128:$src), 4080 "movq\t{$src, $dst|$dst, $src}", []>, 4081 VEX, Sched<[WriteVecStore]>; 4082 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4083 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4084 "movq\t{$src, $dst|$dst, $src}", []>, 4085 Sched<[WriteVecStore]>; 4086 } // ExeDomain = SSEPackedInt 4087 4088 //===---------------------------------------------------------------------===// 4089 // Bitcast FR64 <-> GR64 4090 // 4091 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4092 let Predicates = [UseAVX] in 4093 def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), 4094 "movq\t{$src, 
$dst|$dst, $src}", 4095 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, 4096 VEX, Sched<[WriteVecLoad]>; 4097 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4098 "movq\t{$src, $dst|$dst, $src}", 4099 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4100 VEX, Sched<[WriteVecMoveToGpr]>; 4101 def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), 4102 "movq\t{$src, $dst|$dst, $src}", 4103 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, 4104 VEX, Sched<[WriteVecStore]>; 4105 4106 def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), 4107 "movq\t{$src, $dst|$dst, $src}", 4108 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, 4109 Sched<[WriteVecLoad]>; 4110 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4111 "movq\t{$src, $dst|$dst, $src}", 4112 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4113 Sched<[WriteVecMoveToGpr]>; 4114 def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), 4115 "movq\t{$src, $dst|$dst, $src}", 4116 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, 4117 Sched<[WriteVecStore]>; 4118 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4119 4120 //===---------------------------------------------------------------------===// 4121 // Move Scalar Single to Double Int 4122 // 4123 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4124 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4125 "movd\t{$src, $dst|$dst, $src}", 4126 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4127 VEX, Sched<[WriteVecMoveToGpr]>; 4128 def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), 4129 "movd\t{$src, $dst|$dst, $src}", 4130 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, 4131 VEX, Sched<[WriteVecStore]>; 4132 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4133 "movd\t{$src, $dst|$dst, $src}", 4134 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4135 Sched<[WriteVecMoveToGpr]>; 4136 def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), 4137 "movd\t{$src, $dst|$dst, $src}", 4138 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, 4139 Sched<[WriteVecStore]>; 4140 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4141 4142 let Predicates = [UseAVX] in { 4143 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4144 (VMOVDI2PDIrr GR32:$src)>; 4145 4146 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4147 (VMOV64toPQIrr GR64:$src)>; 4148 4149 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, 4150 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), 4151 (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>; 4152 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. 4153 // These instructions also write zeros in the high part of a 256-bit register. 
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  def : Pat<(v8i32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
// "movq" due to a MacOS parsing limitation. We add these aliases so that old
// assembly can still be parsed.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4197 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4198 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4199 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4200 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4201 4202 //===---------------------------------------------------------------------===// 4203 // SSE2 - Move Quadword 4204 //===---------------------------------------------------------------------===// 4205 4206 //===---------------------------------------------------------------------===// 4207 // Move Quadword Int to Packed Quadword Int 4208 // 4209 4210 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in { 4211 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4212 "vmovq\t{$src, $dst|$dst, $src}", 4213 [(set VR128:$dst, 4214 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, 4215 VEX, Requires<[UseAVX]>, VEX_WIG; 4216 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4217 "movq\t{$src, $dst|$dst, $src}", 4218 [(set VR128:$dst, 4219 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, 4220 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix 4221 } // ExeDomain, SchedRW 4222 4223 //===---------------------------------------------------------------------===// 4224 // Move Packed Quadword Int to Quadword Int 4225 // 4226 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { 4227 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4228 "movq\t{$src, $dst|$dst, $src}", 4229 [(store (i64 (extractelt (v2i64 VR128:$src), 4230 (iPTR 0))), addr:$dst)]>, 4231 VEX, VEX_WIG; 4232 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4233 "movq\t{$src, $dst|$dst, $src}", 4234 [(store (i64 (extractelt (v2i64 VR128:$src), 4235 (iPTR 0))), addr:$dst)]>; 4236 } // ExeDomain, SchedRW 4237 4238 // For disassembler only 4239 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 4240 SchedRW = [SchedWriteVecLogic.XMM] in { 4241 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4242 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG; 4243 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4244 "movq\t{$src, $dst|$dst, $src}", []>; 4245 } 4246 4247 // Aliases to help the assembler pick two byte VEX encodings by swapping the 4248 // operands relative to the normal instructions to use VEX.R instead of VEX.B. 
4249 def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}", 4250 (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>; 4251 4252 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", 4253 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4254 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", 4255 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4256 4257 let Predicates = [UseAVX] in { 4258 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), 4259 (VMOVQI2PQIrm addr:$src)>; 4260 def : Pat<(v2i64 (X86vzload addr:$src)), 4261 (VMOVQI2PQIrm addr:$src)>; 4262 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, 4263 (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), 4264 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; 4265 def : Pat<(v4i64 (X86vzload addr:$src)), 4266 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; 4267 } 4268 4269 let Predicates = [UseSSE2] in { 4270 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), 4271 (MOVQI2PQIrm addr:$src)>; 4272 def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; 4273 } 4274 4275 //===---------------------------------------------------------------------===// 4276 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in 4277 // IA32 document. movq xmm1, xmm2 does clear the high bits. 4278 // 4279 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { 4280 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4281 "vmovq\t{$src, $dst|$dst, $src}", 4282 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4283 XS, VEX, Requires<[UseAVX]>, VEX_WIG; 4284 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4285 "movq\t{$src, $dst|$dst, $src}", 4286 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4287 XS, Requires<[UseSSE2]>; 4288 } // ExeDomain, SchedRW 4289 4290 let Predicates = [UseAVX] in { 4291 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4292 (VMOVZPQILo2PQIrr VR128:$src)>; 4293 } 4294 let Predicates = [UseSSE2] in { 4295 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4296 (MOVZPQILo2PQIrr VR128:$src)>; 4297 } 4298 4299 //===---------------------------------------------------------------------===// 4300 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP 4301 //===---------------------------------------------------------------------===// 4302 4303 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 4304 ValueType vt, RegisterClass RC, PatFrag mem_frag, 4305 X86MemOperand x86memop, X86FoldableSchedWrite sched> { 4306 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 4307 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4308 [(set RC:$dst, (vt (OpNode RC:$src)))]>, 4309 Sched<[sched]>; 4310 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 4311 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4312 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, 4313 Sched<[sched.Folded]>; 4314 } 4315 4316 let Predicates = [HasAVX, NoVLX] in { 4317 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4318 v4f32, VR128, loadv4f32, f128mem, 4319 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4320 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4321 v4f32, VR128, loadv4f32, f128mem, 4322 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4323 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4324 v8f32, VR256, loadv8f32, f256mem, 4325 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4326 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, 
X86Movsldup, "vmovsldup", 4327 v8f32, VR256, loadv8f32, f256mem, 4328 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4329 } 4330 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 4331 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4332 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 4333 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4334 4335 let Predicates = [HasAVX, NoVLX] in { 4336 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4337 (VMOVSHDUPrr VR128:$src)>; 4338 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), 4339 (VMOVSHDUPrm addr:$src)>; 4340 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4341 (VMOVSLDUPrr VR128:$src)>; 4342 def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), 4343 (VMOVSLDUPrm addr:$src)>; 4344 def : Pat<(v8i32 (X86Movshdup VR256:$src)), 4345 (VMOVSHDUPYrr VR256:$src)>; 4346 def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), 4347 (VMOVSHDUPYrm addr:$src)>; 4348 def : Pat<(v8i32 (X86Movsldup VR256:$src)), 4349 (VMOVSLDUPYrr VR256:$src)>; 4350 def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), 4351 (VMOVSLDUPYrm addr:$src)>; 4352 } 4353 4354 let Predicates = [UseSSE3] in { 4355 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4356 (MOVSHDUPrr VR128:$src)>; 4357 def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), 4358 (MOVSHDUPrm addr:$src)>; 4359 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4360 (MOVSLDUPrr VR128:$src)>; 4361 def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), 4362 (MOVSLDUPrm addr:$src)>; 4363 } 4364 4365 //===---------------------------------------------------------------------===// 4366 // SSE3 - Replicate Double FP - MOVDDUP 4367 //===---------------------------------------------------------------------===// 4368 4369 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> { 4370 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4371 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4372 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>, 4373 Sched<[sched.XMM]>; 4374 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 4375 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4376 [(set VR128:$dst, 4377 (v2f64 (X86Movddup 4378 (scalar_to_vector (loadf64 addr:$src)))))]>, 4379 Sched<[sched.XMM.Folded]>; 4380 } 4381 4382 // FIXME: Merge with above classes when there are patterns for the ymm version 4383 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> { 4384 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 4385 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4386 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, 4387 Sched<[sched.YMM]>; 4388 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 4389 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4390 [(set VR256:$dst, 4391 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, 4392 Sched<[sched.YMM.Folded]>; 4393 } 4394 4395 let Predicates = [HasAVX, NoVLX] in { 4396 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>, 4397 VEX, VEX_WIG; 4398 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>, 4399 VEX, VEX_L, VEX_WIG; 4400 } 4401 4402 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; 4403 4404 4405 let Predicates = [HasAVX, NoVLX] in { 4406 def : Pat<(X86Movddup (loadv2f64 addr:$src)), 4407 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4408 } 4409 4410 let Predicates = [UseSSE3] in { 4411 
// No need for aligned memory as this only loads 64-bits. 4412 def : Pat<(X86Movddup (loadv2f64 addr:$src)), 4413 (MOVDDUPrm addr:$src)>; 4414 } 4415 4416 //===---------------------------------------------------------------------===// 4417 // SSE3 - Move Unaligned Integer 4418 //===---------------------------------------------------------------------===// 4419 4420 let Predicates = [HasAVX] in { 4421 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4422 "vlddqu\t{$src, $dst|$dst, $src}", 4423 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4424 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 4425 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 4426 "vlddqu\t{$src, $dst|$dst, $src}", 4427 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, 4428 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG; 4429 } // Predicates 4430 4431 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4432 "lddqu\t{$src, $dst|$dst, $src}", 4433 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4434 Sched<[SchedWriteVecMoveLS.XMM.RM]>; 4435 4436 //===---------------------------------------------------------------------===// 4437 // SSE3 - Arithmetic 4438 //===---------------------------------------------------------------------===// 4439 4440 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, 4441 X86MemOperand x86memop, X86FoldableSchedWrite sched, 4442 PatFrag ld_frag, bit Is2Addr = 1> { 4443 def rr : I<0xD0, MRMSrcReg, 4444 (outs RC:$dst), (ins RC:$src1, RC:$src2), 4445 !if(Is2Addr, 4446 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4447 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4448 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>, 4449 Sched<[sched]>; 4450 def rm : I<0xD0, MRMSrcMem, 4451 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4452 !if(Is2Addr, 4453 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4454 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4455 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, 4456 Sched<[sched.Folded, ReadAfterLd]>; 4457 } 4458 4459 let Predicates = [HasAVX] in { 4460 let ExeDomain = SSEPackedSingle in { 4461 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, 4462 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, 4463 XD, VEX_4V, VEX_WIG; 4464 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, 4465 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, 4466 XD, VEX_4V, VEX_L, VEX_WIG; 4467 } 4468 let ExeDomain = SSEPackedDouble in { 4469 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, 4470 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, 4471 PD, VEX_4V, VEX_WIG; 4472 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, 4473 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, 4474 PD, VEX_4V, VEX_L, VEX_WIG; 4475 } 4476 } 4477 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { 4478 let ExeDomain = SSEPackedSingle in 4479 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, 4480 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; 4481 let ExeDomain = SSEPackedDouble in 4482 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, 4483 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; 4484 } 4485 4486 //===---------------------------------------------------------------------===// 4487 // SSE3 Instructions 4488 //===---------------------------------------------------------------------===// 4489 4490 // Horizontal ops 4491 multiclass 
S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4492 X86MemOperand x86memop, SDNode OpNode, 4493 X86FoldableSchedWrite sched, PatFrag ld_frag, 4494 bit Is2Addr = 1> { 4495 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4496 !if(Is2Addr, 4497 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4498 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4499 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4500 Sched<[sched]>; 4501 4502 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4503 !if(Is2Addr, 4504 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4505 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4506 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4507 Sched<[sched.Folded, ReadAfterLd]>; 4508 } 4509 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4510 X86MemOperand x86memop, SDNode OpNode, 4511 X86FoldableSchedWrite sched, PatFrag ld_frag, 4512 bit Is2Addr = 1> { 4513 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4514 !if(Is2Addr, 4515 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4516 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4517 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4518 Sched<[sched]>; 4519 4520 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4521 !if(Is2Addr, 4522 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4523 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4524 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4525 Sched<[sched.Folded, ReadAfterLd]>; 4526 } 4527 4528 let Predicates = [HasAVX] in { 4529 let ExeDomain = SSEPackedSingle in { 4530 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, 4531 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4532 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, 4533 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4534 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, 4535 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4536 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, 4537 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4538 } 4539 let ExeDomain = SSEPackedDouble in { 4540 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, 4541 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4542 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, 4543 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4544 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, 4545 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4546 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, 4547 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4548 } 4549 } 4550 4551 let Constraints = "$src1 = $dst" in { 4552 let ExeDomain = SSEPackedSingle in { 4553 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, 4554 WriteFHAdd, memopv4f32>; 4555 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, 4556 WriteFHAdd, memopv4f32>; 4557 } 4558 let ExeDomain = SSEPackedDouble in { 4559 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, 4560 WriteFHAdd, memopv2f64>; 4561 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, 4562 WriteFHAdd, memopv2f64>; 4563 } 4564 } 4565 4566 
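// Note (for reference): the X86fhadd/X86fhsub nodes matched above provide the
// SSE3 horizontal-arithmetic semantics. For example, "haddps %xmm2, %xmm1"
// computes
//   xmm1 = { xmm1[0]+xmm1[1], xmm1[2]+xmm1[3], xmm2[0]+xmm2[1], xmm2[2]+xmm2[3] }
// which is what the C intrinsic _mm_hadd_ps(a, b) from <pmmintrin.h> lowers to.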
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op (256-bit) whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
                  Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                             loadv2i64>, VEX, VEX_WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                             loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                             loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                               VEX, VEX_L, VEX_WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                               VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                               VEX, VEX_L, VEX_WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memopv2i64>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set
RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>, 4648 Sched<[sched]>; 4649 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), 4650 (ins RC:$src1, x86memop:$src2), 4651 !if(Is2Addr, 4652 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4653 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4654 [(set RC:$dst, 4655 (DstVT (OpNode (OpVT RC:$src1), 4656 (bitconvert (memop_frag addr:$src2)))))]>, 4657 Sched<[sched.Folded, ReadAfterLd]>; 4658 } 4659 4660 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 4661 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, 4662 Intrinsic IntId128, X86FoldableSchedWrite sched, 4663 PatFrag ld_frag, bit Is2Addr = 1> { 4664 let isCommutable = 1 in 4665 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4666 (ins VR128:$src1, VR128:$src2), 4667 !if(Is2Addr, 4668 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4669 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4670 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, 4671 Sched<[sched]>; 4672 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4673 (ins VR128:$src1, i128mem:$src2), 4674 !if(Is2Addr, 4675 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4676 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4677 [(set VR128:$dst, 4678 (IntId128 VR128:$src1, 4679 (bitconvert (ld_frag addr:$src2))))]>, 4680 Sched<[sched.Folded, ReadAfterLd]>; 4681 } 4682 4683 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, 4684 Intrinsic IntId256, 4685 X86FoldableSchedWrite sched> { 4686 let isCommutable = 1 in 4687 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4688 (ins VR256:$src1, VR256:$src2), 4689 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4690 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, 4691 Sched<[sched]>; 4692 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4693 (ins VR256:$src1, i256mem:$src2), 4694 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4695 [(set VR256:$dst, 4696 (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, 4697 Sched<[sched.Folded, ReadAfterLd]>; 4698 } 4699 4700 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4701 let isCommutable = 0 in { 4702 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, 4703 VR128, loadv2i64, i128mem, 4704 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4705 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, 4706 v16i8, VR128, loadv2i64, i128mem, 4707 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4708 } 4709 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, 4710 VR128, loadv2i64, i128mem, 4711 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4712 } 4713 4714 let ImmT = NoImm, Predicates = [HasAVX] in { 4715 let isCommutable = 0 in { 4716 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, 4717 loadv2i64, i128mem, 4718 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4719 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, 4720 loadv2i64, i128mem, 4721 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4722 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, 4723 loadv2i64, i128mem, 4724 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4725 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, 4726 loadv2i64, i128mem, 4727 SchedWritePHAdd.XMM, 0>, VEX_4V; 4728 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", 4729 
int_x86_ssse3_psign_b_128, 4730 SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4731 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", 4732 int_x86_ssse3_psign_w_128, 4733 SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4734 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", 4735 int_x86_ssse3_psign_d_128, 4736 SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4737 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", 4738 int_x86_ssse3_phadd_sw_128, 4739 SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4740 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", 4741 int_x86_ssse3_phsub_sw_128, 4742 SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; 4743 } 4744 } 4745 4746 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4747 let isCommutable = 0 in { 4748 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, 4749 VR256, loadv4i64, i256mem, 4750 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4751 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, 4752 v32i8, VR256, loadv4i64, i256mem, 4753 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4754 } 4755 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, 4756 VR256, loadv4i64, i256mem, 4757 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4758 } 4759 4760 let ImmT = NoImm, Predicates = [HasAVX2] in { 4761 let isCommutable = 0 in { 4762 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, 4763 VR256, loadv4i64, i256mem, 4764 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4765 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, 4766 loadv4i64, i256mem, 4767 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4768 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, 4769 VR256, loadv4i64, i256mem, 4770 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4771 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, 4772 loadv4i64, i256mem, 4773 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; 4774 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, 4775 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4776 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, 4777 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4778 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, 4779 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4780 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", 4781 int_x86_avx2_phadd_sw, 4782 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4783 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", 4784 int_x86_avx2_phsub_sw, 4785 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4786 } 4787 } 4788 4789 // None of these have i8 immediate fields. 
4790 let ImmT = NoImm, Constraints = "$src1 = $dst" in { 4791 let isCommutable = 0 in { 4792 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, 4793 memopv2i64, i128mem, SchedWritePHAdd.XMM>; 4794 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, 4795 memopv2i64, i128mem, SchedWritePHAdd.XMM>; 4796 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, 4797 memopv2i64, i128mem, SchedWritePHAdd.XMM>; 4798 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, 4799 memopv2i64, i128mem, SchedWritePHAdd.XMM>; 4800 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, 4801 SchedWriteVecALU.XMM, memopv2i64>; 4802 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128, 4803 SchedWriteVecALU.XMM, memopv2i64>; 4804 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128, 4805 SchedWriteVecALU.XMM, memopv2i64>; 4806 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, 4807 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>; 4808 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", 4809 int_x86_ssse3_phadd_sw_128, 4810 SchedWritePHAdd.XMM, memopv2i64>; 4811 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", 4812 int_x86_ssse3_phsub_sw_128, 4813 SchedWritePHAdd.XMM, memopv2i64>; 4814 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, 4815 v16i8, VR128, memopv2i64, i128mem, 4816 SchedWriteVecIMul.XMM>; 4817 } 4818 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, 4819 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>; 4820 } 4821 4822 //===---------------------------------------------------------------------===// 4823 // SSSE3 - Packed Align Instruction Patterns 4824 //===---------------------------------------------------------------------===// 4825 4826 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, 4827 PatFrag memop_frag, X86MemOperand x86memop, 4828 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 4829 let hasSideEffects = 0 in { 4830 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst), 4831 (ins RC:$src1, RC:$src2, u8imm:$src3), 4832 !if(Is2Addr, 4833 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 4834 !strconcat(asm, 4835 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 4836 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>, 4837 Sched<[sched]>; 4838 let mayLoad = 1 in 4839 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst), 4840 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 4841 !if(Is2Addr, 4842 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 4843 !strconcat(asm, 4844 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 4845 [(set RC:$dst, (VT (X86PAlignr RC:$src1, 4846 (bitconvert (memop_frag addr:$src2)), 4847 (i8 imm:$src3))))]>, 4848 Sched<[sched.Folded, ReadAfterLd]>; 4849 } 4850 } 4851 4852 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 4853 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem, 4854 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4855 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 4856 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem, 4857 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4858 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in 4859 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem, 4860 SchedWriteShuffle.XMM>; 4861 4862 //===---------------------------------------------------------------------===// 
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                      [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                      Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3]>;

let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                   VR128, VR128, SchedWriteShuffle.XMM>,
                                   VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, WriteShuffle256>,
                                     VEX, VEX_L, VEX_WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
4946 let Predicates = [HasAVX, NoVLX] in { 4947 def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), 4948 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; 4949 def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), 4950 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; 4951 4952 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), 4953 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; 4954 def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), 4955 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; 4956 4957 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), 4958 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; 4959 } 4960 4961 // Simple Register-Memory patterns 4962 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4963 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4964 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4965 } 4966 let Predicates = [HasAVX, NoVLX] in { 4967 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4968 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4969 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4970 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 4971 4972 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 4973 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 4974 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 4975 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 4976 4977 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 4978 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 4979 } 4980 4981 // AVX2 Register-Memory patterns 4982 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4983 def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 4984 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4985 def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 4986 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4987 def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 4988 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4989 } 4990 let Predicates = [HasAVX, NoVLX] in { 4991 def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 4992 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4993 def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 4994 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4995 def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 4996 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4997 def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 4998 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4999 5000 def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5001 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5002 def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 5003 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5004 def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5005 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5006 def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5007 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5008 5009 def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 5010 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5011 def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 5012 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5013 def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 5014 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5015 5016 def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5017 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5018 def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 5019 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5020 def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 5021 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5022 def : Pat<(v4i64 (ExtOp 
(bc_v8i16 (loadv2i64 addr:$src)))), 5023 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5024 5025 def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), 5026 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5027 def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), 5028 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5029 def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), 5030 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5031 } 5032 } 5033 5034 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; 5035 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; 5036 5037 // SSE4.1/AVX patterns. 5038 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, 5039 SDNode ExtOp> { 5040 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5041 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), 5042 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; 5043 } 5044 let Predicates = [HasAVX, NoVLX] in { 5045 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), 5046 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; 5047 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), 5048 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; 5049 5050 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), 5051 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; 5052 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), 5053 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; 5054 5055 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), 5056 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; 5057 } 5058 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5059 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5060 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5061 } 5062 let Predicates = [HasAVX, NoVLX] in { 5063 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5064 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5065 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5066 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5067 5068 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5069 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5070 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5071 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5072 5073 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5074 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5075 } 5076 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5077 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5078 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5079 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5080 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5081 def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), 5082 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5083 def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5084 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5085 def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5086 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5087 } 5088 let Predicates = [HasAVX, NoVLX] in { 5089 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5090 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5091 def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), 5092 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5093 def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5094 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5095 def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5096 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5097 5098 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), 5099 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5100 def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 
addr:$src)))), 5101 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5102 def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), 5103 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5104 def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), 5105 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5106 5107 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5108 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5109 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5110 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5111 def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 5112 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5113 def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 5114 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5115 def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 5116 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5117 5118 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5119 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5120 def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), 5121 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5122 def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), 5123 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5124 def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), 5125 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5126 5127 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5128 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5129 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5130 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5131 def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), 5132 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5133 def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), 5134 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5135 def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), 5136 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5137 } 5138 } 5139 5140 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>; 5141 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>; 5142 5143 let Predicates = [UseSSE41] in { 5144 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>; 5145 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>; 5146 } 5147 5148 //===----------------------------------------------------------------------===// 5149 // SSE4.1 - Extract Instructions 5150 //===----------------------------------------------------------------------===// 5151 5152 /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem 5153 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { 5154 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5155 (ins VR128:$src1, u8imm:$src2), 5156 !strconcat(OpcodeStr, 5157 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5158 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), 5159 imm:$src2))]>, 5160 Sched<[WriteVecExtract]>; 5161 let hasSideEffects = 0, mayStore = 1 in 5162 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5163 (ins i8mem:$dst, VR128:$src1, u8imm:$src2), 5164 !strconcat(OpcodeStr, 5165 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5166 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), 5167 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5168 } 5169 5170 let Predicates = [HasAVX, NoBWI] in 5171 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; 5172 5173 defm PEXTRB : SS41I_extract8<0x14, "pextrb">; 5174 5175 5176 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination 5177 multiclass SS41I_extract16<bits<8> opc, string 
OpcodeStr> { 5178 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 5179 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5180 (ins VR128:$src1, u8imm:$src2), 5181 !strconcat(OpcodeStr, 5182 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 5183 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>; 5184 5185 let hasSideEffects = 0, mayStore = 1 in 5186 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5187 (ins i16mem:$dst, VR128:$src1, u8imm:$src2), 5188 !strconcat(OpcodeStr, 5189 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5190 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), 5191 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5192 } 5193 5194 let Predicates = [HasAVX, NoBWI] in 5195 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; 5196 5197 defm PEXTRW : SS41I_extract16<0x15, "pextrw">; 5198 5199 5200 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5201 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { 5202 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), 5203 (ins VR128:$src1, u8imm:$src2), 5204 !strconcat(OpcodeStr, 5205 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5206 [(set GR32:$dst, 5207 (extractelt (v4i32 VR128:$src1), imm:$src2))]>, 5208 Sched<[WriteVecExtract]>; 5209 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5210 (ins i32mem:$dst, VR128:$src1, u8imm:$src2), 5211 !strconcat(OpcodeStr, 5212 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5213 [(store (extractelt (v4i32 VR128:$src1), imm:$src2), 5214 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5215 } 5216 5217 let Predicates = [HasAVX, NoDQI] in 5218 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; 5219 5220 defm PEXTRD : SS41I_extract32<0x16, "pextrd">; 5221 5222 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5223 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { 5224 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), 5225 (ins VR128:$src1, u8imm:$src2), 5226 !strconcat(OpcodeStr, 5227 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5228 [(set GR64:$dst, 5229 (extractelt (v2i64 VR128:$src1), imm:$src2))]>, 5230 Sched<[WriteVecExtract]>; 5231 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5232 (ins i64mem:$dst, VR128:$src1, u8imm:$src2), 5233 !strconcat(OpcodeStr, 5234 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5235 [(store (extractelt (v2i64 VR128:$src1), imm:$src2), 5236 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5237 } 5238 5239 let Predicates = [HasAVX, NoDQI] in 5240 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; 5241 5242 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W; 5243 5244 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory 5245 /// destination 5246 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { 5247 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5248 (ins VR128:$src1, u8imm:$src2), 5249 !strconcat(OpcodeStr, 5250 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5251 [(set GR32orGR64:$dst, 5252 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, 5253 Sched<[WriteVecExtract]>; 5254 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5255 (ins f32mem:$dst, VR128:$src1, u8imm:$src2), 5256 !strconcat(OpcodeStr, 5257 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5258 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), 5259 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5260 } 5261 5262 let ExeDomain = SSEPackedSingle in { 5263 let Predicates = [UseAVX] in 5264 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG; 5265 
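// EXTRACTPS moves the selected f32 lane as a 32-bit value to a GPR or to
// memory, which is why SS41I_extractf32 above models it as an i32
// extractelt from the bitconverted vector rather than as an f32 extract.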
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; 5266 } 5267 5268 // Also match an EXTRACTPS store when the store is done as f32 instead of i32. 5269 def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), 5270 imm:$src2))), 5271 addr:$dst), 5272 (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, 5273 Requires<[HasAVX]>; 5274 def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), 5275 imm:$src2))), 5276 addr:$dst), 5277 (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, 5278 Requires<[UseSSE41]>; 5279 5280 //===----------------------------------------------------------------------===// 5281 // SSE4.1 - Insert Instructions 5282 //===----------------------------------------------------------------------===// 5283 5284 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { 5285 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5286 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), 5287 !if(Is2Addr, 5288 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5289 !strconcat(asm, 5290 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5291 [(set VR128:$dst, 5292 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 5293 Sched<[WriteVecInsert]>; 5294 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5295 (ins VR128:$src1, i8mem:$src2, u8imm:$src3), 5296 !if(Is2Addr, 5297 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5298 !strconcat(asm, 5299 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5300 [(set VR128:$dst, 5301 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), 5302 imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; 5303 } 5304 5305 let Predicates = [HasAVX, NoBWI] in 5306 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; 5307 let Constraints = "$src1 = $dst" in 5308 defm PINSRB : SS41I_insert8<0x20, "pinsrb">; 5309 5310 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { 5311 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5312 (ins VR128:$src1, GR32:$src2, u8imm:$src3), 5313 !if(Is2Addr, 5314 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5315 !strconcat(asm, 5316 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5317 [(set VR128:$dst, 5318 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, 5319 Sched<[WriteVecInsert]>; 5320 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5321 (ins VR128:$src1, i32mem:$src2, u8imm:$src3), 5322 !if(Is2Addr, 5323 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5324 !strconcat(asm, 5325 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5326 [(set VR128:$dst, 5327 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), 5328 imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; 5329 } 5330 5331 let Predicates = [HasAVX, NoDQI] in 5332 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; 5333 let Constraints = "$src1 = $dst" in 5334 defm PINSRD : SS41I_insert32<0x22, "pinsrd">; 5335 5336 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { 5337 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5338 (ins VR128:$src1, GR64:$src2, u8imm:$src3), 5339 !if(Is2Addr, 5340 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5341 !strconcat(asm, 5342 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5343 [(set VR128:$dst, 5344 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, 5345 Sched<[WriteVecInsert]>; 5346 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5347 (ins VR128:$src1, i64mem:$src2, u8imm:$src3), 
5348 !if(Is2Addr, 5349 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5350 !strconcat(asm, 5351 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5352 [(set VR128:$dst, 5353 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), 5354 imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; 5355 } 5356 5357 let Predicates = [HasAVX, NoDQI] in 5358 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; 5359 let Constraints = "$src1 = $dst" in 5360 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; 5361 5362 // insertps has a few different modes, there's the first two here below which 5363 // are optimized inserts that won't zero arbitrary elements in the destination 5364 // vector. The next one matches the intrinsic and could zero arbitrary elements 5365 // in the target vector. 5366 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { 5367 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5368 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 5369 !if(Is2Addr, 5370 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5371 !strconcat(asm, 5372 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5373 [(set VR128:$dst, 5374 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>, 5375 Sched<[SchedWriteFShuffle.XMM]>; 5376 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5377 (ins VR128:$src1, f32mem:$src2, u8imm:$src3), 5378 !if(Is2Addr, 5379 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5380 !strconcat(asm, 5381 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5382 [(set VR128:$dst, 5383 (X86insertps VR128:$src1, 5384 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 5385 imm:$src3))]>, 5386 Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; 5387 } 5388 5389 let ExeDomain = SSEPackedSingle in { 5390 let Predicates = [UseAVX] in 5391 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, 5392 VEX_4V, VEX_WIG; 5393 let Constraints = "$src1 = $dst" in 5394 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; 5395 } 5396 5397 let Predicates = [UseAVX] in { 5398 // If we're inserting an element from a vbroadcast of a load, fold the 5399 // load into the X86insertps instruction. 5400 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 5401 (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), 5402 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 5403 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 5404 (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), 5405 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 5406 } 5407 5408 //===----------------------------------------------------------------------===// 5409 // SSE4.1 - Round Instructions 5410 //===----------------------------------------------------------------------===// 5411 5412 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, 5413 X86MemOperand x86memop, RegisterClass RC, 5414 ValueType VT, PatFrag mem_frag, SDNode OpNode, 5415 X86FoldableSchedWrite sched> { 5416 // Intrinsic operation, reg. 
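// The $src2 immediate is the SSE4.1 rounding-control field: bits [1:0] pick
// the rounding mode (00 = nearest, 01 = down, 10 = up, 11 = truncate),
// bit 2 selects MXCSR.RC instead, and bit 3 suppresses precision exceptions.
// The ffloor/fceil/ftrunc/fnearbyint/frint patterns below rely on this
// encoding (0x9, 0xA, 0xB, 0xC and 0x4 respectively).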
5417 // Vector intrinsic operation, reg 5418 def r : SS4AIi8<opc, MRMSrcReg, 5419 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), 5420 !strconcat(OpcodeStr, 5421 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5422 [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>, 5423 Sched<[sched]>; 5424 5425 // Vector intrinsic operation, mem 5426 def m : SS4AIi8<opc, MRMSrcMem, 5427 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), 5428 !strconcat(OpcodeStr, 5429 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5430 [(set RC:$dst, 5431 (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>, 5432 Sched<[sched.Folded]>; 5433 } 5434 5435 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, 5436 string OpcodeStr, X86FoldableSchedWrite sched> { 5437 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { 5438 def SSr : SS4AIi8<opcss, MRMSrcReg, 5439 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), 5440 !strconcat(OpcodeStr, 5441 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5442 []>, Sched<[sched]>; 5443 5444 let mayLoad = 1 in 5445 def SSm : SS4AIi8<opcss, MRMSrcMem, 5446 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), 5447 !strconcat(OpcodeStr, 5448 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5449 []>, Sched<[sched.Folded, ReadAfterLd]>; 5450 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5451 5452 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { 5453 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5454 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), 5455 !strconcat(OpcodeStr, 5456 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5457 []>, Sched<[sched]>; 5458 5459 let mayLoad = 1 in 5460 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5461 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), 5462 !strconcat(OpcodeStr, 5463 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5464 []>, Sched<[sched.Folded, ReadAfterLd]>; 5465 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5466 } 5467 5468 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, 5469 string OpcodeStr, X86FoldableSchedWrite sched> { 5470 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { 5471 def SSr : SS4AIi8<opcss, MRMSrcReg, 5472 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), 5473 !strconcat(OpcodeStr, 5474 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5475 []>, Sched<[sched]>; 5476 5477 let mayLoad = 1 in 5478 def SSm : SS4AIi8<opcss, MRMSrcMem, 5479 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), 5480 !strconcat(OpcodeStr, 5481 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5482 []>, Sched<[sched.Folded, ReadAfterLd]>; 5483 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5484 5485 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { 5486 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5487 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), 5488 !strconcat(OpcodeStr, 5489 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5490 []>, Sched<[sched]>; 5491 5492 let mayLoad = 1 in 5493 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5494 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), 5495 !strconcat(OpcodeStr, 5496 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5497 []>, Sched<[sched.Folded, ReadAfterLd]>; 5498 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5499 } 5500 5501 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, 5502 string OpcodeStr, X86FoldableSchedWrite sched, 5503 ValueType VT32, ValueType VT64, 5504 SDNode OpNode, bit Is2Addr = 1> { 5505 let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 
in { 5506 def SSr_Int : SS4AIi8<opcss, MRMSrcReg, 5507 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5508 !if(Is2Addr, 5509 !strconcat(OpcodeStr, 5510 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5511 !strconcat(OpcodeStr, 5512 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5513 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, 5514 Sched<[sched]>; 5515 5516 def SSm_Int : SS4AIi8<opcss, MRMSrcMem, 5517 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), 5518 !if(Is2Addr, 5519 !strconcat(OpcodeStr, 5520 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5521 !strconcat(OpcodeStr, 5522 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5523 [(set VR128:$dst, 5524 (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, 5525 Sched<[sched.Folded, ReadAfterLd]>; 5526 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 5527 5528 let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { 5529 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, 5530 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5531 !if(Is2Addr, 5532 !strconcat(OpcodeStr, 5533 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5534 !strconcat(OpcodeStr, 5535 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5536 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, 5537 Sched<[sched]>; 5538 5539 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, 5540 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), 5541 !if(Is2Addr, 5542 !strconcat(OpcodeStr, 5543 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5544 !strconcat(OpcodeStr, 5545 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5546 [(set VR128:$dst, 5547 (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, 5548 Sched<[sched.Folded, ReadAfterLd]>; 5549 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 5550 } 5551 5552 // FP round - roundss, roundps, roundsd, roundpd 5553 let Predicates = [HasAVX, NoVLX] in { 5554 let ExeDomain = SSEPackedSingle in { 5555 // Intrinsic form 5556 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, 5557 loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>, 5558 VEX, VEX_WIG; 5559 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, 5560 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>, 5561 VEX, VEX_L, VEX_WIG; 5562 } 5563 5564 let ExeDomain = SSEPackedDouble in { 5565 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, 5566 loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>, 5567 VEX, VEX_WIG; 5568 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, 5569 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>, 5570 VEX, VEX_L, VEX_WIG; 5571 } 5572 } 5573 let Predicates = [HasAVX, NoAVX512] in { 5574 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, 5575 v4f32, v2f64, X86RndScales, 0>, 5576 VEX_4V, VEX_LIG, VEX_WIG; 5577 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, 5578 VEX_4V, VEX_LIG, VEX_WIG; 5579 } 5580 5581 let Predicates = [UseAVX] in { 5582 def : Pat<(ffloor FR32:$src), 5583 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; 5584 def : Pat<(f32 (fnearbyint FR32:$src)), 5585 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; 5586 def : Pat<(f32 (fceil FR32:$src)), 5587 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; 5588 def : Pat<(f32 (frint FR32:$src)), 5589 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; 5590 def : Pat<(f32 (ftrunc FR32:$src)), 5591 (VROUNDSSr (f32 
(IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; 5592 5593 def : Pat<(f64 (ffloor FR64:$src)), 5594 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; 5595 def : Pat<(f64 (fnearbyint FR64:$src)), 5596 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; 5597 def : Pat<(f64 (fceil FR64:$src)), 5598 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; 5599 def : Pat<(f64 (frint FR64:$src)), 5600 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; 5601 def : Pat<(f64 (ftrunc FR64:$src)), 5602 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; 5603 } 5604 5605 let Predicates = [UseAVX, OptForSize] in { 5606 def : Pat<(ffloor (loadf32 addr:$src)), 5607 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; 5608 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), 5609 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; 5610 def : Pat<(f32 (fceil (loadf32 addr:$src))), 5611 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; 5612 def : Pat<(f32 (frint (loadf32 addr:$src))), 5613 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; 5614 def : Pat<(f32 (ftrunc (loadf32 addr:$src))), 5615 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; 5616 5617 def : Pat<(f64 (ffloor (loadf64 addr:$src))), 5618 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; 5619 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), 5620 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; 5621 def : Pat<(f64 (fceil (loadf64 addr:$src))), 5622 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; 5623 def : Pat<(f64 (frint (loadf64 addr:$src))), 5624 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; 5625 def : Pat<(f64 (ftrunc (loadf64 addr:$src))), 5626 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; 5627 } 5628 5629 let Predicates = [HasAVX, NoVLX] in { 5630 def : Pat<(v4f32 (ffloor VR128:$src)), 5631 (VROUNDPSr VR128:$src, (i32 0x9))>; 5632 def : Pat<(v4f32 (fnearbyint VR128:$src)), 5633 (VROUNDPSr VR128:$src, (i32 0xC))>; 5634 def : Pat<(v4f32 (fceil VR128:$src)), 5635 (VROUNDPSr VR128:$src, (i32 0xA))>; 5636 def : Pat<(v4f32 (frint VR128:$src)), 5637 (VROUNDPSr VR128:$src, (i32 0x4))>; 5638 def : Pat<(v4f32 (ftrunc VR128:$src)), 5639 (VROUNDPSr VR128:$src, (i32 0xB))>; 5640 5641 def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))), 5642 (VROUNDPSm addr:$src, (i32 0x9))>; 5643 def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))), 5644 (VROUNDPSm addr:$src, (i32 0xC))>; 5645 def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))), 5646 (VROUNDPSm addr:$src, (i32 0xA))>; 5647 def : Pat<(v4f32 (frint (loadv4f32 addr:$src))), 5648 (VROUNDPSm addr:$src, (i32 0x4))>; 5649 def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))), 5650 (VROUNDPSm addr:$src, (i32 0xB))>; 5651 5652 def : Pat<(v2f64 (ffloor VR128:$src)), 5653 (VROUNDPDr VR128:$src, (i32 0x9))>; 5654 def : Pat<(v2f64 (fnearbyint VR128:$src)), 5655 (VROUNDPDr VR128:$src, (i32 0xC))>; 5656 def : Pat<(v2f64 (fceil VR128:$src)), 5657 (VROUNDPDr VR128:$src, (i32 0xA))>; 5658 def : Pat<(v2f64 (frint VR128:$src)), 5659 (VROUNDPDr VR128:$src, (i32 0x4))>; 5660 def : Pat<(v2f64 (ftrunc VR128:$src)), 5661 (VROUNDPDr VR128:$src, (i32 0xB))>; 5662 5663 def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))), 5664 (VROUNDPDm addr:$src, (i32 0x9))>; 5665 def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))), 5666 (VROUNDPDm addr:$src, (i32 0xC))>; 5667 def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))), 5668 (VROUNDPDm addr:$src, (i32 0xA))>; 5669 def : Pat<(v2f64 (frint (loadv2f64 addr:$src))), 5670 (VROUNDPDm addr:$src, (i32 0x4))>; 5671 def : 
Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))), 5672 (VROUNDPDm addr:$src, (i32 0xB))>; 5673 5674 def : Pat<(v8f32 (ffloor VR256:$src)), 5675 (VROUNDPSYr VR256:$src, (i32 0x9))>; 5676 def : Pat<(v8f32 (fnearbyint VR256:$src)), 5677 (VROUNDPSYr VR256:$src, (i32 0xC))>; 5678 def : Pat<(v8f32 (fceil VR256:$src)), 5679 (VROUNDPSYr VR256:$src, (i32 0xA))>; 5680 def : Pat<(v8f32 (frint VR256:$src)), 5681 (VROUNDPSYr VR256:$src, (i32 0x4))>; 5682 def : Pat<(v8f32 (ftrunc VR256:$src)), 5683 (VROUNDPSYr VR256:$src, (i32 0xB))>; 5684 5685 def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))), 5686 (VROUNDPSYm addr:$src, (i32 0x9))>; 5687 def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))), 5688 (VROUNDPSYm addr:$src, (i32 0xC))>; 5689 def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))), 5690 (VROUNDPSYm addr:$src, (i32 0xA))>; 5691 def : Pat<(v8f32 (frint (loadv8f32 addr:$src))), 5692 (VROUNDPSYm addr:$src, (i32 0x4))>; 5693 def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))), 5694 (VROUNDPSYm addr:$src, (i32 0xB))>; 5695 5696 def : Pat<(v4f64 (ffloor VR256:$src)), 5697 (VROUNDPDYr VR256:$src, (i32 0x9))>; 5698 def : Pat<(v4f64 (fnearbyint VR256:$src)), 5699 (VROUNDPDYr VR256:$src, (i32 0xC))>; 5700 def : Pat<(v4f64 (fceil VR256:$src)), 5701 (VROUNDPDYr VR256:$src, (i32 0xA))>; 5702 def : Pat<(v4f64 (frint VR256:$src)), 5703 (VROUNDPDYr VR256:$src, (i32 0x4))>; 5704 def : Pat<(v4f64 (ftrunc VR256:$src)), 5705 (VROUNDPDYr VR256:$src, (i32 0xB))>; 5706 5707 def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))), 5708 (VROUNDPDYm addr:$src, (i32 0x9))>; 5709 def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))), 5710 (VROUNDPDYm addr:$src, (i32 0xC))>; 5711 def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))), 5712 (VROUNDPDYm addr:$src, (i32 0xA))>; 5713 def : Pat<(v4f64 (frint (loadv4f64 addr:$src))), 5714 (VROUNDPDYm addr:$src, (i32 0x4))>; 5715 def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))), 5716 (VROUNDPDYm addr:$src, (i32 0xB))>; 5717 } 5718 5719 let ExeDomain = SSEPackedSingle in 5720 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, 5721 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>; 5722 let ExeDomain = SSEPackedDouble in 5723 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, 5724 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>; 5725 5726 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; 5727 5728 let Constraints = "$src1 = $dst" in 5729 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, 5730 v4f32, v2f64, X86RndScales>; 5731 5732 let Predicates = [UseSSE41] in { 5733 def : Pat<(ffloor FR32:$src), 5734 (ROUNDSSr FR32:$src, (i32 0x9))>; 5735 def : Pat<(f32 (fnearbyint FR32:$src)), 5736 (ROUNDSSr FR32:$src, (i32 0xC))>; 5737 def : Pat<(f32 (fceil FR32:$src)), 5738 (ROUNDSSr FR32:$src, (i32 0xA))>; 5739 def : Pat<(f32 (frint FR32:$src)), 5740 (ROUNDSSr FR32:$src, (i32 0x4))>; 5741 def : Pat<(f32 (ftrunc FR32:$src)), 5742 (ROUNDSSr FR32:$src, (i32 0xB))>; 5743 5744 def : Pat<(f64 (ffloor FR64:$src)), 5745 (ROUNDSDr FR64:$src, (i32 0x9))>; 5746 def : Pat<(f64 (fnearbyint FR64:$src)), 5747 (ROUNDSDr FR64:$src, (i32 0xC))>; 5748 def : Pat<(f64 (fceil FR64:$src)), 5749 (ROUNDSDr FR64:$src, (i32 0xA))>; 5750 def : Pat<(f64 (frint FR64:$src)), 5751 (ROUNDSDr FR64:$src, (i32 0x4))>; 5752 def : Pat<(f64 (ftrunc FR64:$src)), 5753 (ROUNDSDr FR64:$src, (i32 0xB))>; 5754 } 5755 5756 let Predicates = [UseSSE41, OptForSize] in { 5757 def : Pat<(ffloor (loadf32 addr:$src)), 5758 (ROUNDSSm addr:$src, (i32 0x9))>; 5759 def : Pat<(f32 (fnearbyint (loadf32 
addr:$src))), 5760 (ROUNDSSm addr:$src, (i32 0xC))>; 5761 def : Pat<(f32 (fceil (loadf32 addr:$src))), 5762 (ROUNDSSm addr:$src, (i32 0xA))>; 5763 def : Pat<(f32 (frint (loadf32 addr:$src))), 5764 (ROUNDSSm addr:$src, (i32 0x4))>; 5765 def : Pat<(f32 (ftrunc (loadf32 addr:$src))), 5766 (ROUNDSSm addr:$src, (i32 0xB))>; 5767 5768 def : Pat<(f64 (ffloor (loadf64 addr:$src))), 5769 (ROUNDSDm addr:$src, (i32 0x9))>; 5770 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), 5771 (ROUNDSDm addr:$src, (i32 0xC))>; 5772 def : Pat<(f64 (fceil (loadf64 addr:$src))), 5773 (ROUNDSDm addr:$src, (i32 0xA))>; 5774 def : Pat<(f64 (frint (loadf64 addr:$src))), 5775 (ROUNDSDm addr:$src, (i32 0x4))>; 5776 def : Pat<(f64 (ftrunc (loadf64 addr:$src))), 5777 (ROUNDSDm addr:$src, (i32 0xB))>; 5778 } 5779 5780 let Predicates = [UseSSE41] in { 5781 def : Pat<(v4f32 (ffloor VR128:$src)), 5782 (ROUNDPSr VR128:$src, (i32 0x9))>; 5783 def : Pat<(v4f32 (fnearbyint VR128:$src)), 5784 (ROUNDPSr VR128:$src, (i32 0xC))>; 5785 def : Pat<(v4f32 (fceil VR128:$src)), 5786 (ROUNDPSr VR128:$src, (i32 0xA))>; 5787 def : Pat<(v4f32 (frint VR128:$src)), 5788 (ROUNDPSr VR128:$src, (i32 0x4))>; 5789 def : Pat<(v4f32 (ftrunc VR128:$src)), 5790 (ROUNDPSr VR128:$src, (i32 0xB))>; 5791 5792 def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))), 5793 (ROUNDPSm addr:$src, (i32 0x9))>; 5794 def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))), 5795 (ROUNDPSm addr:$src, (i32 0xC))>; 5796 def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))), 5797 (ROUNDPSm addr:$src, (i32 0xA))>; 5798 def : Pat<(v4f32 (frint (memopv4f32 addr:$src))), 5799 (ROUNDPSm addr:$src, (i32 0x4))>; 5800 def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))), 5801 (ROUNDPSm addr:$src, (i32 0xB))>; 5802 5803 def : Pat<(v2f64 (ffloor VR128:$src)), 5804 (ROUNDPDr VR128:$src, (i32 0x9))>; 5805 def : Pat<(v2f64 (fnearbyint VR128:$src)), 5806 (ROUNDPDr VR128:$src, (i32 0xC))>; 5807 def : Pat<(v2f64 (fceil VR128:$src)), 5808 (ROUNDPDr VR128:$src, (i32 0xA))>; 5809 def : Pat<(v2f64 (frint VR128:$src)), 5810 (ROUNDPDr VR128:$src, (i32 0x4))>; 5811 def : Pat<(v2f64 (ftrunc VR128:$src)), 5812 (ROUNDPDr VR128:$src, (i32 0xB))>; 5813 5814 def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))), 5815 (ROUNDPDm addr:$src, (i32 0x9))>; 5816 def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))), 5817 (ROUNDPDm addr:$src, (i32 0xC))>; 5818 def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))), 5819 (ROUNDPDm addr:$src, (i32 0xA))>; 5820 def : Pat<(v2f64 (frint (memopv2f64 addr:$src))), 5821 (ROUNDPDm addr:$src, (i32 0x4))>; 5822 def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))), 5823 (ROUNDPDm addr:$src, (i32 0xB))>; 5824 } 5825 5826 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss, 5827 v4f32, 0x01, UseSSE41>; 5828 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss, 5829 v4f32, 0x02, UseSSE41>; 5830 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd, 5831 v2f64, 0x01, UseSSE41>; 5832 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd, 5833 v2f64, 0x02, UseSSE41>; 5834 5835 //===----------------------------------------------------------------------===// 5836 // SSE4.1 - Packed Bit Test 5837 //===----------------------------------------------------------------------===// 5838 5839 // ptest instruction we'll lower to this in X86ISelLowering primarily from 5840 // the intel intrinsic that corresponds to this. 
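// PTEST sets ZF when (SRC2 AND SRC1) is all zeros and CF when
// (SRC2 AND NOT SRC1) is all zeros, clearing the remaining arithmetic flags,
// so the only output modeled below is EFLAGS.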
5841 let Defs = [EFLAGS], Predicates = [HasAVX] in { 5842 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5843 "vptest\t{$src2, $src1|$src1, $src2}", 5844 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5845 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG; 5846 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5847 "vptest\t{$src2, $src1|$src1, $src2}", 5848 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, 5849 Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>, 5850 VEX, VEX_WIG; 5851 5852 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), 5853 "vptest\t{$src2, $src1|$src1, $src2}", 5854 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, 5855 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG; 5856 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), 5857 "vptest\t{$src2, $src1|$src1, $src2}", 5858 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, 5859 Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>, 5860 VEX, VEX_L, VEX_WIG; 5861 } 5862 5863 let Defs = [EFLAGS] in { 5864 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5865 "ptest\t{$src2, $src1|$src1, $src2}", 5866 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5867 Sched<[SchedWriteVecTest.XMM]>; 5868 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5869 "ptest\t{$src2, $src1|$src1, $src2}", 5870 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, 5871 Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>; 5872 } 5873 5874 // The bit test instructions below are AVX only 5875 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, 5876 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt, 5877 X86FoldableSchedWrite sched> { 5878 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 5879 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5880 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, 5881 Sched<[sched]>, VEX; 5882 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 5883 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5884 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, 5885 Sched<[sched.Folded, ReadAfterLd]>, VEX; 5886 } 5887 5888 let Defs = [EFLAGS], Predicates = [HasAVX] in { 5889 let ExeDomain = SSEPackedSingle in { 5890 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32, 5891 SchedWriteFTest.XMM>; 5892 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32, 5893 SchedWriteFTest.YMM>, VEX_L; 5894 } 5895 let ExeDomain = SSEPackedDouble in { 5896 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64, 5897 SchedWriteFTest.XMM>; 5898 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64, 5899 SchedWriteFTest.YMM>, VEX_L; 5900 } 5901 } 5902 5903 //===----------------------------------------------------------------------===// 5904 // SSE4.1 - Misc Instructions 5905 //===----------------------------------------------------------------------===// 5906 5907 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { 5908 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), 5909 "popcnt{w}\t{$src, $dst|$dst, $src}", 5910 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, 5911 Sched<[WritePOPCNT]>, OpSize16, XS; 5912 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 5913 
"popcnt{w}\t{$src, $dst|$dst, $src}", 5914 [(set GR16:$dst, (ctpop (loadi16 addr:$src))), 5915 (implicit EFLAGS)]>, 5916 Sched<[WritePOPCNT.Folded]>, OpSize16, XS; 5917 5918 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), 5919 "popcnt{l}\t{$src, $dst|$dst, $src}", 5920 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, 5921 Sched<[WritePOPCNT]>, OpSize32, XS; 5922 5923 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 5924 "popcnt{l}\t{$src, $dst|$dst, $src}", 5925 [(set GR32:$dst, (ctpop (loadi32 addr:$src))), 5926 (implicit EFLAGS)]>, 5927 Sched<[WritePOPCNT.Folded]>, OpSize32, XS; 5928 5929 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), 5930 "popcnt{q}\t{$src, $dst|$dst, $src}", 5931 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, 5932 Sched<[WritePOPCNT]>, XS; 5933 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), 5934 "popcnt{q}\t{$src, $dst|$dst, $src}", 5935 [(set GR64:$dst, (ctpop (loadi64 addr:$src))), 5936 (implicit EFLAGS)]>, 5937 Sched<[WritePOPCNT.Folded]>, XS; 5938 } 5939 5940 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 5941 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, 5942 SDNode OpNode, PatFrag ld_frag, 5943 X86FoldableSchedWrite Sched> { 5944 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 5945 (ins VR128:$src), 5946 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5947 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, 5948 Sched<[Sched]>; 5949 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 5950 (ins i128mem:$src), 5951 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5952 [(set VR128:$dst, 5953 (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, 5954 Sched<[Sched.Folded]>; 5955 } 5956 5957 // PHMIN has the same profile as PSAD, thus we use the same scheduling 5958 // model, although the naming is misleading. 5959 let Predicates = [HasAVX] in 5960 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", 5961 X86phminpos, loadv2i64, 5962 WritePHMINPOS>, VEX, VEX_WIG; 5963 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", 5964 X86phminpos, memopv2i64, 5965 WritePHMINPOS>; 5966 5967 /// SS48I_binop_rm - Simple SSE41 binary operator. 
5968 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 5969 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5970 X86MemOperand x86memop, X86FoldableSchedWrite sched, 5971 bit Is2Addr = 1> { 5972 let isCommutable = 1 in 5973 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), 5974 (ins RC:$src1, RC:$src2), 5975 !if(Is2Addr, 5976 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5977 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5978 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 5979 Sched<[sched]>; 5980 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), 5981 (ins RC:$src1, x86memop:$src2), 5982 !if(Is2Addr, 5983 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5984 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5985 [(set RC:$dst, 5986 (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, 5987 Sched<[sched.Folded, ReadAfterLd]>; 5988 } 5989 5990 let Predicates = [HasAVX, NoVLX] in { 5991 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, 5992 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 5993 VEX_4V, VEX_WIG; 5994 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, 5995 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 5996 VEX_4V, VEX_WIG; 5997 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, 5998 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 5999 VEX_4V, VEX_WIG; 6000 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, 6001 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6002 VEX_4V, VEX_WIG; 6003 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, 6004 loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, 6005 VEX_4V, VEX_WIG; 6006 } 6007 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 6008 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, 6009 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6010 VEX_4V, VEX_WIG; 6011 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, 6012 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6013 VEX_4V, VEX_WIG; 6014 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, 6015 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6016 VEX_4V, VEX_WIG; 6017 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, 6018 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6019 VEX_4V, VEX_WIG; 6020 } 6021 6022 let Predicates = [HasAVX2, NoVLX] in { 6023 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, 6024 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6025 VEX_4V, VEX_L, VEX_WIG; 6026 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, 6027 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6028 VEX_4V, VEX_L, VEX_WIG; 6029 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, 6030 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6031 VEX_4V, VEX_L, VEX_WIG; 6032 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, 6033 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6034 VEX_4V, VEX_L, VEX_WIG; 6035 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, 6036 loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>, 6037 VEX_4V, VEX_L, VEX_WIG; 6038 } 6039 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 6040 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, 6041 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6042 VEX_4V, VEX_L, VEX_WIG; 6043 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, 6044 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6045 VEX_4V, VEX_L, VEX_WIG; 
6046 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, 6047 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6048 VEX_4V, VEX_L, VEX_WIG; 6049 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, 6050 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6051 VEX_4V, VEX_L, VEX_WIG; 6052 } 6053 6054 let Constraints = "$src1 = $dst" in { 6055 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, 6056 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6057 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, 6058 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6059 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, 6060 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6061 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, 6062 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6063 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, 6064 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6065 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, 6066 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6067 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, 6068 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6069 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, 6070 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6071 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, 6072 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>; 6073 } 6074 6075 let Predicates = [HasAVX, NoVLX] in 6076 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, 6077 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>, 6078 VEX_4V, VEX_WIG; 6079 let Predicates = [HasAVX] in 6080 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, 6081 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6082 VEX_4V, VEX_WIG; 6083 6084 let Predicates = [HasAVX2, NoVLX] in 6085 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, 6086 loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>, 6087 VEX_4V, VEX_L, VEX_WIG; 6088 let Predicates = [HasAVX2] in 6089 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, 6090 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6091 VEX_4V, VEX_L, VEX_WIG; 6092 6093 let Constraints = "$src1 = $dst" in { 6094 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, 6095 memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>; 6096 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, 6097 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; 6098 } 6099 6100 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate 6101 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, 6102 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 6103 X86MemOperand x86memop, bit Is2Addr, 6104 X86FoldableSchedWrite sched> { 6105 let isCommutable = 1 in 6106 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6107 (ins RC:$src1, RC:$src2, u8imm:$src3), 6108 !if(Is2Addr, 6109 !strconcat(OpcodeStr, 6110 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6111 !strconcat(OpcodeStr, 6112 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6113 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, 6114 Sched<[sched]>; 6115 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6116 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6117 !if(Is2Addr, 6118 !strconcat(OpcodeStr, 6119 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6120 !strconcat(OpcodeStr, 6121 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6122 [(set RC:$dst, 6123 
(IntId RC:$src1, 6124 (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, 6125 Sched<[sched.Folded, ReadAfterLd]>; 6126 } 6127 6128 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate 6129 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 6130 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6131 X86MemOperand x86memop, bit Is2Addr, 6132 X86FoldableSchedWrite sched> { 6133 let isCommutable = 1 in 6134 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6135 (ins RC:$src1, RC:$src2, u8imm:$src3), 6136 !if(Is2Addr, 6137 !strconcat(OpcodeStr, 6138 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6139 !strconcat(OpcodeStr, 6140 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6141 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 6142 Sched<[sched]>; 6143 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6144 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6145 !if(Is2Addr, 6146 !strconcat(OpcodeStr, 6147 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6148 !strconcat(OpcodeStr, 6149 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6150 [(set RC:$dst, 6151 (OpVT (OpNode RC:$src1, 6152 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, 6153 Sched<[sched.Folded, ReadAfterLd]>; 6154 } 6155 6156 def BlendCommuteImm2 : SDNodeXForm<imm, [{ 6157 uint8_t Imm = N->getZExtValue() & 0x03; 6158 return getI8Imm(Imm ^ 0x03, SDLoc(N)); 6159 }]>; 6160 6161 def BlendCommuteImm4 : SDNodeXForm<imm, [{ 6162 uint8_t Imm = N->getZExtValue() & 0x0f; 6163 return getI8Imm(Imm ^ 0x0f, SDLoc(N)); 6164 }]>; 6165 6166 def BlendCommuteImm8 : SDNodeXForm<imm, [{ 6167 uint8_t Imm = N->getZExtValue() & 0xff; 6168 return getI8Imm(Imm ^ 0xff, SDLoc(N)); 6169 }]>; 6170 6171 let Predicates = [HasAVX] in { 6172 let isCommutable = 0 in { 6173 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, 6174 VR128, loadv2i64, i128mem, 0, 6175 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; 6176 } 6177 6178 let ExeDomain = SSEPackedSingle in 6179 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, 6180 VR128, loadv4f32, f128mem, 0, 6181 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; 6182 let ExeDomain = SSEPackedDouble in 6183 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, 6184 VR128, loadv2f64, f128mem, 0, 6185 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; 6186 let ExeDomain = SSEPackedSingle in 6187 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, 6188 VR256, loadv8f32, i256mem, 0, 6189 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; 6190 } 6191 6192 let Predicates = [HasAVX2] in { 6193 let isCommutable = 0 in { 6194 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, 6195 VR256, loadv4i64, i256mem, 0, 6196 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; 6197 } 6198 } 6199 6200 let Constraints = "$src1 = $dst" in { 6201 let isCommutable = 0 in { 6202 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, 6203 VR128, memopv2i64, i128mem, 1, 6204 SchedWriteMPSAD.XMM>; 6205 } 6206 6207 let ExeDomain = SSEPackedSingle in 6208 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, 6209 VR128, memopv4f32, f128mem, 1, 6210 SchedWriteDPPS.XMM>; 6211 let ExeDomain = SSEPackedDouble in 6212 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, 6213 VR128, memopv2f64, f128mem, 1, 6214 SchedWriteDPPD.XMM>; 6215 } 6216 6217 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate 6218 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 
6219 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6220 X86MemOperand x86memop, bit Is2Addr, Domain d, 6221 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> { 6222 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { 6223 let isCommutable = 1 in 6224 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6225 (ins RC:$src1, RC:$src2, u8imm:$src3), 6226 !if(Is2Addr, 6227 !strconcat(OpcodeStr, 6228 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6229 !strconcat(OpcodeStr, 6230 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6231 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 6232 Sched<[sched]>; 6233 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6234 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6235 !if(Is2Addr, 6236 !strconcat(OpcodeStr, 6237 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6238 !strconcat(OpcodeStr, 6239 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6240 [(set RC:$dst, 6241 (OpVT (OpNode RC:$src1, 6242 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, 6243 Sched<[sched.Folded, ReadAfterLd]>; 6244 } 6245 6246 // Pattern to commute if load is in first source. 6247 def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), 6248 RC:$src1, imm:$src3)), 6249 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 6250 (commuteXForm imm:$src3))>; 6251 } 6252 6253 let Predicates = [HasAVX] in { 6254 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, 6255 VR128, loadv4f32, f128mem, 0, SSEPackedSingle, 6256 SchedWriteFBlend.XMM, BlendCommuteImm4>, 6257 VEX_4V, VEX_WIG; 6258 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, 6259 VR256, loadv8f32, f256mem, 0, SSEPackedSingle, 6260 SchedWriteFBlend.YMM, BlendCommuteImm8>, 6261 VEX_4V, VEX_L, VEX_WIG; 6262 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, 6263 VR128, loadv2f64, f128mem, 0, SSEPackedDouble, 6264 SchedWriteFBlend.XMM, BlendCommuteImm2>, 6265 VEX_4V, VEX_WIG; 6266 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, 6267 VR256, loadv4f64, f256mem, 0, SSEPackedDouble, 6268 SchedWriteFBlend.YMM, BlendCommuteImm4>, 6269 VEX_4V, VEX_L, VEX_WIG; 6270 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, 6271 VR128, loadv2i64, i128mem, 0, SSEPackedInt, 6272 SchedWriteBlend.XMM, BlendCommuteImm8>, 6273 VEX_4V, VEX_WIG; 6274 } 6275 6276 let Predicates = [HasAVX2] in { 6277 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, 6278 VR256, loadv4i64, i256mem, 0, SSEPackedInt, 6279 SchedWriteBlend.YMM, BlendCommuteImm8>, 6280 VEX_4V, VEX_L, VEX_WIG; 6281 } 6282 6283 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, 6284 VR128, memopv4f32, f128mem, 1, SSEPackedSingle, 6285 SchedWriteFBlend.XMM, BlendCommuteImm4>; 6286 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, 6287 VR128, memopv2f64, f128mem, 1, SSEPackedDouble, 6288 SchedWriteFBlend.XMM, BlendCommuteImm2>; 6289 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, 6290 VR128, memopv2i64, i128mem, 1, SSEPackedInt, 6291 SchedWriteBlend.XMM, BlendCommuteImm8>; 6292 6293 // For insertion into the zero index (low half) of a 256-bit vector, it is 6294 // more efficient to generate a blend with immediate instead of an insert*128. 
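// The xmm value is placed in the low lane of an implicit-def ymm and blended
// in with an immediate that selects only the low elements from it (0x3 for
// v4f64, 0xf for v8f32), so the upper lane still comes from $src1.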
6295 let Predicates = [HasAVX] in { 6296 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)), 6297 (VBLENDPDYrri VR256:$src1, 6298 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 6299 VR128:$src2, sub_xmm), 0x3)>; 6300 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), 6301 (VBLENDPSYrri VR256:$src1, 6302 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 6303 VR128:$src2, sub_xmm), 0xf)>; 6304 } 6305 6306 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators 6307 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, 6308 RegisterClass RC, X86MemOperand x86memop, 6309 PatFrag mem_frag, Intrinsic IntId, 6310 X86FoldableSchedWrite sched> { 6311 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst), 6312 (ins RC:$src1, RC:$src2, RC:$src3), 6313 !strconcat(OpcodeStr, 6314 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 6315 [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], 6316 SSEPackedInt>, TAPD, VEX_4V, 6317 Sched<[sched]>; 6318 6319 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), 6320 (ins RC:$src1, x86memop:$src2, RC:$src3), 6321 !strconcat(OpcodeStr, 6322 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 6323 [(set RC:$dst, 6324 (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), 6325 RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, 6326 Sched<[sched.Folded, ReadAfterLd, 6327 // x86memop:$src2 6328 ReadDefault, ReadDefault, ReadDefault, ReadDefault, 6329 ReadDefault, 6330 // RC::$src3 6331 ReadAfterLd]>; 6332 } 6333 6334 let Predicates = [HasAVX] in { 6335 let ExeDomain = SSEPackedDouble in { 6336 defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, 6337 loadv2f64, int_x86_sse41_blendvpd, 6338 SchedWriteFVarBlend.XMM>; 6339 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, 6340 loadv4f64, int_x86_avx_blendv_pd_256, 6341 SchedWriteFVarBlend.YMM>, VEX_L; 6342 } // ExeDomain = SSEPackedDouble 6343 let ExeDomain = SSEPackedSingle in { 6344 defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, 6345 loadv4f32, int_x86_sse41_blendvps, 6346 SchedWriteFVarBlend.XMM>; 6347 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, 6348 loadv8f32, int_x86_avx_blendv_ps_256, 6349 SchedWriteFVarBlend.YMM>, VEX_L; 6350 } // ExeDomain = SSEPackedSingle 6351 defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, 6352 loadv2i64, int_x86_sse41_pblendvb, 6353 SchedWriteVarBlend.XMM>; 6354 } 6355 6356 let Predicates = [HasAVX2] in { 6357 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, 6358 loadv4i64, int_x86_avx2_pblendvb, 6359 SchedWriteVarBlend.YMM>, VEX_L; 6360 } 6361 6362 let Predicates = [HasAVX] in { 6363 def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), 6364 (v16i8 VR128:$src2))), 6365 (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6366 def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), 6367 (v4i32 VR128:$src2))), 6368 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6369 def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), 6370 (v4f32 VR128:$src2))), 6371 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6372 def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), 6373 (v2i64 VR128:$src2))), 6374 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6375 def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), 6376 (v2f64 VR128:$src2))), 6377 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6378 
def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), 6379 (v8i32 VR256:$src2))), 6380 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6381 def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), 6382 (v8f32 VR256:$src2))), 6383 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6384 def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), 6385 (v4i64 VR256:$src2))), 6386 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6387 def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), 6388 (v4f64 VR256:$src2))), 6389 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6390 } 6391 6392 let Predicates = [HasAVX2] in { 6393 def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), 6394 (v32i8 VR256:$src2))), 6395 (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 6396 } 6397 6398 // Prefer a movss or movsd over a blendps when optimizing for size. these were 6399 // changed to use blends because blends have better throughput on sandybridge 6400 // and haswell, but movs[s/d] are 1-2 byte shorter instructions. 6401 let Predicates = [HasAVX, OptForSpeed] in { 6402 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6403 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6404 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6405 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6406 6407 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6408 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6409 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))), 6410 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6411 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)), 6412 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6413 6414 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6415 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6416 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))), 6417 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6418 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)), 6419 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6420 6421 // Move low f32 and clear high bits. 6422 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), 6423 (SUBREG_TO_REG (i32 0), 6424 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), 6425 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), 6426 (i8 1))), sub_xmm)>; 6427 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), 6428 (SUBREG_TO_REG (i32 0), 6429 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), 6430 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), 6431 (i8 3))), sub_xmm)>; 6432 6433 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), 6434 (SUBREG_TO_REG (i32 0), 6435 (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), 6436 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), 6437 (i8 1))), sub_xmm)>; 6438 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), 6439 (SUBREG_TO_REG (i32 0), 6440 (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), 6441 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), 6442 (i8 0xf))), sub_xmm)>; 6443 } 6444 6445 // Prefer a movss or movsd over a blendps when optimizing for size. these were 6446 // changed to use blends because blends have better throughput on sandybridge 6447 // and haswell, but movs[s/d] are 1-2 byte shorter instructions. 6448 let Predicates = [UseSSE41, OptForSpeed] in { 6449 // With SSE41 we can use blends for these patterns. 
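// The blend immediate selects element i from the second operand when bit i
// is set: (i8 1) copies just the low element (the movss/movsd behavior),
// while (i8 0xe) and (i8 2) handle the commuted forms where the low element
// stays in the register and the remaining elements come from memory.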
6450 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6451 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6452 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6453 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6454 6455 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6456 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6457 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), 6458 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6459 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), 6460 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6461 6462 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6463 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6464 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), 6465 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6466 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), 6467 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6468 } 6469 6470 6471 /// SS41I_ternary_int - SSE 4.1 ternary operator 6472 let Uses = [XMM0], Constraints = "$src1 = $dst" in { 6473 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 6474 X86MemOperand x86memop, Intrinsic IntId, 6475 X86FoldableSchedWrite sched> { 6476 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 6477 (ins VR128:$src1, VR128:$src2), 6478 !strconcat(OpcodeStr, 6479 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6480 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, 6481 Sched<[sched]>; 6482 6483 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 6484 (ins VR128:$src1, x86memop:$src2), 6485 !strconcat(OpcodeStr, 6486 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6487 [(set VR128:$dst, 6488 (IntId VR128:$src1, 6489 (bitconvert (mem_frag addr:$src2)), XMM0))]>, 6490 Sched<[sched.Folded, ReadAfterLd]>; 6491 } 6492 } 6493 6494 let ExeDomain = SSEPackedDouble in 6495 defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, 6496 int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; 6497 let ExeDomain = SSEPackedSingle in 6498 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, 6499 int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; 6500 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, 6501 int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; 6502 6503 // Aliases with the implicit xmm0 argument 6504 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6505 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; 6506 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6507 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; 6508 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6509 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; 6510 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6511 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; 6512 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6513 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; 6514 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6515 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; 6516 6517 let Predicates = [UseSSE41] in { 6518 def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), 6519 (v16i8 VR128:$src2))), 6520 (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; 6521 def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), 6522 (v4i32 VR128:$src2))), 6523 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6524 def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), 6525 (v4f32 VR128:$src2))), 6526 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6527 def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 
VR128:$src1), 6528 (v2i64 VR128:$src2))), 6529 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6530 def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), 6531 (v2f64 VR128:$src2))), 6532 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6533 } 6534 6535 let AddedComplexity = 400 in { // Prefer non-temporal versions 6536 6537 let Predicates = [HasAVX, NoVLX] in 6538 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6539 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6540 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6541 let Predicates = [HasAVX2, NoVLX] in 6542 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6543 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6544 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6545 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6546 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6547 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6548 6549 let Predicates = [HasAVX2, NoVLX] in { 6550 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6551 (VMOVNTDQAYrm addr:$src)>; 6552 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6553 (VMOVNTDQAYrm addr:$src)>; 6554 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6555 (VMOVNTDQAYrm addr:$src)>; 6556 } 6557 6558 let Predicates = [HasAVX, NoVLX] in { 6559 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6560 (VMOVNTDQArm addr:$src)>; 6561 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6562 (VMOVNTDQArm addr:$src)>; 6563 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6564 (VMOVNTDQArm addr:$src)>; 6565 } 6566 6567 let Predicates = [UseSSE41] in { 6568 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6569 (MOVNTDQArm addr:$src)>; 6570 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6571 (MOVNTDQArm addr:$src)>; 6572 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6573 (MOVNTDQArm addr:$src)>; 6574 } 6575 6576 } // AddedComplexity 6577 6578 //===----------------------------------------------------------------------===// 6579 // SSE4.2 - Compare Instructions 6580 //===----------------------------------------------------------------------===// 6581 6582 /// SS42I_binop_rm - Simple SSE 4.2 binary operator 6583 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6584 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6585 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6586 bit Is2Addr = 1> { 6587 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6588 (ins RC:$src1, RC:$src2), 6589 !if(Is2Addr, 6590 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6591 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6592 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6593 Sched<[sched]>; 6594 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6595 (ins RC:$src1, x86memop:$src2), 6596 !if(Is2Addr, 6597 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6598 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6599 [(set RC:$dst, 6600 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 6601 Sched<[sched.Folded, ReadAfterLd]>; 6602 } 6603 6604 let Predicates = [HasAVX] in 6605 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, 6606 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, 6607 VEX_4V, VEX_WIG; 6608 6609 let Predicates = [HasAVX2] in 6610 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, 6611 loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, 6612 VEX_4V, VEX_L, VEX_WIG; 6613 6614 let 
Constraints = "$src1 = $dst" in 6615 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, 6616 memopv2i64, i128mem, SchedWriteVecALU.XMM>; 6617 6618 //===----------------------------------------------------------------------===// 6619 // SSE4.2 - String/text Processing Instructions 6620 //===----------------------------------------------------------------------===// 6621 6622 multiclass pcmpistrm_SS42AI<string asm> { 6623 def rr : SS42AI<0x62, MRMSrcReg, (outs), 6624 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6625 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6626 []>, Sched<[WritePCmpIStrM]>; 6627 let mayLoad = 1 in 6628 def rm :SS42AI<0x62, MRMSrcMem, (outs), 6629 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6630 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6631 []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>; 6632 } 6633 6634 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { 6635 let Predicates = [HasAVX] in 6636 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; 6637 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; 6638 } 6639 6640 multiclass SS42AI_pcmpestrm<string asm> { 6641 def rr : SS42AI<0x60, MRMSrcReg, (outs), 6642 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6643 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6644 []>, Sched<[WritePCmpEStrM]>; 6645 let mayLoad = 1 in 6646 def rm : SS42AI<0x60, MRMSrcMem, (outs), 6647 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6648 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6649 []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>; 6650 } 6651 6652 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6653 let Predicates = [HasAVX] in 6654 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; 6655 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; 6656 } 6657 6658 multiclass SS42AI_pcmpistri<string asm> { 6659 def rr : SS42AI<0x63, MRMSrcReg, (outs), 6660 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6661 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6662 []>, Sched<[WritePCmpIStrI]>; 6663 let mayLoad = 1 in 6664 def rm : SS42AI<0x63, MRMSrcMem, (outs), 6665 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6666 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6667 []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>; 6668 } 6669 6670 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { 6671 let Predicates = [HasAVX] in 6672 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; 6673 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; 6674 } 6675 6676 multiclass SS42AI_pcmpestri<string asm> { 6677 def rr : SS42AI<0x61, MRMSrcReg, (outs), 6678 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6679 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6680 []>, Sched<[WritePCmpEStrI]>; 6681 let mayLoad = 1 in 6682 def rm : SS42AI<0x61, MRMSrcMem, (outs), 6683 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6684 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6685 []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>; 6686 } 6687 6688 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6689 let Predicates = [HasAVX] in 6690 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; 6691 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; 6692 } 6693 6694 //===----------------------------------------------------------------------===// 6695 // SSE4.2 - CRC Instructions 6696 //===----------------------------------------------------------------------===// 6697 6698 // No CRC instructions have 
AVX equivalents 6699
6700 // CRC intrinsic instructions
6701 // These instructions have only rm forms; the only difference is the size
6702 // of r and m.
6703 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6704 RegisterClass RCIn, SDPatternOperator Int> :
6705 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6706 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6707 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6708 Sched<[WriteCRC32]>;
6709
6710 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6711 X86MemOperand x86memop, SDPatternOperator Int> :
6712 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6713 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6714 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6715 Sched<[WriteCRC32.Folded, ReadAfterLd]>;
6716
6717 let Constraints = "$src1 = $dst" in {
6718 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6719 int_x86_sse42_crc32_32_8>;
6720 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6721 int_x86_sse42_crc32_32_8>;
6722 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6723 int_x86_sse42_crc32_32_16>, OpSize16;
6724 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6725 int_x86_sse42_crc32_32_16>, OpSize16;
6726 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6727 int_x86_sse42_crc32_32_32>, OpSize32;
6728 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6729 int_x86_sse42_crc32_32_32>, OpSize32;
6730 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6731 int_x86_sse42_crc32_64_64>, REX_W;
6732 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6733 int_x86_sse42_crc32_64_64>, REX_W;
6734 let hasSideEffects = 0 in {
6735 let mayLoad = 1 in
6736 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6737 null_frag>, REX_W;
6738 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6739 null_frag>, REX_W;
6740 }
6741 }
6742
6743 //===----------------------------------------------------------------------===//
6744 // SHA-NI Instructions
6745 //===----------------------------------------------------------------------===//
6746
6747 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
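// SHAI_binop provides the register and memory forms of a two-operand SHA
// instruction; when UsesXMM0 is set (sha256rnds2), the asm string and the
// pattern also carry the implicit XMM0 operand.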
6748 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, 6749 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { 6750 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), 6751 (ins VR128:$src1, VR128:$src2), 6752 !if(UsesXMM0, 6753 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6754 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6755 [!if(UsesXMM0, 6756 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), 6757 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, 6758 T8, Sched<[sched]>; 6759 6760 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), 6761 (ins VR128:$src1, i128mem:$src2), 6762 !if(UsesXMM0, 6763 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6764 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6765 [!if(UsesXMM0, 6766 (set VR128:$dst, (IntId VR128:$src1, 6767 (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), 6768 (set VR128:$dst, (IntId VR128:$src1, 6769 (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8, 6770 Sched<[sched.Folded, ReadAfterLd]>; 6771 } 6772 6773 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { 6774 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), 6775 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6776 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6777 [(set VR128:$dst, 6778 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, 6779 (i8 imm:$src3)))]>, TA, 6780 Sched<[SchedWriteVecIMul.XMM]>; 6781 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), 6782 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6783 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6784 [(set VR128:$dst, 6785 (int_x86_sha1rnds4 VR128:$src1, 6786 (bc_v4i32 (memopv2i64 addr:$src2)), 6787 (i8 imm:$src3)))]>, TA, 6788 Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>; 6789 6790 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, 6791 SchedWriteVecIMul.XMM>; 6792 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, 6793 SchedWriteVecIMul.XMM>; 6794 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, 6795 SchedWriteVecIMul.XMM>; 6796 6797 let Uses=[XMM0] in 6798 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 6799 SchedWriteVecIMul.XMM, 1>; 6800 6801 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, 6802 SchedWriteVecIMul.XMM>; 6803 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, 6804 SchedWriteVecIMul.XMM>; 6805 } 6806 6807 // Aliases with explicit %xmm0 6808 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6809 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; 6810 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6811 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; 6812 6813 //===----------------------------------------------------------------------===// 6814 // AES-NI Instructions 6815 //===----------------------------------------------------------------------===// 6816 6817 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, 6818 Intrinsic IntId, PatFrag ld_frag, 6819 bit Is2Addr = 0, RegisterClass RC = VR128, 6820 X86MemOperand MemOp = i128mem> { 6821 let AsmString = OpcodeStr## 6822 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", 6823 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { 6824 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), 6825 (ins RC:$src1, RC:$src2), "", 6826 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, 6827 Sched<[WriteAESDecEnc]>; 6828 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), 6829 (ins RC:$src1, MemOp:$src2), "", 6830 [(set RC:$dst, (IntId RC:$src1, (ld_frag 
addr:$src2)))]>, 6831 Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>; 6832 } 6833 } 6834 6835 // Perform One Round of an AES Encryption/Decryption Flow 6836 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { 6837 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", 6838 int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG; 6839 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", 6840 int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG; 6841 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", 6842 int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG; 6843 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", 6844 int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG; 6845 } 6846 6847 let Predicates = [NoVLX, HasVAES] in { 6848 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", 6849 int_x86_aesni_aesenc_256, loadv4i64, 0, VR256, 6850 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6851 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", 6852 int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256, 6853 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6854 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", 6855 int_x86_aesni_aesdec_256, loadv4i64, 0, VR256, 6856 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6857 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", 6858 int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256, 6859 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6860 } 6861 6862 let Constraints = "$src1 = $dst" in { 6863 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", 6864 int_x86_aesni_aesenc, memopv2i64, 1>; 6865 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", 6866 int_x86_aesni_aesenclast, memopv2i64, 1>; 6867 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", 6868 int_x86_aesni_aesdec, memopv2i64, 1>; 6869 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", 6870 int_x86_aesni_aesdeclast, memopv2i64, 1>; 6871 } 6872 6873 // Perform the AES InvMixColumn Transformation 6874 let Predicates = [HasAVX, HasAES] in { 6875 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6876 (ins VR128:$src1), 6877 "vaesimc\t{$src1, $dst|$dst, $src1}", 6878 [(set VR128:$dst, 6879 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, 6880 VEX, VEX_WIG; 6881 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6882 (ins i128mem:$src1), 6883 "vaesimc\t{$src1, $dst|$dst, $src1}", 6884 [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, 6885 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; 6886 } 6887 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6888 (ins VR128:$src1), 6889 "aesimc\t{$src1, $dst|$dst, $src1}", 6890 [(set VR128:$dst, 6891 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; 6892 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6893 (ins i128mem:$src1), 6894 "aesimc\t{$src1, $dst|$dst, $src1}", 6895 [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, 6896 Sched<[WriteAESIMC.Folded]>; 6897 6898 // AES Round Key Generation Assist 6899 let Predicates = [HasAVX, HasAES] in { 6900 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6901 (ins VR128:$src1, u8imm:$src2), 6902 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6903 [(set VR128:$dst, 6904 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, 6905 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; 6906 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6907 (ins i128mem:$src1, u8imm:$src2), 6908 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6909 [(set VR128:$dst, 6910 (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, 6911 
Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; 6912 } 6913 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6914 (ins VR128:$src1, u8imm:$src2), 6915 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6916 [(set VR128:$dst, 6917 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, 6918 Sched<[WriteAESKeyGen]>; 6919 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6920 (ins i128mem:$src1, u8imm:$src2), 6921 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6922 [(set VR128:$dst, 6923 (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, 6924 Sched<[WriteAESKeyGen.Folded]>; 6925 6926 //===----------------------------------------------------------------------===// 6927 // PCLMUL Instructions 6928 //===----------------------------------------------------------------------===// 6929 6930 // Immediate transform to help with commuting. 6931 def PCLMULCommuteImm : SDNodeXForm<imm, [{ 6932 uint8_t Imm = N->getZExtValue(); 6933 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6934 }]>; 6935 6936 // SSE carry-less Multiplication instructions 6937 let Predicates = [NoAVX, HasPCLMUL] in { 6938 let Constraints = "$src1 = $dst" in { 6939 let isCommutable = 1 in 6940 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6941 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6942 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6943 [(set VR128:$dst, 6944 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, 6945 Sched<[WriteCLMul]>; 6946 6947 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6948 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6949 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6950 [(set VR128:$dst, 6951 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), 6952 imm:$src3))]>, 6953 Sched<[WriteCLMul.Folded, ReadAfterLd]>; 6954 } // Constraints = "$src1 = $dst" 6955 6956 def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1, 6957 (i8 imm:$src3)), 6958 (PCLMULQDQrm VR128:$src1, addr:$src2, 6959 (PCLMULCommuteImm imm:$src3))>; 6960 } // Predicates = [NoAVX, HasPCLMUL] 6961 6962 // SSE aliases 6963 foreach HI = ["hq","lq"] in 6964 foreach LO = ["hq","lq"] in { 6965 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6966 (PCLMULQDQrr VR128:$dst, VR128:$src, 6967 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6968 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6969 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6970 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6971 } 6972 6973 // AVX carry-less Multiplication instructions 6974 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6975 PatFrag LdFrag, Intrinsic IntId> { 6976 let isCommutable = 1 in 6977 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6978 (ins RC:$src1, RC:$src2, u8imm:$src3), 6979 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6980 [(set RC:$dst, 6981 (IntId RC:$src1, RC:$src2, imm:$src3))]>, 6982 Sched<[WriteCLMul]>; 6983 6984 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6985 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6986 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6987 [(set RC:$dst, 6988 (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, 6989 Sched<[WriteCLMul.Folded, ReadAfterLd]>; 6990 6991 // We can commute a load in the first operand by swapping the sources and 6992 // rotating the immediate. 
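// Bit 0 of the immediate selects the quadword of the first source and bit 4
// selects the quadword of the second source, so commuting the operands just
// exchanges the two nibbles: PCLMULCommuteImm maps 0x01 to 0x10 (0x00 and
// 0x11 are unchanged).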
6993 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), 6994 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6995 (PCLMULCommuteImm imm:$src3))>; 6996 } 6997 6998 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6999 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64, 7000 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 7001 7002 let Predicates = [NoVLX, HasVPCLMULQDQ] in 7003 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64, 7004 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 7005 7006 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 7007 X86MemOperand MemOp, string Hi, string Lo> { 7008 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7009 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 7010 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 7011 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7012 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 7013 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 7014 } 7015 7016 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 7017 X86MemOperand MemOp> { 7018 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 7019 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 7020 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 7021 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 7022 } 7023 7024 // AVX aliases 7025 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 7026 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 7027 7028 //===----------------------------------------------------------------------===// 7029 // SSE4A Instructions 7030 //===----------------------------------------------------------------------===// 7031 7032 let Predicates = [HasSSE4A] in { 7033 7034 let ExeDomain = SSEPackedInt in { 7035 let Constraints = "$src = $dst" in { 7036 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 7037 (ins VR128:$src, u8imm:$len, u8imm:$idx), 7038 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 7039 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, 7040 imm:$idx))]>, 7041 PD, Sched<[SchedWriteVecALU.XMM]>; 7042 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7043 (ins VR128:$src, VR128:$mask), 7044 "extrq\t{$mask, $src|$src, $mask}", 7045 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 7046 VR128:$mask))]>, 7047 PD, Sched<[SchedWriteVecALU.XMM]>; 7048 7049 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 7050 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 7051 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 7052 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 7053 imm:$len, imm:$idx))]>, 7054 XD, Sched<[SchedWriteVecALU.XMM]>; 7055 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7056 (ins VR128:$src, VR128:$mask), 7057 "insertq\t{$mask, $src|$src, $mask}", 7058 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 7059 VR128:$mask))]>, 7060 XD, Sched<[SchedWriteVecALU.XMM]>; 7061 } 7062 } // ExeDomain = SSEPackedInt 7063 7064 // Non-temporal (unaligned) scalar stores. 
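// MOVNTSS/MOVNTSD only accept an XMM source, so the patterns below first
// copy the FR32/FR64 value into VR128 and then store its low element.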
7065 let AddedComplexity = 400 in { // Prefer non-temporal versions 7066 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 7067 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 7068 "movntss\t{$src, $dst|$dst, $src}", []>, XS; 7069 7070 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 7071 "movntsd\t{$src, $dst|$dst, $src}", []>, XD; 7072 } // SchedRW 7073 7074 def : Pat<(nontemporalstore FR32:$src, addr:$dst), 7075 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7076 7077 def : Pat<(nontemporalstore FR64:$src, addr:$dst), 7078 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7079 7080 } // AddedComplexity 7081 } // HasSSE4A 7082 7083 //===----------------------------------------------------------------------===// 7084 // AVX Instructions 7085 //===----------------------------------------------------------------------===// 7086 7087 //===----------------------------------------------------------------------===// 7088 // VBROADCAST - Load from memory and broadcast to all elements of the 7089 // destination operand 7090 // 7091 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 7092 X86MemOperand x86memop, ValueType VT, 7093 PatFrag ld_frag, SchedWrite Sched> : 7094 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7095 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7096 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, 7097 Sched<[Sched]>, VEX; 7098 7099 // AVX2 adds register forms 7100 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 7101 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 7102 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7103 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7104 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 7105 Sched<[Sched]>, VEX; 7106 7107 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 7108 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, 7109 f32mem, v4f32, loadf32, 7110 SchedWriteFShuffle.XMM.Folded>; 7111 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 7112 f32mem, v8f32, loadf32, 7113 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7114 } 7115 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 7116 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 7117 v4f64, loadf64, 7118 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7119 7120 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 7121 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 7122 v4f32, v4f32, SchedWriteFShuffle.XMM>; 7123 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 7124 v8f32, v4f32, WriteFShuffle256>, VEX_L; 7125 } 7126 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 7127 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 7128 v4f64, v2f64, WriteFShuffle256>, VEX_L; 7129 7130 let Predicates = [HasAVX, NoVLX] in { 7131 def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), 7132 (VBROADCASTSSrm addr:$src)>; 7133 def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), 7134 (VBROADCASTSSYrm addr:$src)>; 7135 def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), 7136 (VBROADCASTSDYrm addr:$src)>; 7137 } 7138 7139 //===----------------------------------------------------------------------===// 7140 // VBROADCAST*128 - Load from 
memory and broadcast 128-bit vector to both 7141 // halves of a 256-bit vector. 7142 // 7143 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in 7144 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), 7145 (ins i128mem:$src), 7146 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, 7147 Sched<[WriteShuffleLd]>, VEX, VEX_L; 7148 7149 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], 7150 ExeDomain = SSEPackedSingle in 7151 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), 7152 (ins f128mem:$src), 7153 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, 7154 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; 7155 7156 let Predicates = [HasAVX2, NoVLX] in { 7157 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), 7158 (VBROADCASTI128 addr:$src)>; 7159 def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), 7160 (VBROADCASTI128 addr:$src)>; 7161 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), 7162 (VBROADCASTI128 addr:$src)>; 7163 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), 7164 (VBROADCASTI128 addr:$src)>; 7165 } 7166 7167 let Predicates = [HasAVX, NoVLX] in { 7168 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), 7169 (VBROADCASTF128 addr:$src)>; 7170 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), 7171 (VBROADCASTF128 addr:$src)>; 7172 } 7173 7174 let Predicates = [HasAVX1Only] in { 7175 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), 7176 (VBROADCASTF128 addr:$src)>; 7177 def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), 7178 (VBROADCASTF128 addr:$src)>; 7179 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), 7180 (VBROADCASTF128 addr:$src)>; 7181 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), 7182 (VBROADCASTF128 addr:$src)>; 7183 } 7184 7185 //===----------------------------------------------------------------------===// 7186 // VINSERTF128 - Insert packed floating-point values 7187 // 7188 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7189 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), 7190 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7191 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7192 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; 7193 let mayLoad = 1 in 7194 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), 7195 (ins VR256:$src1, f128mem:$src2, u8imm:$src3), 7196 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7197 []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; 7198 } 7199 7200 // To create a 256-bit all ones value, we should produce VCMPTRUEPS 7201 // with YMM register containing zero. 7202 // FIXME: Avoid producing vxorps to clear the fake inputs. 
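// Comparison predicate 0xf is the always-true predicate (TRUE_UQ), so the
// compare produces all-ones in every lane regardless of the zeroed inputs.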
7203 let Predicates = [HasAVX1Only] in { 7204 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; 7205 } 7206 7207 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, 7208 PatFrag memop_frag> { 7209 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), 7210 (iPTR imm)), 7211 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, 7212 (INSERT_get_vinsert128_imm VR256:$ins))>; 7213 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), 7214 (From (bitconvert (memop_frag addr:$src2))), 7215 (iPTR imm)), 7216 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7217 (INSERT_get_vinsert128_imm VR256:$ins))>; 7218 } 7219 7220 let Predicates = [HasAVX, NoVLX] in { 7221 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>; 7222 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>; 7223 } 7224 7225 let Predicates = [HasAVX1Only] in { 7226 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; 7227 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>; 7228 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>; 7229 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>; 7230 } 7231 7232 //===----------------------------------------------------------------------===// 7233 // VEXTRACTF128 - Extract packed floating-point values 7234 // 7235 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7236 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), 7237 (ins VR256:$src1, u8imm:$src2), 7238 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7239 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L; 7240 let mayStore = 1 in 7241 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), 7242 (ins f128mem:$dst, VR256:$src1, u8imm:$src2), 7243 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7244 []>, Sched<[WriteFStoreX]>, VEX, VEX_L; 7245 } 7246 7247 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> { 7248 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), 7249 (To (!cast<Instruction>(InstrStr#rr) 7250 (From VR256:$src1), 7251 (EXTRACT_get_vextract128_imm VR128:$ext)))>; 7252 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1), 7253 (iPTR imm))), addr:$dst), 7254 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1, 7255 (EXTRACT_get_vextract128_imm VR128:$ext))>; 7256 } 7257 7258 // AVX1 patterns 7259 let Predicates = [HasAVX, NoVLX] in { 7260 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>; 7261 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>; 7262 } 7263 7264 let Predicates = [HasAVX1Only] in { 7265 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; 7266 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; 7267 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; 7268 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; 7269 } 7270 7271 //===----------------------------------------------------------------------===// 7272 // VMASKMOV - Conditional SIMD Packed Loads and Stores 7273 // 7274 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, 7275 Intrinsic IntLd, Intrinsic IntLd256, 7276 Intrinsic IntSt, Intrinsic IntSt256> { 7277 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), 7278 (ins VR128:$src1, f128mem:$src2), 7279 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7280 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, 7281 VEX_4V, Sched<[WriteFMaskedLoad]>; 7282 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), 7283 
(ins VR256:$src1, f256mem:$src2), 7284 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7285 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7286 VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; 7287 def mr : AVX8I<opc_mr, MRMDestMem, (outs), 7288 (ins f128mem:$dst, VR128:$src1, VR128:$src2), 7289 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7290 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, 7291 VEX_4V, Sched<[WriteFMaskedStore]>; 7292 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), 7293 (ins f256mem:$dst, VR256:$src1, VR256:$src2), 7294 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7295 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7296 VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; 7297 } 7298 7299 let ExeDomain = SSEPackedSingle in 7300 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 7301 int_x86_avx_maskload_ps, 7302 int_x86_avx_maskload_ps_256, 7303 int_x86_avx_maskstore_ps, 7304 int_x86_avx_maskstore_ps_256>; 7305 let ExeDomain = SSEPackedDouble in 7306 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", 7307 int_x86_avx_maskload_pd, 7308 int_x86_avx_maskload_pd_256, 7309 int_x86_avx_maskstore_pd, 7310 int_x86_avx_maskstore_pd_256>; 7311 7312 //===----------------------------------------------------------------------===// 7313 // VPERMIL - Permute Single and Double Floating-Point Values 7314 // 7315 7316 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, 7317 RegisterClass RC, X86MemOperand x86memop_f, 7318 X86MemOperand x86memop_i, PatFrag i_frag, 7319 ValueType f_vt, ValueType i_vt, 7320 X86FoldableSchedWrite sched, 7321 X86FoldableSchedWrite varsched> { 7322 let Predicates = [HasAVX, NoVLX] in { 7323 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), 7324 (ins RC:$src1, RC:$src2), 7325 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7326 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V, 7327 Sched<[varsched]>; 7328 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), 7329 (ins RC:$src1, x86memop_i:$src2), 7330 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7331 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, 7332 (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V, 7333 Sched<[varsched.Folded, ReadAfterLd]>; 7334 7335 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), 7336 (ins RC:$src1, u8imm:$src2), 7337 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7338 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, 7339 Sched<[sched]>; 7340 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), 7341 (ins x86memop_f:$src1, u8imm:$src2), 7342 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7343 [(set RC:$dst, 7344 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, 7345 Sched<[sched.Folded]>; 7346 }// Predicates = [HasAVX, NoVLX] 7347 } 7348 7349 let ExeDomain = SSEPackedSingle in { 7350 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, 7351 loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM, 7352 SchedWriteFVarShuffle.XMM>; 7353 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, 7354 loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM, 7355 SchedWriteFVarShuffle.YMM>, VEX_L; 7356 } 7357 let ExeDomain = SSEPackedDouble in { 7358 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, 7359 loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM, 7360 SchedWriteFVarShuffle.XMM>; 7361 defm 
VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7362 loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
7363 SchedWriteFVarShuffle.YMM>, VEX_L;
7364 }
7365
7366 //===----------------------------------------------------------------------===//
7367 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7368 //
7369
7370 let ExeDomain = SSEPackedSingle in {
7371 let isCommutable = 1 in
7372 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7373 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7374 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7375 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7376 (i8 imm:$src3))))]>, VEX_4V, VEX_L,
7377 Sched<[WriteFShuffle256]>;
7378 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7379 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7380 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7381 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7382 (i8 imm:$src3)))]>, VEX_4V, VEX_L,
7383 Sched<[WriteFShuffle256Ld, ReadAfterLd]>;
7384 }
7385
7386 // Immediate transform to help with commuting.
7387 def Perm2XCommuteImm : SDNodeXForm<imm, [{
7388 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7389 }]>;
7390
7391 let Predicates = [HasAVX] in {
7392 // Pattern with load in other operand.
7393 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7394 VR256:$src1, (i8 imm:$imm))),
7395 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7396 }
7397
7398 let Predicates = [HasAVX1Only] in {
7399 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7400 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7401 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7402 (loadv4i64 addr:$src2), (i8 imm:$imm))),
7403 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
7404 // Pattern with load in other operand.
7405 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7406 VR256:$src1, (i8 imm:$imm))),
7407 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7408 }
7409
7410 //===----------------------------------------------------------------------===//
7411 // VZERO - Zero YMM registers
7412 // Note: These instructions do not affect YMM16-YMM31.
7413 // 7414 7415 let SchedRW = [WriteSystem] in { 7416 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 7417 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { 7418 // Zero All YMM registers 7419 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", 7420 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, 7421 Requires<[HasAVX]>, VEX_WIG; 7422 7423 // Zero Upper bits of YMM registers 7424 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", 7425 [(int_x86_avx_vzeroupper)]>, PS, VEX, 7426 Requires<[HasAVX]>, VEX_WIG; 7427 } // Defs 7428 } // SchedRW 7429 7430 //===----------------------------------------------------------------------===// 7431 // Half precision conversion instructions 7432 // 7433 7434 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, 7435 X86FoldableSchedWrite sched> { 7436 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7437 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7438 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, 7439 T8PD, VEX, Sched<[sched]>; 7440 let hasSideEffects = 0, mayLoad = 1 in 7441 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7442 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7443 [(set RC:$dst, (X86cvtph2ps (bc_v8i16 7444 (loadv2i64 addr:$src))))]>, 7445 T8PD, VEX, Sched<[sched.Folded]>; 7446 } 7447 7448 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, 7449 SchedWrite RR, SchedWrite MR> { 7450 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), 7451 (ins RC:$src1, i32u8imm:$src2), 7452 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7453 [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, 7454 TAPD, VEX, Sched<[RR]>; 7455 let hasSideEffects = 0, mayStore = 1 in 7456 def mr : Ii8<0x1D, MRMDestMem, (outs), 7457 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), 7458 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7459 TAPD, VEX, Sched<[MR]>; 7460 } 7461 7462 let Predicates = [HasF16C, NoVLX] in { 7463 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>; 7464 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L; 7465 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, 7466 WriteCvtPS2PHSt>; 7467 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, 7468 WriteCvtPS2PHYSt>, VEX_L; 7469 7470 // Pattern match vcvtph2ps of a scalar i64 load. 7471 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), 7472 (VCVTPH2PSrm addr:$src)>; 7473 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), 7474 (VCVTPH2PSrm addr:$src)>; 7475 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert 7476 (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), 7477 (VCVTPH2PSrm addr:$src)>; 7478 7479 def : Pat<(store (f64 (extractelt 7480 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), 7481 (iPTR 0))), addr:$dst), 7482 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; 7483 def : Pat<(store (i64 (extractelt 7484 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), 7485 (iPTR 0))), addr:$dst), 7486 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; 7487 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), 7488 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; 7489 } 7490 7491 // Patterns for matching conversions from float to half-float and vice versa. 7492 let Predicates = [HasF16C, NoVLX] in { 7493 // Use MXCSR.RC for rounding instead of explicitly specifying the default 7494 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the 7495 // configurations we support (the default). 
However, falling back to MXCSR is 7496 // more consistent with other instructions, which are always controlled by it. 7497 // It's encoded as 0b100. 7498 def : Pat<(fp_to_f16 FR32:$src), 7499 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr 7500 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>; 7501 7502 def : Pat<(f16_to_fp GR16:$src), 7503 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr 7504 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >; 7505 7506 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), 7507 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr 7508 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >; 7509 } 7510 7511 //===----------------------------------------------------------------------===// 7512 // AVX2 Instructions 7513 //===----------------------------------------------------------------------===// 7514 7515 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate 7516 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 7517 ValueType OpVT, X86FoldableSchedWrite sched, 7518 RegisterClass RC, PatFrag memop_frag, 7519 X86MemOperand x86memop, SDNodeXForm commuteXForm> { 7520 let isCommutable = 1 in 7521 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), 7522 (ins RC:$src1, RC:$src2, u8imm:$src3), 7523 !strconcat(OpcodeStr, 7524 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7525 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 7526 Sched<[sched]>, VEX_4V; 7527 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), 7528 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 7529 !strconcat(OpcodeStr, 7530 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7531 [(set RC:$dst, 7532 (OpVT (OpNode RC:$src1, 7533 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, 7534 Sched<[sched.Folded, ReadAfterLd]>, VEX_4V; 7535 7536 // Pattern to commute if load is in first source. 7537 def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), 7538 RC:$src1, imm:$src3)), 7539 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 7540 (commuteXForm imm:$src3))>; 7541 } 7542 7543 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, 7544 SchedWriteBlend.XMM, VR128, loadv2i64, i128mem, 7545 BlendCommuteImm4>; 7546 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, 7547 SchedWriteBlend.YMM, VR256, loadv4i64, i256mem, 7548 BlendCommuteImm8>, VEX_L; 7549 7550 // For insertion into the zero index (low half) of a 256-bit vector, it is 7551 // more efficient to generate a blend with immediate instead of an insert*128. 
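// The 128-bit value is first widened to 256 bits; the 0xf immediate then
// takes the four low dwords from that (second) operand and keeps the upper
// half of $src1 unchanged.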
7552 let Predicates = [HasAVX2] in { 7553 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7554 (VPBLENDDYrri VR256:$src1, 7555 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7556 VR128:$src2, sub_xmm), 0xf)>; 7557 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7558 (VPBLENDDYrri VR256:$src1, 7559 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7560 VR128:$src2, sub_xmm), 0xf)>; 7561 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7562 (VPBLENDDYrri VR256:$src1, 7563 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7564 VR128:$src2, sub_xmm), 0xf)>; 7565 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7566 (VPBLENDDYrri VR256:$src1, 7567 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7568 VR128:$src2, sub_xmm), 0xf)>; 7569 } 7570 7571 let Predicates = [HasAVX1Only] in { 7572 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7573 (VBLENDPSYrri VR256:$src1, 7574 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7575 VR128:$src2, sub_xmm), 0xf)>; 7576 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7577 (VBLENDPSYrri VR256:$src1, 7578 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7579 VR128:$src2, sub_xmm), 0xf)>; 7580 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7581 (VBLENDPSYrri VR256:$src1, 7582 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7583 VR128:$src2, sub_xmm), 0xf)>; 7584 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7585 (VBLENDPSYrri VR256:$src1, 7586 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7587 VR128:$src2, sub_xmm), 0xf)>; 7588 } 7589 7590 //===----------------------------------------------------------------------===// 7591 // VPBROADCAST - Load from memory and broadcast to all elements of the 7592 // destination operand 7593 // 7594 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7595 X86MemOperand x86memop, PatFrag ld_frag, 7596 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7597 let Predicates = [HasAVX2, prd] in { 7598 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7599 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7600 [(set VR128:$dst, 7601 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7602 Sched<[SchedWriteShuffle.XMM]>, VEX; 7603 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7604 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7605 [(set VR128:$dst, 7606 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, 7607 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7608 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7609 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7610 [(set VR256:$dst, 7611 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7612 Sched<[WriteShuffle256]>, VEX, VEX_L; 7613 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7614 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7615 [(set VR256:$dst, 7616 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, 7617 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7618 7619 // Provide aliases for broadcast from the same register class that 7620 // automatically does the extract. 
7621 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7622 (!cast<Instruction>(NAME#"Yrr") 7623 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7624 } 7625 } 7626 7627 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, 7628 v16i8, v32i8, NoVLX_Or_NoBWI>; 7629 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, 7630 v8i16, v16i16, NoVLX_Or_NoBWI>; 7631 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, 7632 v4i32, v8i32, NoVLX>; 7633 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, 7634 v2i64, v4i64, NoVLX>; 7635 7636 let Predicates = [HasAVX2, NoVLX] in { 7637 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. 7638 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), 7639 (VPBROADCASTQrm addr:$src)>; 7640 def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), 7641 (VPBROADCASTQYrm addr:$src)>; 7642 7643 def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 7644 (VPBROADCASTDrm addr:$src)>; 7645 def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 7646 (VPBROADCASTDYrm addr:$src)>; 7647 def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 7648 (VPBROADCASTQrm addr:$src)>; 7649 def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 7650 (VPBROADCASTQYrm addr:$src)>; 7651 } 7652 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7653 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. 7654 // This means we'll encounter truncated i32 loads; match that here. 7655 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7656 (VPBROADCASTWrm addr:$src)>; 7657 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7658 (VPBROADCASTWYrm addr:$src)>; 7659 def : Pat<(v8i16 (X86VBroadcast 7660 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7661 (VPBROADCASTWrm addr:$src)>; 7662 def : Pat<(v16i16 (X86VBroadcast 7663 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7664 (VPBROADCASTWYrm addr:$src)>; 7665 } 7666 7667 let Predicates = [HasAVX2, NoVLX] in { 7668 // Provide aliases for broadcast from the same register class that 7669 // automatically does the extract. 7670 def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), 7671 (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), 7672 sub_xmm)))>; 7673 def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), 7674 (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), 7675 sub_xmm)))>; 7676 } 7677 7678 let Predicates = [HasAVX2, NoVLX] in { 7679 // Provide fallback in case the load node that is used in the patterns above 7680 // is used by additional users, which prevents the pattern selection. 
7681 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7682 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7683 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7684 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7685 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7686 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7687 } 7688 7689 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7690 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 7691 (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS 7692 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7693 GR8:$src, sub_8bit)), 7694 VR128)))>; 7695 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 7696 (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS 7697 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7698 GR8:$src, sub_8bit)), 7699 VR128)))>; 7700 7701 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 7702 (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS 7703 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7704 GR16:$src, sub_16bit)), 7705 VR128)))>; 7706 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 7707 (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS 7708 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7709 GR16:$src, sub_16bit)), 7710 VR128)))>; 7711 } 7712 let Predicates = [HasAVX2, NoVLX] in { 7713 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7714 (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; 7715 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7716 (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; 7717 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7718 (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; 7719 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7720 (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; 7721 } 7722 7723 // AVX1 broadcast patterns 7724 let Predicates = [HasAVX1Only] in { 7725 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), 7726 (VBROADCASTSSYrm addr:$src)>; 7727 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), 7728 (VBROADCASTSDYrm addr:$src)>; 7729 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), 7730 (VBROADCASTSSrm addr:$src)>; 7731 } 7732 7733 // Provide fallback in case the load node that is used in the patterns above 7734 // is used by additional users, which prevents the pattern selection. 
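// A v2f64 splat uses vmovddup, which duplicates the low double into both
// elements of the destination.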
7735 let Predicates = [HasAVX, NoVLX] in { 7736 // 128bit broadcasts: 7737 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7738 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7739 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), 7740 (VMOVDDUPrm addr:$src)>; 7741 7742 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7743 (VMOVDDUPrr VR128:$src)>; 7744 def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), 7745 (VMOVDDUPrm addr:$src)>; 7746 } 7747 7748 let Predicates = [HasAVX1Only] in { 7749 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7750 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7751 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7752 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7753 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7754 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7755 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7756 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7757 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7758 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7759 7760 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7761 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; 7762 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7763 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7764 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), 7765 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>; 7766 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7767 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7768 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), 7769 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; 7770 7771 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7772 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; 7773 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), 7774 (VMOVDDUPrm addr:$src)>; 7775 } 7776 7777 //===----------------------------------------------------------------------===// 7778 // VPERM - Permute instructions 7779 // 7780 7781 multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7782 ValueType OpVT, X86FoldableSchedWrite Sched, 7783 X86MemOperand memOp> { 7784 let Predicates = [HasAVX2, NoVLX] in { 7785 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7786 (ins VR256:$src1, VR256:$src2), 7787 !strconcat(OpcodeStr, 7788 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7789 [(set VR256:$dst, 7790 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7791 Sched<[Sched]>, VEX_4V, VEX_L; 7792 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7793 (ins VR256:$src1, memOp:$src2), 7794 !strconcat(OpcodeStr, 7795 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7796 [(set VR256:$dst, 7797 (OpVT (X86VPermv VR256:$src1, 7798 (bitconvert (mem_frag addr:$src2)))))]>, 7799 Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; 7800 } 7801 } 7802 7803 defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256, 7804 i256mem>; 7805 let ExeDomain = SSEPackedSingle in 7806 defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256, 7807 f256mem>; 7808 7809 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7810 ValueType OpVT, X86FoldableSchedWrite Sched, 7811 X86MemOperand memOp> { 7812 let Predicates = [HasAVX2, NoVLX] in { 7813 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), 7814 (ins VR256:$src1, 
u8imm:$src2), 7815 !strconcat(OpcodeStr, 7816 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7817 [(set VR256:$dst, 7818 (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, 7819 Sched<[Sched]>, VEX, VEX_L; 7820 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), 7821 (ins memOp:$src1, u8imm:$src2), 7822 !strconcat(OpcodeStr, 7823 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7824 [(set VR256:$dst, 7825 (OpVT (X86VPermi (mem_frag addr:$src1), 7826 (i8 imm:$src2))))]>, 7827 Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; 7828 } 7829 } 7830 7831 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, 7832 WriteShuffle256, i256mem>, VEX_W; 7833 let ExeDomain = SSEPackedDouble in 7834 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, 7835 WriteFShuffle256, f256mem>, VEX_W; 7836 7837 //===----------------------------------------------------------------------===// 7838 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks 7839 // 7840 let isCommutable = 1 in 7841 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), 7842 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7843 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7844 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, 7845 (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, 7846 VEX_4V, VEX_L; 7847 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), 7848 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7849 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7850 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), 7851 (i8 imm:$src3)))]>, 7852 Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; 7853 7854 let Predicates = [HasAVX2] in 7855 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), 7856 VR256:$src1, (i8 imm:$imm))), 7857 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; 7858 7859 7860 //===----------------------------------------------------------------------===// 7861 // VINSERTI128 - Insert packed integer values 7862 // 7863 let hasSideEffects = 0 in { 7864 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), 7865 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7866 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7867 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; 7868 let mayLoad = 1 in 7869 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), 7870 (ins VR256:$src1, i128mem:$src2, u8imm:$src3), 7871 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7872 []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; 7873 } 7874 7875 let Predicates = [HasAVX2, NoVLX] in { 7876 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; 7877 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>; 7878 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>; 7879 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>; 7880 } 7881 7882 //===----------------------------------------------------------------------===// 7883 // VEXTRACTI128 - Extract packed integer values 7884 // 7885 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), 7886 (ins VR256:$src1, u8imm:$src2), 7887 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7888 Sched<[WriteShuffle256]>, VEX, VEX_L; 7889 let hasSideEffects = 0, mayStore = 1 in 7890 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), 7891 (ins i128mem:$dst, VR256:$src1, u8imm:$src2), 7892 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 
          []>, Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
            VEX_4V, Sched<[WriteVecMaskedLoad]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, i256mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
            VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i128mem:$dst, VR128:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
            VEX_4V, Sched<[WriteVecMaskedStore]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i256mem:$dst, VR256:$src1, VR256:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
            VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
  // masked store
  def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
                         (VT (bitconvert (ZeroVT immAllZerosV))))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
           (!cast<Instruction>(BlendStr#"rr")
               RC:$src0,
               (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
               RC:$mask)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32, "VBLENDVPS",  v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64, "VBLENDVPD",  v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
let Predicates = [HasAVX1Only] in {
  // Integer (i32/i64) masked loads/stores are not available here; use the
  // ps/pd versions instead.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32,
v8i32, "VBLENDVPSY", v8i32>; 7967 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; 7968 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; 7969 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; 7970 } 7971 let Predicates = [HasAVX2] in { 7972 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; 7973 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; 7974 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; 7975 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; 7976 } 7977 7978 //===----------------------------------------------------------------------===// 7979 // SubVector Broadcasts 7980 // Provide fallback in case the load node that is used in the patterns above 7981 // is used by additional users, which prevents the pattern selection. 7982 7983 let Predicates = [HasAVX2, NoVLX] in { 7984 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), 7985 (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7986 (v2i64 VR128:$src), 1)>; 7987 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), 7988 (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7989 (v4i32 VR128:$src), 1)>; 7990 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), 7991 (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7992 (v8i16 VR128:$src), 1)>; 7993 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), 7994 (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7995 (v16i8 VR128:$src), 1)>; 7996 } 7997 7998 let Predicates = [HasAVX, NoVLX] in { 7999 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), 8000 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 8001 (v2f64 VR128:$src), 1)>; 8002 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), 8003 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 8004 (v4f32 VR128:$src), 1)>; 8005 } 8006 8007 let Predicates = [HasAVX1Only] in { 8008 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), 8009 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 8010 (v2i64 VR128:$src), 1)>; 8011 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), 8012 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 8013 (v4i32 VR128:$src), 1)>; 8014 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), 8015 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 8016 (v8i16 VR128:$src), 1)>; 8017 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), 8018 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 8019 (v16i8 VR128:$src), 1)>; 8020 } 8021 8022 //===----------------------------------------------------------------------===// 8023 // Variable Bit Shifts 8024 // 8025 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, 8026 ValueType vt128, ValueType vt256> { 8027 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), 8028 (ins VR128:$src1, VR128:$src2), 8029 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 8030 [(set VR128:$dst, 8031 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, 8032 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>; 8033 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), 8034 (ins VR128:$src1, i128mem:$src2), 8035 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, 
$src1, $src2}"), 8036 [(set VR128:$dst, 8037 (vt128 (OpNode VR128:$src1, 8038 (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, 8039 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>; 8040 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 8041 (ins VR256:$src1, VR256:$src2), 8042 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 8043 [(set VR256:$dst, 8044 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, 8045 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; 8046 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 8047 (ins VR256:$src1, i256mem:$src2), 8048 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 8049 [(set VR256:$dst, 8050 (vt256 (OpNode VR256:$src1, 8051 (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, 8052 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>; 8053 } 8054 8055 let Predicates = [HasAVX2, NoVLX] in { 8056 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; 8057 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; 8058 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; 8059 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; 8060 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; 8061 8062 def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), 8063 (VPSRAVDrr VR128:$src1, VR128:$src2)>; 8064 def : Pat<(v4i32 (X86vsrav VR128:$src1, 8065 (bitconvert (loadv2i64 addr:$src2)))), 8066 (VPSRAVDrm VR128:$src1, addr:$src2)>; 8067 def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)), 8068 (VPSRAVDYrr VR256:$src1, VR256:$src2)>; 8069 def : Pat<(v8i32 (X86vsrav VR256:$src1, 8070 (bitconvert (loadv4i64 addr:$src2)))), 8071 (VPSRAVDYrm VR256:$src1, addr:$src2)>; 8072 } 8073 8074 //===----------------------------------------------------------------------===// 8075 // VGATHER - GATHER Operations 8076 8077 // FIXME: Improve scheduling of gather instructions. 
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
                       "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                  (GatherNode128 VR128:$src1, VR128:$mask,
                                 vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
                       "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                  (GatherNode256 RC256:$src1, RC256:$mask,
                                 vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}

let Predicates = [UseAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                                  mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                                  mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                                  mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                                  mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                                    mgatherv4i32, VR256, vx128mem, vx256mem,
                                    v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                                    mgatherv4i64, VR256, vx128mem, vy256mem,
                                    v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                                    mgatherv8i32, VR256, vx128mem, vy256mem,
                                    v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                                    mgatherv4i64, VR128, vx64mem, vy128mem,
                                    v4i32, v4i32>;
    }
  }
}

//===----------------------------------------------------------------------===//
// Extra selection patterns for f128, f128mem

// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;

def : Pat<(alignedloadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;

// andps is shorter than andpd or pand.
// andps is in SSE and andpd/pand are in SSE2.
def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
            (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
            VR128)>;

def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
            (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                     (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
            (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
            VR128)>;

def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
            (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
            (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
            VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
            (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                     (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (bitconvert (MemOpFrag addr:$src2)))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (bitconvert (MemOpFrag addr:$src2)),
                                        imm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, loadv2i64, i128mem, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
                              i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
                                  i128mem>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
                                  i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
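
// Illustrative note: the defm lines above expand to instruction names such as
// GF2P8MULBrr/GF2P8MULBrm for the SSE encoding, VGF2P8MULBrr/VGF2P8MULBYrm for
// the VEX encodings, and GF2P8AFFINEQBrri/VGF2P8AFFINEQBYrmi for the affine
// forms. As a rough sketch of the two-address SSE asm produced by the Is2Addr
// AsmString (immediate and registers chosen arbitrarily):
//   gf2p8affineqb $0x1, %xmm1, %xmm0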