// Home | History | Annotate | Download | only in X86
      1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file describes the X86 SSE instruction set, defining the instructions,
     11 // and properties of the instructions which are needed for code generation,
     12 // machine code emission, and analysis.
     13 //
     14 //===----------------------------------------------------------------------===//
     15 
     16 //===----------------------------------------------------------------------===//
     17 // SSE 1 & 2 Instructions Classes
     18 //===----------------------------------------------------------------------===//
     19 
      20 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
         ///
         /// Instantiates the reg-reg (rr) and reg-mem (rm) forms of a binary
         /// scalar FP operation. Is2Addr selects the two-operand SSE asm
         /// string; when 0, the three-operand AVX asm string is printed.
      21 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
      22                            RegisterClass RC, X86MemOperand x86memop,
      23                            Domain d, X86FoldableSchedWrite sched,
      24                            bit Is2Addr = 1> {
         // rr: both sources in registers; the node is commutable, which lets
         // the two-address pass swap the operands.
      25   let isCommutable = 1 in {
      26     def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
      27        !if(Is2Addr,
      28            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
      29            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      30        [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
      31        Sched<[sched]>;
      32   }
         // rm: second source folded from memory; uses the folded-load
         // scheduling class.
      33   def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      34        !if(Is2Addr,
      35            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
      36            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      37        [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
      38        Sched<[sched.Folded, ReadAfterLd]>;
      39 }
     40 
      41 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
         ///
         /// Like sse12_fp_scalar, but matches the intrinsic form that operates
         /// on a full vector register (VT). Marked isCodeGenOnly because the
         /// assembly strings duplicate the non-intrinsic definitions.
      42 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
      43                                SDPatternOperator OpNode, RegisterClass RC,
      44                                ValueType VT, string asm, Operand memopr,
      45                                ComplexPattern mem_cpat, Domain d,
      46                                X86FoldableSchedWrite sched, bit Is2Addr = 1> {
      47 let isCodeGenOnly = 1, hasSideEffects = 0 in {
      48   def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
      49        !if(Is2Addr,
      50            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
      51            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      52        [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
      53        Sched<[sched]>;
         // Memory form: hasSideEffects = 0 above suppresses the inferred
         // mayLoad, so it must be restored explicitly here.
      54   let mayLoad = 1 in
      55   def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
      56        !if(Is2Addr,
      57            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
      58            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      59        [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
      60        Sched<[sched.Folded, ReadAfterLd]>;
      61 }
      62 }
     63 
      64 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
         ///
         /// Reg-reg and reg-mem forms of a binary packed FP operation of
         /// vector type vt; mem_frag supplies the (possibly alignment-
         /// checking) load fragment for the folded form.
      65 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
      66                            RegisterClass RC, ValueType vt,
      67                            X86MemOperand x86memop, PatFrag mem_frag,
      68                            Domain d, X86FoldableSchedWrite sched,
      69                            bit Is2Addr = 1> {
      70   let isCommutable = 1 in
      71     def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
      72        !if(Is2Addr,
      73            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
      74            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      75        [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
      76        Sched<[sched]>;
      77   let mayLoad = 1 in
      78     def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      79        !if(Is2Addr,
      80            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
      81            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      82        [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
      83           d>,
      84        Sched<[sched.Folded, ReadAfterLd]>;
      85 }
     86 
      87 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
         ///
         /// Variant for packed logical ops: the caller supplies the selection
         /// patterns directly (pat_rr / pat_rm) instead of an SDNode, since
         /// the same instruction may match several bitwise dag forms.
      88 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
      89                                       string OpcodeStr, X86MemOperand x86memop,
      90                                       X86FoldableSchedWrite sched,
      91                                       list<dag> pat_rr, list<dag> pat_rm,
      92                                       bit Is2Addr = 1> {
      93   let isCommutable = 1, hasSideEffects = 0 in
      94     def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
      95        !if(Is2Addr,
      96            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
      97            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      98        pat_rr, d>,
      99        Sched<[sched]>;
         // hasSideEffects = 0 suppresses inference from the caller-supplied
         // pattern, so mayLoad is stated explicitly for the memory form.
     100   let hasSideEffects = 0, mayLoad = 1 in
     101   def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     102        !if(Is2Addr,
     103            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
     104            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     105        pat_rm, d>,
     106        Sched<[sched.Folded, ReadAfterLd]>;
     107 }
    108 
    109 
     110 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
     111 // This is expanded by ExpandPostRAPseudos.
     112 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     113     isPseudo = 1, SchedRW = [WriteZero] in {
         // Scalar FP +0.0 materialization pseudos; only selected when
         // AVX-512 is unavailable (see Requires<[..., NoAVX512]>).
     114   def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
     115                    [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
     116   def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
     117                    [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
     118 }
    119 
    120 //===----------------------------------------------------------------------===//
    121 // AVX & SSE - Zero/One Vectors
    122 //===----------------------------------------------------------------------===//
    123 
     124 // Alias instruction that maps zero vector to pxor / xorp* for sse.
     125 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
     126 // swizzled by ExecutionDomainFix to pxor.
     127 // We set canFoldAsLoad because this can be converted to a constant-pool
     128 // load of an all-zeros value if folding it would be beneficial.
     129 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     130     isPseudo = 1, SchedRW = [WriteZero] in {
     131 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
     132                [(set VR128:$dst, (v4f32 immAllZerosV))]>;
     133 }
     134 
         // Reuse the same pseudo for the integer all-zeros vector when
         // AVX-512 zeroing is not available.
     135 let Predicates = [NoAVX512] in
     136 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
    137 
    138 
     139 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
     140 // and doesn't need it because on Sandy Bridge the register is set to zero
     141 // at the rename stage without using any execution unit, so SET0PSY
     142 // and SET0PDY can be used for vector int instructions without penalty.
     143 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     144     isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
         // 256-bit all-zeros vector pseudo (expanded post-RA like V_SET0).
     145 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
     146                  [(set VR256:$dst, (v8i32 immAllZerosV))]>;
     147 }
    148 
     149 // We set canFoldAsLoad because this can be converted to a constant-pool
     150 // load of an all-ones value if folding it would be beneficial.
     151 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     152     isPseudo = 1, SchedRW = [WriteZero] in {
         // 128-bit all-ones vector pseudo.
     153   def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
     154                        [(set VR128:$dst, (v4i32 immAllOnesV))]>;
         // 256-bit all-ones: separate pseudos for AVX1 (restricted to
         // OptForMinSize) and AVX2, selected by predicate.
     155   let Predicates = [HasAVX1Only, OptForMinSize] in {
     156   def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
     157                           [(set VR256:$dst, (v8i32 immAllOnesV))]>;
     158   }
     159   let Predicates = [HasAVX2] in
     160   def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
     161                           [(set VR256:$dst, (v8i32 immAllOnesV))]>;
     162 }
    163 
    164 //===----------------------------------------------------------------------===//
    165 // SSE 1 & 2 - Move FP Scalar Instructions
    166 //
    167 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
    168 // register copies because it's a partial register update; Register-to-register
    169 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
     170 // that the insert be implementable in terms of a copy, and, as just mentioned, we
    171 // don't use movss/movsd for copies.
    172 //===----------------------------------------------------------------------===//
    173 
         /// sse12_move_rr - register-to-register MOVSS/MOVSD forms.
         /// Emits the canonical rr encoding (opcode 0x10) plus an rr_REV
         /// encoding (opcode 0x11) that exists only for the disassembler.
     174 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
     175                          X86MemOperand x86memop, string base_opc,
     176                          string asm_opr, Domain d, string Name> {
     177   let isCommutable = 1 in
     178   def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
     179               (ins VR128:$src1, VR128:$src2),
     180               !strconcat(base_opc, asm_opr),
     181               [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
     182               Sched<[SchedWriteFShuffle.XMM]>;
     183 
     184   // For the disassembler
     185   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
     186   def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
     187                   (ins VR128:$src1, VR128:$src2),
     188                   !strconcat(base_opc, asm_opr), []>,
     189                   Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
     190 }
    191 
         /// sse12_move - full MOVSS/MOVSD family: VEX-encoded (V-prefixed)
         /// and legacy SSE rr forms plus the reg-to-mem stores, with ".s"
         /// assembler aliases for the reversed-encoding rr forms.
     192 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
     193                       X86MemOperand x86memop, string OpcodeStr,
     194                       Domain d, string Name, Predicate pred> {
     195   // AVX
     196   let Predicates = [UseAVX, OptForSize] in
     197   defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
     198                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
     199                               "V"#Name>,
     200                               VEX_4V, VEX_LIG, VEX_WIG;
     201 
         // VEX-encoded scalar store.
     202   def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
     203                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
     204                      [(store RC:$src, addr:$dst)], d>,
     205                      VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
     206   // SSE1 & 2
     207   let Constraints = "$src1 = $dst" in {
     208     let Predicates = [pred, NoSSE41_Or_OptForSize] in
     209     defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
     210                               "\t{$src2, $dst|$dst, $src2}", d, Name>;
     211   }
     212 
         // Legacy SSE scalar store.
     213   def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
     214                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
     215                      [(store RC:$src, addr:$dst)], d>,
     216                      Sched<[WriteFStore]>;
     217 
         // ".s" suffixed aliases select the reversed (MRMDestReg) encodings.
     218   def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
     219                   (!cast<Instruction>("V"#NAME#"rr_REV")
     220                    VR128:$dst, VR128:$src1, VR128:$src2), 0>;
     221   def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
     222                   (!cast<Instruction>(NAME#"rr_REV")
     223                    VR128:$dst, VR128:$src2), 0>;
     224 }
    225 
     226 // Loading from memory automatically zeroing upper bits.
         // Emits the VEX-encoded (V#NAME#rm) and legacy SSE (NAME#rm) scalar
         // load forms; mem_pat is the scalar load fragment to match.
     227 multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
     228                          PatFrag mem_pat, string OpcodeStr, Domain d> {
     229   def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
     230                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
     231                      [(set RC:$dst, (mem_pat addr:$src))], d>,
     232                      VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
     233   def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
     234                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
     235                      [(set RC:$dst, (mem_pat addr:$src))], d>,
     236                      Sched<[WriteFLoad]>;
     237 }
    238 
         // Instantiate the MOVSS (SSE1, XS prefix) and MOVSD (SSE2, XD
         // prefix) families: rr/rr_REV/mr forms via sse12_move, then the
         // rematerializable rm load forms via sse12_move_rm.
     239 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
     240                         SSEPackedSingle, "MOVSS", UseSSE1>, XS;
     241 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
     242                         SSEPackedDouble, "MOVSD", UseSSE2>, XD;
     243 
     244 let canFoldAsLoad = 1, isReMaterializable = 1 in {
     245   defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
     246                              SSEPackedSingle>, XS;
     247   defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
     248                              SSEPackedDouble>, XD;
     249 }
    250 
     251 // Patterns
     252 let Predicates = [UseAVX] in {
     253   // MOVSSrm zeros the high parts of the register; represent this
     254   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
         // NOTE(review): the 128-bit patterns below actually emit
         // COPY_TO_REGCLASS, not SUBREG_TO_REG; only the 256-bit patterns
         // further down use SUBREG_TO_REG — confirm the comment wording.
     255   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
     256             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
     257   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
     258             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
     259   def : Pat<(v4f32 (X86vzload addr:$src)),
     260             (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
     261 
     262   // MOVSDrm zeros the high parts of the register; represent this
     263   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
     264   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
     265             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
     266   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
     267             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
     268   def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
     269             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
     270   def : Pat<(v2f64 (X86vzload addr:$src)),
     271             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
     272 
     273   // Represent the same patterns above but in the form they appear for
     274   // 256-bit types
     275   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
     276                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
     277             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
     278   def : Pat<(v8f32 (X86vzload addr:$src)),
     279             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
     280   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
     281                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
     282             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
     283   def : Pat<(v4f64 (X86vzload addr:$src)),
     284             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
     285 
     286   // Extract and store.
     287   def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
     288                    addr:$dst),
     289             (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
     290 }
    291 
         // Size-optimized zero-extending moves: reuse VMOVSS/VMOVSD against a
         // zeroed register instead of larger insert/blend sequences.
     292 let Predicates = [UseAVX, OptForSize] in {
     293   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
     294   // MOVSS to the lower bits.
     295   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
     296             (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
     297   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
     298             (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
     299 
     300   // Move low f32 and clear high bits.
     301   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
     302             (SUBREG_TO_REG (i32 0),
     303              (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
     304               (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
     305   def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
     306             (SUBREG_TO_REG (i32 0),
     307              (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
     308               (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
     309 
         // Same for the low f64/i64 element via VMOVSD.
     310   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
     311             (SUBREG_TO_REG (i32 0),
     312              (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
     313                        (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
     314              sub_xmm)>;
     315   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
     316             (SUBREG_TO_REG (i32 0),
     317              (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
     318                        (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
     319              sub_xmm)>;
     320 }
    321 
         // Legacy SSE1 counterparts of the AVX patterns above.
     322 let Predicates = [UseSSE1] in {
     323   let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
     324   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
     325   // MOVSS to the lower bits.
     326   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
     327             (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
     328   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
     329             (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
     330   }
     331 
     332   // MOVSSrm already zeros the high parts of the register.
     333   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
     334             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
     335   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
     336             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
     337   def : Pat<(v4f32 (X86vzload addr:$src)),
     338             (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
     339 
     340   // Extract and store.
     341   def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
     342                    addr:$dst),
     343             (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
     344 }
    345 
         // Legacy SSE2 counterparts for the f64 zero-extending loads.
     346 let Predicates = [UseSSE2] in {
     347   // MOVSDrm already zeros the high parts of the register.
     348   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
     349             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
     350   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
     351             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
     352   def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
     353             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
     354   def : Pat<(v2f64 (X86vzload addr:$src)),
     355             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
     356 }
    357 
     358 // Aliases to help the assembler pick two byte VEX encodings by swapping the
     359 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
         // VR128L/VR128H restrict the alias to operand pairs where the swap
         // actually shortens the encoding.
     360 def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
     361                 (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
     362 def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
     363                 (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
    364 
    365 //===----------------------------------------------------------------------===//
    366 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
    367 //===----------------------------------------------------------------------===//
    368 
         /// sse12_mov_packed - packed FP load/register-move forms (MOVAPS,
         /// MOVUPS, MOVAPD, MOVUPD and their VEX variants). ld_frag decides
         /// aligned vs unaligned load matching.
     369 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
     370                             X86MemOperand x86memop, PatFrag ld_frag,
     371                             string asm, Domain d,
     372                             X86SchedWriteMoveLS sched> {
         // rr: pure register move, no pattern; marked isMoveReg so it can be
         // handled as a copy.
     373 let hasSideEffects = 0, isMoveReg = 1 in
     374   def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
     375               !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
     376            Sched<[sched.RR]>;
         // rm: rematerializable load form; canFoldAsLoad lets it be folded
         // into users.
     377 let canFoldAsLoad = 1, isReMaterializable = 1 in
     378   def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
     379               !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
     380                    [(set RC:$dst, (ld_frag addr:$src))], d>,
     381            Sched<[sched.RM]>;
     382 }
    383 
         // VEX-encoded 128-bit and 256-bit packed moves (NoVLX: EVEX forms
         // take over under AVX-512VL).
     384 let Predicates = [HasAVX, NoVLX] in {
     385 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
     386                                 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
     387                                 PS, VEX, VEX_WIG;
     388 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
     389                                 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
     390                                 PD, VEX, VEX_WIG;
     391 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
     392                                 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
     393                                 PS, VEX, VEX_WIG;
     394 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
     395                                 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
     396                                 PD, VEX, VEX_WIG;
     397 
     398 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
     399                                  SSEPackedSingle, SchedWriteFMoveLS.YMM>,
     400                                  PS, VEX, VEX_L, VEX_WIG;
     401 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
     402                                  SSEPackedDouble, SchedWriteFMoveLS.YMM>,
     403                                  PD, VEX, VEX_L, VEX_WIG;
     404 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
     405                                  SSEPackedSingle, SchedWriteFMoveLS.YMM>,
     406                                  PS, VEX, VEX_L, VEX_WIG;
     407 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", 
     408                                  SSEPackedDouble, SchedWriteFMoveLS.YMM>,
     409                                  PD, VEX, VEX_L, VEX_WIG;
     410 }
     411 
         // Legacy SSE encodings of the same packed moves.
     412 let Predicates = [UseSSE1] in {
     413 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
     414                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
     415                                PS;
     416 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
     417                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
     418                                PS;
     419 }
     420 let Predicates = [UseSSE2] in {
     421 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
     422                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
     423                                PD;
     424 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
     425                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
     426                                PD;
     427 }
    428 
         // VEX-encoded store forms (mr): aligned stores use alignedstore,
         // unaligned use plain store.
     429 let Predicates = [HasAVX, NoVLX]  in {
     430 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
     431 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     432                    "movaps\t{$src, $dst|$dst, $src}",
     433                    [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
     434                    VEX, VEX_WIG;
     435 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     436                    "movapd\t{$src, $dst|$dst, $src}",
     437                    [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
     438                    VEX, VEX_WIG;
     439 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     440                    "movups\t{$src, $dst|$dst, $src}",
     441                    [(store (v4f32 VR128:$src), addr:$dst)]>,
     442                    VEX, VEX_WIG;
     443 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     444                    "movupd\t{$src, $dst|$dst, $src}",
     445                    [(store (v2f64 VR128:$src), addr:$dst)]>,
     446                    VEX, VEX_WIG;
     447 } // SchedRW
     448 
         // 256-bit (YMM) store forms.
     449 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
     450 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
     451                    "movaps\t{$src, $dst|$dst, $src}",
     452                    [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
     453                    VEX, VEX_L, VEX_WIG;
     454 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
     455                    "movapd\t{$src, $dst|$dst, $src}",
     456                    [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
     457                    VEX, VEX_L, VEX_WIG;
     458 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
     459                    "movups\t{$src, $dst|$dst, $src}",
     460                    [(store (v8f32 VR256:$src), addr:$dst)]>,
     461                    VEX, VEX_L, VEX_WIG;
     462 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
     463                    "movupd\t{$src, $dst|$dst, $src}",
     464                    [(store (v4f64 VR256:$src), addr:$dst)]>,
     465                    VEX, VEX_L, VEX_WIG;
     466 } // SchedRW
     467 } // Predicate
    468 
     469 // For disassembler
         // Reversed (MRMDestReg, opcode 0x29/0x11) register-to-register
         // encodings. No patterns; FoldGenData ties each to its canonical
         // twin for the memory-folding tables.
     470 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
     471     isMoveReg = 1 in {
     472 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
     473   def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
     474                           (ins VR128:$src),
     475                           "movaps\t{$src, $dst|$dst, $src}", []>,
     476                           VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
     477   def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
     478                            (ins VR128:$src),
     479                            "movapd\t{$src, $dst|$dst, $src}", []>,
     480                            VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
     481   def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
     482                            (ins VR128:$src),
     483                            "movups\t{$src, $dst|$dst, $src}", []>,
     484                            VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
     485   def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
     486                            (ins VR128:$src),
     487                            "movupd\t{$src, $dst|$dst, $src}", []>,
     488                            VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
     489 } // SchedRW
     490 
     491 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
     492   def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
     493                             (ins VR256:$src),
     494                             "movaps\t{$src, $dst|$dst, $src}", []>,
     495                             VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
     496   def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
     497                             (ins VR256:$src),
     498                             "movapd\t{$src, $dst|$dst, $src}", []>,
     499                             VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
     500   def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
     501                             (ins VR256:$src),
     502                             "movups\t{$src, $dst|$dst, $src}", []>,
     503                             VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
     504   def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
     505                             (ins VR256:$src),
     506                             "movupd\t{$src, $dst|$dst, $src}", []>,
     507                             VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
     508 } // SchedRW
     509 } // Predicate
    510 
     511 // Aliases to help the assembler pick two byte VEX encodings by swapping the
     512 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
         // VR128L/H and VR256L/H limit these to register pairs where the
         // reversed encoding is actually shorter.
     513 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
     514                 (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
     515 def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
     516                 (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
     517 def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
     518                 (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
     519 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
     520                 (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
     521 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
     522                 (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
     523 def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
     524                 (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
     525 def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
     526                 (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
     527 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
     528                 (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
     529 
     530 // Reversed version with ".s" suffix for GAS compatibility.
         // The ".s" mnemonics force the reversed encodings unconditionally.
     531 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
     532                 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
     533 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
     534                 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
     535 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
     536                 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
     537 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
     538                 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
     539 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
     540                 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
     541 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
     542                 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
     543 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
     544                 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
     545 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
     546                 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
    547 
         // Legacy SSE store forms: aligned (movaps/movapd) vs unaligned
         // (movups/movupd).
     548 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
     549 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     550                    "movaps\t{$src, $dst|$dst, $src}",
     551                    [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
     552 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     553                    "movapd\t{$src, $dst|$dst, $src}",
     554                    [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
     555 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     556                    "movups\t{$src, $dst|$dst, $src}",
     557                    [(store (v4f32 VR128:$src), addr:$dst)]>;
     558 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
     559                    "movupd\t{$src, $dst|$dst, $src}",
     560                    [(store (v2f64 VR128:$src), addr:$dst)]>;
     561 } // SchedRW
    562 
// For disassembler: register-to-register forms using the store (MRMDestReg)
// opcodes 0x29/0x11. isCodeGenOnly + no patterns, hasSideEffects = 0, and
// FoldGenData ties each back to its canonical MRMSrcReg twin so the two
// encodings are treated as the same move by the unfolding tables.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}
    579 
// Reversed version with ".s" suffix for GAS compatibility: lets assembly
// writers force the MRMDestReg (_REV) encoding of the SSE moves above.
// Parse-only aliases (trailing 0).
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
    589 
let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  // Every integer element width (v4i64/v8i32/v16i16/v32i8) funnels through
  // the same VMOVAPSY/VMOVUPSY instruction; alignment picks aligned vs.
  // unaligned.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
    615 
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
// As with the AVX patterns above, all integer element widths map onto the
// single-precision FP move.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
    642 
    643 //===----------------------------------------------------------------------===//
    644 // SSE 1 & 2 - Move Low packed FP Instructions
    645 //===----------------------------------------------------------------------===//
    646 
/// sse12_mov_hilo_packed_base - MOVLP/MOVHP-style 64-bit partial loads merged
/// into an XMM register. Emits a packed-single ("s", PS) and a packed-double
/// ("d", PD) variant of opcode \p opc. Only the PD variant carries an isel
/// pattern (\p pdnode applied to a scalar_to_vector of the loaded f64).
multiclass sse12_mov_hilo_packed_base<bits<8>opc,  SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              SSEPackedDouble>, PD,
     Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}
    665 
/// sse12_mov_hilo_packed - instantiate sse12_mov_hilo_packed_base twice: a
/// VEX three-operand form (V-prefixed, UseAVX) and the legacy SSE
/// two-operand form with $src1 tied to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc,  pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}">;
}
    677 
// MOVLPS/MOVLPD (opcode 0x12): merge a 64-bit load into the low half of an
// XMM register; the PD form selects via X86Movsd.
defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
    679 
// Store forms of MOVLPS/MOVLPD (opcode 0x13): write element 0 (the low
// 64 bits) of an XMM register to memory, in AVX (VEX) and legacy encodings.
let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW
    702 
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  // i64 extract of element 0 reuses the f64 store form via bitcast.
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
}
    715 
    716 //===----------------------------------------------------------------------===//
    717 // SSE 1 & 2 - Move Hi packed FP Instructions
    718 //===----------------------------------------------------------------------===//
    719 
// MOVHPS/MOVHPD (opcode 0x16): merge a 64-bit load into the high half of an
// XMM register; the PD form selects via X86Unpckl.
defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
    721 
// Store forms of MOVHPS/MOVHPD (opcode 0x17): write the high 64 bits of an
// XMM register to memory. The patterns match "unpack high with itself, then
// extract element 0", which is how the high element is canonically lowered.
let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW
    750 
let Predicates = [UseAVX] in {
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // High-element store expressed via vpermilpd with immediate 1 (swap
  // halves) followed by extract of element 0.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}
    763 
let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}
    771 
let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // High-element store expressed via shufpd imm 1 (swap halves) followed by
  // an extract of element 0; SSE has no vpermilpd, hence X86Shufp here.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}
    786 
    787 //===----------------------------------------------------------------------===//
    788 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
    789 //===----------------------------------------------------------------------===//
    790 
// VEX three-operand MOVLHPS (0x16) / MOVHLPS (0x12) register shuffles.
// MOVHLPS is marked commutable; it is also NotMemoryFoldable since the 0x12
// opcode's memory form is MOVLPS, not a folded MOVHLPS.
let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                      NotMemoryFoldable;
}
// Legacy SSE two-operand MOVLHPS/MOVHLPS: same shuffles as the VEX forms
// above but with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}
    822 
// TODO: This is largely to trick fastisel into ignoring the pattern.
// PatFrag matching X86Unpckh only when both operands are the same node,
// i.e. a "unary" unpack-high of a register with itself.
def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2),
                          (X86Unpckh node:$src1, node:$src2), [{
  return N->getOperand(0) == N->getOperand(1);
}]>;
    828 
let Predicates = [UseSSE2] in {
  // TODO: This is a hack pattern to allow lowering to emit unpckh instead of
  // movhlps for sse2 without changing a bunch of tests.
  def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)),
            (MOVHLPSrr VR128:$src, VR128:$src)>;
}
    835 
    836 //===----------------------------------------------------------------------===//
    837 // SSE 1 & 2 - Conversion Instructions
    838 //===----------------------------------------------------------------------===//
    839 
/// sse12_cvt_s - scalar FP<->int conversion instructions. Emits a register
/// form (rr) applying \p OpNode directly and a memory form (rm) that folds a
/// load via \p ld_frag; \p sched.Folded schedules the folded-load variant.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, X86FoldableSchedWrite sched> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
                        Sched<[sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
                        Sched<[sched.Folded]>;
}
    850 
/// sse12_cvt_p - packed int->FP conversion instructions: sint_to_fp from
/// \p SrcTy to \p DstTy, register form plus a load-folding memory form
/// (the loaded value is bitconverted to SrcTy before the convert).
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp
                                    (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
             Sched<[sched.Folded]>;
}
}
    865 
/// sse12_vcvt_avx - AVX three-operand scalar converts. The extra DstRC:$src1
/// operand supplies the upper elements of the destination; no isel patterns
/// here (selection is done via separate Pat<>s with IMPLICIT_DEF for $src1).
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm,
                          X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
}
    880 
// AVX truncating scalar FP->int converts (fp_to_sint), 32- and 64-bit
// destinations, plus AT&T-syntax "l"/"q" aliases that make the operand size
// explicit for both register and memory forms.
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
    916 // The assembler can recognize rr 64-bit instructions by seeing a rxx
    917 // register, but the same isn't true when only using memory operands,
    918 // provide other assembly "l" and "q" forms to address this explicitly
    919 // where appropriate to do so.
// AVX scalar int->fp converts. The {l}/{q} mnemonic suffixes disambiguate
// the memory forms (the register forms are disambiguated by GR32 vs. GR64).
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
    928 
let Predicates = [UseAVX] in {
  // Unsuffixed AT&T aliases default to the 32-bit memory form.
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;

  // Select the three-operand VEX converts for plain sint_to_fp. The tied
  // $src1 (upper-element pass-through) is irrelevant for the scalar result,
  // so it is filled with IMPLICIT_DEF.
  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
    953 
// Legacy SSE scalar converts: truncating fp->int (0x2C) and int->fp (0x2A).
// REX_W selects the 64-bit GPR variants.
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SS>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SS>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SD>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      WriteCvtI2SD>, XD, REX_W;
    978 
// AT&T-syntax size-suffix aliases for the legacy SSE converts above; the
// final pair maps the unsuffixed cvtsi2ss/sd memory forms to 32-bit.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
   1000 
   1001 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
   1002 // and/or XMM operand(s).
   1003 
   1004 // FIXME: We probably want to match the rm form only when optimizing for
   1005 // size, to avoid false depenendecies (see sse_fp_unop_s for details)
/// sse12_cvt_sint - intrinsic-form scalar converts: the pattern applies the
/// intrinsic \p Int directly; the memory form matches through the
/// \p mem_cpat complex pattern (e.g. sse_load_f32/f64).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                          string asm, X86FoldableSchedWrite sched> {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (Int SrcRC:$src))]>,
               Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (Int mem_cpat:$src))]>,
               Sched<[sched.Folded]>;
}
   1018 
/// sse12_cvt_sint_3addr - intrinsic-form converts with a pass-through
/// DstRC:$src1 operand. No isel patterns (hasSideEffects = 0); Is2Addr picks
/// two-operand (SSE, $src1 tied to $dst) vs. three-operand (VEX) asm syntax.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, X86MemOperand x86memop,
                    string asm, X86FoldableSchedWrite sched,
                    bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched.Folded, ReadAfterLd]>;
}
}
   1038 
// cvtsd2si (rounding, non-truncating) intrinsic forms, AVX and legacy SSE,
// for 32- and 64-bit GPR destinations.
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  WriteCvtSD2I>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
   1051 
   1052 
// isCodeGenOnly int->fp converts that keep the whole XMM register as a
// pass-through operand (via sse12_cvt_sint_3addr). AVX variants are
// three-operand (Is2Addr = 0); SSE variants tie $src1 to $dst.
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
  defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
  defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
  defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
    defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
    defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
    defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
    defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
  }
} // isCodeGenOnly = 1
   1075 
   1076 /// SSE 1 Only
   1077 
   1078 // Aliases for intrinsics
   1079 let isCodeGenOnly = 1 in {
   1080 let Predicates = [UseAVX] in {
   1081 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
   1082                                 ssmem, sse_load_f32, "cvttss2si",
   1083                                 WriteCvtSS2I>, XS, VEX;
   1084 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
   1085                                int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
   1086                                "cvttss2si", WriteCvtSS2I>,
   1087                                XS, VEX, VEX_W;
   1088 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
   1089                                 sdmem, sse_load_f64, "cvttsd2si",
   1090                                 WriteCvtSS2I>, XD, VEX;
   1091 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
   1092                               int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
   1093                               "cvttsd2si", WriteCvtSS2I>,
   1094                               XD, VEX, VEX_W;
   1095 }
   1096 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
   1097                                     ssmem, sse_load_f32, "cvttss2si",
   1098                                     WriteCvtSS2I>, XS;
   1099 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
   1100                                    int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
   1101                                    "cvttss2si", WriteCvtSS2I>, XS, REX_W;
   1102 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
   1103                                     sdmem, sse_load_f64, "cvttsd2si",
   1104                                     WriteCvtSD2I>, XD;
   1105 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
   1106                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
   1107                                   "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
   1108 } // isCodeGenOnly = 1
   1109 
// cvtss2si: convert scalar single FP to signed integer, rounding per MXCSR
// (contrast with the truncating cvttss2si forms above).  VEX-encoded AVX
// forms first, then the legacy SSE forms.
let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I>, XS, REX_W;
   1124 
// cvtdq2ps: convert packed signed dword integers to packed single FP.
// AVX 128-bit and 256-bit forms (guarded by NoVLX so the AVX-512VL versions
// take precedence when available).
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PS>,
                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PSY>,
                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

// Legacy form.  Despite appearing in the "SSE 1 Only" section, this
// instruction requires SSE2 (see Requires<[UseSSE2]>).
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
   1138 
// AT&T-syntax aliases: accept the size-suffixed mnemonics
// (vcvtss2si{l,q}, vcvtsd2si{l,q}) and map them onto the intrinsic
// register/memory forms defined above.  AVX (VEX) variants.
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
}
   1157 
// Same AT&T-syntax size-suffixed aliases for the legacy SSE encodings.
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
   1174 
   1175 /// SSE 2 Only
   1176 
   1177 // Convert scalar double to scalar single
// Scalar double -> scalar single (cvtsd2ss), FR register forms.
// The AVX forms have empty patterns (selection is done via the Pat below);
// hasSideEffects = 0 so the scheduler can treat them as pure.
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR32:$src1, FR64:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        VEX_4V, VEX_LIG, VEX_WIG,
                        Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     (ins FR32:$src1, f64mem:$src2),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG,
                     Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}

// Select fpround via the AVX rr form; the unused first source (the upper
// bits pass-through operand) is left undefined.
def : Pat<(f32 (fpround FR64:$src)),
            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

// Legacy SSE forms carry the fpround patterns directly.  The memory form is
// restricted to OptForSize (see Requires below).
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fpround FR64:$src))]>,
                      Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
                    [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                    XD, Requires<[UseSSE2, OptForSize]>,
                    Sched<[WriteCvtSD2SS.Folded]>;
   1205 
// Intrinsic (VR128) forms of cvtsd2ss, selected from int_x86_sse2_cvtsd2ss.
// isCodeGenOnly: assembler-visible forms are the FR32/FR64 defs above.
let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))]>,
                       XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
// SSE forms are destructive two-address: $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))]>,
                       XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
   1237 
   1238 // Convert scalar single to scalar double
   1239 // SSE2 instructions with XS prefix
// Scalar single -> scalar double (vcvtss2sd), AVX FR register forms with
// empty patterns; fpextend/extloadf32 are selected via the Pats below.
let hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>,
                    Requires<[UseAVX, OptForSize]>;
}

// fpextend selection: the pass-through first operand is undefined.
def : Pat<(f64 (fpextend FR32:$src)),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
// Fold the load into the conversion only when optimizing for size ...
def : Pat<(fpextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
// ... otherwise (OptForSpeed) do a separate scalar load then a
// register-register conversion.
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;
   1266 
// Legacy SSE scalar single -> double forms carrying patterns directly.
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>;

// extload f32 -> f64.  This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
def : Pat<(fpextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
// OptForSpeed: separate scalar load + reg-reg conversion instead of folding.
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
   1286 
// Intrinsic (VR128) forms of cvtss2sd.  No patterns here: selection happens
// through the X86Movsd patterns in the Predicates blocks below.
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_WIG,
                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
// SSE2 instructions with XS prefix; destructive two-address forms.
let Constraints = "$src1 = $dst" in {
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
   1313 
   1314 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
   1315 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
   1316 // vmovs{s,d} instructions
// Each pattern below matches "convert scalar, then blend the low element
// into $dst via movss/movsd" and selects the corresponding *_Int
// instruction, which performs the merge itself.
let Predicates = [UseAVX] in {
// cvtsd2ss: low f64 of $src rounded to f32, merged into low lane of $dst.
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

// cvtss2sd: low f32 of $src extended to f64, merged into low lane of $dst.
def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

// cvtsi2ss variants: i64/i32, register and folded-load sources.
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

// cvtsi2sd variants: i64/i32, register and folded-load sources.
def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]
   1370 
// Same merge-into-low-lane patterns for the legacy SSE2 (two-address)
// intrinsic forms.
let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]
   1404 
// cvtsi2ss merge patterns for the legacy SSE1 intrinsic forms (cvtsi2ss is
// an SSE1 instruction, unlike the SSE2 cvtsi2sd patterns above).
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]
   1426 
let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
// (rounding per MXCSR, selected from X86cvtp2Int).
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
// 256-bit forms.
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
// Legacy SSE2 forms (memory operand must be 16B-aligned: memopv4f32).
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>;
   1458 
   1459 
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
// The memory form prints an explicit "x" suffix to disambiguate the
// 128-bit source from the 256-bit one.
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
}

// Legacy SSE2 forms.
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>;
   1509 
   1510 // Convert with truncation packed single/double fp to doubleword
   1511 // SSE2 packed instructions with XS prefix
// cvttps2dq: truncating packed single -> dword conversion (X86cvttp2si).
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

// Generic fp_to_sint also lowers to the truncating forms.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;
  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

// Legacy SSE2 forms (cvttps2dq is SSE2 despite operating on singles).
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
   1564 
// cvttpd2dq: truncating packed double -> dword conversion (result uses only
// the low half of the xmm destination for 128-bit sources).
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;

// Memory form prints an explicit "x" suffix to disambiguate from the
// 256-bit source form below.
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;

// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;

// fp_to_sint on the 256-bit source also selects the truncating forms.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}
   1613 
// The 128-bit (v)cvt(t)pd2dq writes zeros to the upper half of the
// destination, so a following X86vzmovl (zero upper elements) is redundant
// and can be folded into the conversion itself.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
            (VCVTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
            (VCVTPD2DQrm addr:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
            (VCVTTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
            (VCVTTPD2DQrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// Legacy SSE2 truncating forms.
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>;

// Same vzmovl-folding patterns for the legacy SSE2 encodings.
let Predicates = [UseSSE2] in {
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
            (CVTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
            (CVTPD2DQrm addr:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
            (CVTTPD2DQrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2i64 (bitconvert
                               (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
            (CVTTPD2DQrm addr:$src)>;
} // Predicates = [UseSSE2]
   1654 
   1655 // Convert packed single to packed double
// cvtps2pd: widen packed singles to doubles.  128-bit form converts the low
// two singles (X86vfpext); 256-bit form converts all four (fpextend).
// SSE2 instructions without OpSize prefix (PS encoding).
let Predicates = [HasAVX, NoVLX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

// Legacy SSE2 forms.
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}
   1686 
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX, NoVLX] in {
// The `let` below (no braces) applies only to the immediately following
// def, i.e. VCVTDQ2PDrm.
// NOTE(review): the instruction operand is i64mem (a 64-bit load — only
// two dwords are consumed), but the pattern uses the 128-bit loadv2i64
// fragment; the narrower 64-bit load cases are matched by the explicit
// patterns further below. Verify this pairing is intentional.
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
// Register form: low two i32 elements converted to two f64.
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
// 256-bit form: four i32 widened to four f64; uses the generic sint_to_fp
// node since all input elements are consumed.
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                         VEX_WIG;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

// Legacy SSE2 encodings; the `let` applies only to CVTDQ2PDrm.
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
                       Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
                       Sched<[WriteCvtI2PD]>;
   1724 
// AVX register conversion intrinsics.
// Match the genuinely 64-bit load shapes — a scalar i64 load inserted into
// a vector, or a zero-extending vector load — to the rm forms, whose
// memory operand is i64mem (see the defs above).
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTDQ2PDrm addr:$src)>;
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics — same patterns for the legacy
// encodings.
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (CVTDQ2PDrm addr:$src)>;
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
   1740 
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
// The single-def `let` predicates below intentionally exclude the
// InstAliases, which are pure assembler aliases.
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;

// XMM only
// "vcvtpd2psx" is an accepted spelling that disambiguates the operand size;
// the trailing 0 marks the alias as parse-only (never used for printing).
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
// Intel-syntax-only alias for the memory form.
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;

// YMM only
let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (fpround VR256:$src))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
}
// "vcvtpd2psy" parse-only aliases for the 256-bit forms.
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;

// Legacy SSE2 encodings; memory form uses the non-AVX memopv2f64 fragment.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                     Sched<[WriteCvtPD2PS]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
                     Sched<[WriteCvtPD2PS.Folded]>;
   1786 
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.

// Fold an explicit zeroing of the upper 64 bits (X86vzmovl of the
// bitconverted v4f32 result) into the cvtpd2ps instructions.
let Predicates = [HasAVX, NoVLX] in {
  // Match fpround and fpextend for 128/256-bit conversions
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
            (VCVTPD2PSrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
            (VCVTPD2PSrm addr:$src)>;
}

// Same folds for the legacy SSE2 encodings (memopv2f64 load fragment).
let Predicates = [UseSSE2] in {
  // Match fpround and fpextend for 128 conversions
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(X86vzmovl (v2f64 (bitconvert
                               (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
            (CVTPD2PSrm addr:$src)>;
}
   1810 
   1811 //===----------------------------------------------------------------------===//
   1812 // SSE 1 & 2 - Compare Instructions
   1813 //===----------------------------------------------------------------------===//
   1814 
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
//   RC/x86memop - register class and memory operand for the scalar type.
//   CC          - condition-code operand class (e.g. AVXCC/SSECC); the code
//                 is folded into the mnemonic by `asm` (cmp${cc}ss form).
//   OpNode/VT   - the selection-DAG node and value type matched by the
//                 patterns.
//   asm_alt     - alternate syntax taking the condition code as an explicit
//                 trailing immediate; emitted parser-only with no patterns.
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            X86FoldableSchedWrite sched> {
  // Register-register compare; commutable so the folder may swap operands.
  let isCommutable = 1 in
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
                Sched<[sched]>;
  // Register-memory compare: second operand loaded via ld_frag.
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))]>,
                Sched<[sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
                      Sched<[sched]>, NotMemoryFoldable;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
                      Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable;
  }
}
   1842 
// AVX-encoded scalar compares: three-operand form (separate destination),
// condition code from the AVXCC operand class.
let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl>,
                 XD, VEX_4V, VEX_LIG, VEX_WIG;

// Legacy SSE encodings: two-operand form with the destination tied to the
// first source, condition code from the SSECC operand class.
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PS.Scl>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PD.Scl>, XD;
}
   1867 
// sse12_cmp_scalar_int - intrinsic (whole-XMM) version of the scalar
// compares: operates on VR128 and selects via the given Intrinsic rather
// than the X86cmps node. mem_cpat is a ComplexPattern used to match the
// intrinsic's memory operand.
multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
                         Intrinsic Int, string asm, X86FoldableSchedWrite sched,
                         ComplexPattern mem_cpat> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))]>,
           Sched<[sched]>;
// Applies only to rm_Int below.
let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               mem_cpat:$src, imm:$cc))]>,
           Sched<[sched.Folded, ReadAfterLd]>;
}
   1883 
// Intrinsic forms of the scalar compares. isCodeGenOnly: these exist only
// for instruction selection of the cmp_ss/cmp_sd intrinsics and are never
// produced by or offered to the assembler.
let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  let ExeDomain = SSEPackedSingle in
  defm VCMPSS  : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VCMPSD  : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                       XD, VEX_4V;
  // Legacy SSE encodings tie the destination to the first source.
  let Constraints = "$src1 = $dst" in {
    let ExeDomain = SSEPackedSingle in
    defm CMPSS  : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                         SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
    let ExeDomain = SSEPackedDouble in
    defm CMPSD  : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                         SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
}
   1906 
   1907 
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// No register results (empty outs); the only output is EFLAGS, which the
// users of this multiclass add via `Defs = [EFLAGS]`.
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr,
                         X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>;
// Applies only to the rm def below.
let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, ReadAfterLd]>;
}
}
   1926 
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
// Same shape as sse12_ord_cmp, but the memory operand is matched through a
// ComplexPattern (mem_cpat) so the scalar-load intrinsics select here.
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             ComplexPattern mem_cpat, string OpcodeStr,
                             X86FoldableSchedWrite sched> {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>;
// Applies only to rm_Int below.
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           mem_cpat:$src2))]>,
          Sched<[sched.Folded, ReadAfterLd]>;
}
   1943 
// (U)COMISS/(U)COMISD: scalar ordered/unordered compares writing EFLAGS.
// The COMIS* variants are instantiated with `Pattern = []<dag>` and an
// `undef` node because only the UCOMIS* forms are selected from X86cmp;
// COMIS* register forms exist for assembly and the intrinsic versions.
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                               "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                               "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  }

  // Whole-XMM intrinsic forms (X86ucomi/X86comi); codegen-only.
  let isCodeGenOnly = 1 in {
    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;

    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                       sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                       sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
  }
  // Legacy SSE encodings of the same compares.
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", WriteFCom>, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", WriteFCom>, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss", WriteFCom>, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd", WriteFCom>, PD;
  }

  let isCodeGenOnly = 1 in {
    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                            sse_load_f32, "ucomiss", WriteFCom>, PS;
    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                            sse_load_f64, "ucomisd", WriteFCom>, PD;

    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                                sse_load_f32, "comiss", WriteFCom>, PS;
    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                                    sse_load_f64, "comisd", WriteFCom>, PD;
  }
} // Defs = [EFLAGS]
   1991 
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
// Same structure as sse12_cmp_scalar, but over packed vectors (X86cmpp)
// with an execution Domain, plus parser-only *_alt forms taking the
// condition code as an explicit immediate.
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC,  ValueType VT, string asm,
                            string asm_alt, X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  // Register-register form; commutable so the two-address/fold passes may
  // swap operands.
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
            Sched<[sched]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst,
               (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
            Sched<[sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
               asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
    let mayLoad = 1 in
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
               asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>,
               NotMemoryFoldable;
  }
}
   2020 
// AVX packed compares: 128- and 256-bit, three-operand VEX forms.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE encodings: destination tied to first source, alignment-checked
// memop* load fragments.
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
   2047 
// Matches compare immediates whose predicate is symmetric, so the operands
// may be swapped freely. Low three bits 0/3/4/7 correspond to the EQ,
// UNORD, NEQ and ORD predicates of cmpps/cmppd (see Intel SDM, CMPPD).
def CommutableCMPCC : PatLeaf<(imm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
   2052 
// Patterns to select compares with loads in first operand.
// The hardware forms only fold a load into the SECOND operand; when the
// condition is symmetric (CommutableCMPCC) the operands can be swapped so
// the load still folds.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}

// SSE2 (f64/v2f64) versions of the swaps above.
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
}

// SSE1 (f32/v4f32) versions.
let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}
   2099 
   2100 //===----------------------------------------------------------------------===//
   2101 // SSE 1 & 2 - Shuffle Instructions
   2102 //===----------------------------------------------------------------------===//
   2103 
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
/// Instantiates the register (rri) and memory (rmi) forms of SHUFPS/SHUFPD;
/// the 8-bit immediate ($src3) selects the shuffle lanes (X86Shufp node).
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], d>,
            Sched<[sched.Folded, ReadAfterLd]>;
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                     (i8 imm:$src3))))], d>,
            Sched<[sched]>;
}
   2119 
// AVX shuffles: three-operand VEX forms, 128- and 256-bit.
let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_L, VEX_WIG;
}
// Legacy SSE encodings: destination tied to first source, memop* loads.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
}
   2146 
   2147 //===----------------------------------------------------------------------===//
   2148 // SSE 1 & 2 - Unpack FP Instructions
   2149 //===----------------------------------------------------------------------===//
   2150 
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
/// Instantiates the register (rr) and memory (rm) forms of UNPCK[LH]P[SD]
/// around the given OpNode (X86Unpckl/X86Unpckh). IsCommutable propagates
/// to the rr form's isCommutable flag (defaults off).
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
    let isCommutable = IsCommutable in
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))], d>,
             Sched<[sched.Folded, ReadAfterLd]>;
}
   2170 
// AVX encodings of UNPCK[H|L]P[S|D]: VEX three-operand, non-destructive forms
// for 128-bit (XMM) and 256-bit (YMM) vectors. Guarded by NoVLX, presumably so
// the EVEX-encoded AVX-512VL forms take priority when available -- confirm
// against the AVX-512 .td definitions.
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
// NOTE(review): the extra trailing '1' argument below is a flag parameter of
// sse12_unpack_interleave whose definition is not in view -- verify its meaning.
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

// 256-bit (YMM) variants: same opcodes with the VEX.L bit set (VEX_L).
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]
   2198 
// Legacy SSE encodings of UNPCK[H|L]P[S|D]: destructive two-operand forms, so
// $src1 is tied to $dst, and the two-operand asm syntax is used. These use
// memop* PatFrags rather than the load* ones used by the AVX defs above.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  // NOTE(review): trailing '1' flag mirrors the VUNPCKHPD def above; the
  // sse12_unpack_interleave parameter it sets is not visible here -- verify.
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
   2213 
// AVX1 has no 256-bit integer unpack instructions, so when only AVX1 is
// available select the 256-bit FP unpack instructions for the integer
// X86Unpckl/X86Unpckh nodes (v8i32 via PS forms, v4i64 via PD forms).
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
   2233 
   2234 //===----------------------------------------------------------------------===//
   2235 // SSE 1 & 2 - Extract Floating-Point Sign mask
   2236 //===----------------------------------------------------------------------===//
   2237 
/// sse12_extr_sign_mask - SSE 1 & 2 extract floating-point sign mask.
/// (Header previously said "unpack and interleave" -- stale copy-paste.)
/// Defines the register form of MOVMSKP[S|D]: extracts the sign bits of the
/// packed FP elements in RC:$src into a GR32/GR64 destination (X86movmsk).
/// Register-only: there is no memory form of this instruction.
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
              Sched<[WriteFMOVMSK]>;
}
   2246 
// VEX-encoded MOVMSKP[S|D] for XMM and YMM sources (YMM forms carry VEX_L).
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
}

// Legacy SSE1/SSE2 encodings (XMM only).
defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;
   2262 
   2263 //===---------------------------------------------------------------------===//
   2264 // SSE2 - Packed Integer Logical Instructions
   2265 //===---------------------------------------------------------------------===//
   2266 
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
/// Emits the register-register (rr) and register-memory (rm) forms of one
/// packed-integer binop.
///   opc          - opcode byte
///   OpNode       - SDNode matched by the patterns
///   OpVT         - vector type of operands and result
///   memop_frag   - PatFrag for the memory operand (bitconverted to OpVT)
///   sched        - scheduling class; the rm form uses sched.Folded
///   IsCommutable - forwarded to the rr form's isCommutable flag
///   Is2Addr      - selects destructive 2-operand SSE asm syntax vs. the
///                  3-operand AVX syntax
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
   2292 
/// PDI_binop_all - Instantiates all three encodings of a packed-integer binop:
///   V<NAME>   - VEX 128-bit form  (HasAVX + prd, 3-operand, "v" mnemonic)
///   <NAME>    - legacy SSE2 form  (destructive, $src1 tied to $dst)
///   V<NAME>Y  - VEX 256-bit form  (HasAVX2 + prd, VEX_L)
/// prd is an extra predicate ANDed into the AVX/AVX2 guards (e.g. NoVLX so
/// EVEX forms can take priority).
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, loadv2i64, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}
   2311 
// These are ordered here for pattern ordering requirements with the fp versions

// Packed-integer bitwise logic. The element type is nominally v2i64/v4i64;
// bitwise ops are element-size agnostic. PANDN (x & ~y) is not commutative.
defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;
   2322 
   2323 //===----------------------------------------------------------------------===//
   2324 // SSE 1 & 2 - Logical Instructions
   2325 //===----------------------------------------------------------------------===//
   2326 
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
/// Instantiates the AVX 128/256-bit PS/PD forms (empty pattern lists, [], [])
/// and the legacy tied-operand SSE PS/PD forms. The OpNode parameter is
/// currently unused by the visible body (patterns are supplied separately).
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
       [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
       [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
         [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
         [], []>, PD;
  }
}
   2361 
// ANDPS/ORPS/XORPS/ANDNPS families (plus PD and AVX variants via the
// multiclass above). ANDN computes ~src1 & src2 and is not commutative.
defm AND  : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
// (AVX1 has no 256-bit VPAND/VPOR/VPXOR/VPANDN; use the PS forms instead.)
let Predicates = [HasAVX1Only] in {
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
   2389 
// Scalar f32/f64 logic ops (X86fand/for/fxor/fandn) have no scalar
// instruction; implement them with the packed VEX instructions by moving the
// FR32/FR64 values into VR128 (COPY_TO_REGCLASS), operating on the whole
// vector, and copying the low element back out.
let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
  // Use packed logical operations for scalar ops.
  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                               (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;

  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                               (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
}
   2434 
// Legacy SSE counterparts of the scalar-via-packed logic patterns above:
// SSE1 covers f32 via *PS, SSE2 covers f64 via *PD.
let Predicates = [UseSSE1] in {
  // Use packed logical operations for scalar ops.
  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                            (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
            (COPY_TO_REGCLASS
             (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
             FR32)>;
}

let Predicates = [UseSSE2] in {
  // Use packed logical operations for scalar ops.
  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                            (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
            (COPY_TO_REGCLASS
             (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
             FR64)>;
}

// Patterns for packed operations when we don't have integer type available.
// These are unpredicated, matching the v4f32 FP-logic nodes directly onto the
// SSE1 *PS instructions (register and memory forms).
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;
   2501 
   2502 //===----------------------------------------------------------------------===//
   2503 // SSE 1 & 2 - Arithmetic Instructions
   2504 //===----------------------------------------------------------------------===//
   2505 
   2506 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
   2507 /// vector forms.
   2508 ///
   2509 /// In addition, we also have a special variant of the scalar form here to
   2510 /// represent the associated intrinsic operation.  This form is unlike the
   2511 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
   2512 /// and leaves the top elements unmodified (therefore these cannot be commuted).
   2513 ///
   2514 /// These three forms can each be reg+reg or reg+mem.
   2515 ///
   2516 
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
/// basic_sse12_fp_binop_p - Packed forms of a basic SSE 1 & 2 binop:
/// AVX 128-bit PS/PD, AVX 256-bit PSY/PDY (all 3-operand, NoVLX-guarded),
/// plus the legacy SSE PS/PD forms with $src1 tied to $dst.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, X86SchedWriteSizes sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>, PD;
  }
}
   2546 
/// basic_sse12_fp_binop_s - Scalar (SS/SD) forms of a basic SSE 1 & 2 binop
/// operating on FR32/FR64: AVX 3-operand variants plus the legacy tied
/// 2-operand SSE variants.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  X86SchedWriteSizes sched> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                         XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              sched.PD.Scl>, XD;
  }
}
   2565 
/// basic_sse12_fp_binop_s_int - Intrinsic (whole-VR128) scalar forms: these
/// take full v4f32/v2f64 vectors and use the sse_load_f32/f64 complex
/// patterns for the memory operand. AVX 3-operand and legacy tied variants.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl>, XD;
  }
}
   2585 
// Binary Arithmetic instructions
// ADD/MUL are commutative; SUB/DIV/MAX/MIN are defined under isCommutable = 0.
// The _s_int variants use null_frag for ADD/MUL/SUB/DIV (no pattern attached
// here -- presumably matched elsewhere), while MAX/MIN use X86fmaxs/X86fmins.
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

// Codegen-only MAXC/MINC: same opcodes as MAX/MIN but matching the
// commutative X86fmaxc/X86fminc nodes.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}
   2614 
   2615 // Patterns used to select SSE scalar fp arithmetic instructions from
   2616 // either:
   2617 //
   2618 // (1) a scalar fp operation followed by a blend
   2619 //
   2620 // The effect is that the backend no longer emits unnecessary vector
   2621 // insert instructions immediately after SSE scalar fp instructions
   2622 // like addss or mulss.
   2623 //
   2624 // For example, given the following code:
   2625 //   __m128 foo(__m128 A, __m128 B) {
   2626 //     A[0] += B[0];
   2627 //     return A;
   2628 //   }
   2629 //
   2630 // Previously we generated:
   2631 //   addss %xmm0, %xmm1
   2632 //   movss %xmm1, %xmm0
   2633 //
   2634 // We now generate:
   2635 //   addss %xmm1, %xmm0
   2636 //
   2637 // (2) a vector packed single/double fp operation followed by a vector insert
   2638 //
   2639 // The effect is that the backend converts the packed fp instruction
   2640 // followed by a vector insert into a single SSE scalar fp instruction.
   2641 //
   2642 // For example, given the following code:
   2643 //   __m128 foo(__m128 A, __m128 B) {
   2644 //     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
   2646 //   }
   2647 //
   2648 // Previously we generated:
   2649 //   addps %xmm0, %xmm1
   2650 //   movss %xmm1, %xmm0
   2651 //
   2652 // We now generate:
   2653 //   addss %xmm1, %xmm0
   2654 
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
/// scalar_math_patterns - Matches "scalar op on element 0 followed by a
/// movss/movsd insert back into the vector" onto the intrinsic (rr_Int) form
/// of the scalar instruction named by OpcPrefix. Emitted once under the SSE
/// BasePredicate and once under UseAVX with the "V"-prefixed instruction.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                    ValueType VT, ValueType EltTy,
                                    RegisterClass RC, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  }
}
   2681 
// Instantiate the op+insert patterns for the four basic scalar FP binops,
// f32 (movss/SSE1) and f64 (movsd/SSE2) flavors.
defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;

defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
   2691  
   2692 /// Unop Arithmetic
   2693 /// In addition, we also have a special variant of the scalar form here to
   2694 /// represent the associated intrinsic operation.  This form is unlike the
   2695 /// plain scalar form, in that it takes an entire vector (instead of a
   2696 /// scalar) and leaves the top elements undefined.
   2697 ///
   2698 /// And, we have a special variant form for a full-vector intrinsic form.
   2699 
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
/// Emits:
///   r / m        - plain scalar forms on RC; the load-folding m form is
///                  gated on OptForSize (folding creates a partial-register
///                  update -- see the comment in sse_fp_unop_s_intr below)
///   r_Int/m_Int  - codegen-only whole-VR128 forms with empty pattern lists;
///                  patterns are attached separately by sse_fp_unop_s_intr
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
            Requires<[target]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded, ReadAfterLd]>,
            Requires<[target, OptForSize]>;

  let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched.Folded, ReadAfterLd]>;
  }
  }

}
   2731 
/// sse_fp_unop_s_intr - Attaches intrinsic patterns to the r_Int/m_Int
/// instructions emitted by sse_fp_unop_s (looked up via NAME, so this must be
/// inherited alongside it under the same defm name).
multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
                              ComplexPattern int_cpat, Intrinsic Intr,
                              Predicate target, string Suffix> {
  let Predicates = [target] in {
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }
  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr int_cpat:$src2),
               (!cast<Instruction>(NAME#m_Int)
                      (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}
   2754 
/// avx_fp_unop_s_intr - AVX counterpart of sse_fp_unop_s_intr: attaches
/// intrinsic patterns to NAME#r_Int / NAME#m_Int; the m_Int (load-folding)
/// pattern is likewise restricted to OptForSize.
multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>(NAME#r_Int) VR128:$src,
                                 VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr int_cpat:$src2),
              (!cast<Instruction>(NAME#m_Int)
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}
   2768 
// avx_fp_unop_s - AVX scalar unop instructions plus selection patterns.
// Defines reg/reg and reg/mem forms on RC, and isCodeGenOnly _Int forms on
// VR128 (used by the *_intr pattern classes above). All forms carry an extra
// first source operand $src1 in addition to the operand being operated on.
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched.Folded, ReadAfterLd]>;
  // Intrinsic forms operate on full VR128 values and are not exposed to the
  // assembler/disassembler (isCodeGenOnly).
  let isCodeGenOnly = 1, ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, intmemop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched.Folded, ReadAfterLd]>;
  }
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
            addr:$src)>;
  }
}
   2813 
/// sse1_fp_unop_p - SSE1 unops in packed form.
/// Emits the AVX "v"-prefixed XMM/YMM variants under the 'prds' predicate
/// list, followed by the legacy SSE1 variants.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
  // AVX 128-bit and 256-bit forms; memory patterns use loadv4f32/loadv8f32.
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  // Legacy SSE1 forms; the memory pattern uses memopv4f32 rather than
  // loadv4f32 as in the AVX forms above.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}
   2849 
/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Double-precision analogue of sse1_fp_unop_p; the AVX variants are gated on
/// [HasAVX, NoVLX] rather than a predicate-list parameter.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
  // AVX 128-bit and 256-bit forms; memory patterns use loadv2f64/loadv4f64.
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  // Legacy SSE2 forms; the memory pattern uses memopv2f64.
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}
   2885 
// sse1_fp_unop_s_intr - Instantiate the scalar-single intrinsic patterns for
// both the SSE (UseSSE1) and AVX (AVXTarget) instruction variants. The
// intrinsic is looked up by name from OpcodeStr ("int_x86_sse_<op>_ss").
// Note: the opc, OpNode and sched parameters are not referenced in this body.
multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
                      UseSSE1, "SS">, XS;
  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
                      AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
   2896 
// sse1_fp_unop_s - Scalar-single (ss) unop instructions: legacy SSE form via
// sse_fp_unop_s and the "v"-prefixed AVX form via avx_fp_unop_s.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                       XS, VEX_4V, VEX_LIG, VEX_WIG;
}
   2905 
// sse2_fp_unop_s - Scalar-double (sd) unop instructions; double-precision
// analogue of sse1_fp_unop_s.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;
}
   2914 
// Square root: SQRTSS/SQRTPS (SSE1) and SQRTSD/SQRTPD (SSE2), opcode 0x51.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
// RSQRT (0x52) and RCP (0x53) are single precision only; each gets both the
// DAG-node (sse1_fp_unop_s/p) and intrinsic (sse1_fp_unop_s_intr) patterns.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;

// There is no f64 version of the reciprocal approximation instructions.
   2931 
// scalar_unary_math_patterns - Select "move scalar result into low element"
// (Move dst, (scalar_to_vector (OpNode (extractelt src, 0)))) onto the
// corresponding r_Int instruction, for both SSE and AVX ("V"-prefixed) forms.
multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}
   2947 
// scalar_unary_math_imm_patterns - Same as scalar_unary_math_patterns, but
// for instructions taking an extra immediate operand, supplied as ImmV.
multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
                                          ValueType VT, bits<8> ImmV,
                                          Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
  }
}
   2964 
// Apply the scalar-math patterns to fsqrt for SS (SSE1) and SD (SSE2).
defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
   2967 
// scalar_unary_math_intr_patterns - Variant of scalar_unary_math_patterns
// that matches an intrinsic node instead of a DAG OpNode. The AVX patterns
// are gated on HasAVX here (cf. UseAVX in scalar_unary_math_patterns).
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}
   2982 
// Apply the intrinsic scalar-math patterns to the rcp/rsqrt ss intrinsics.
defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                       v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                       v4f32, UseSSE1>;
   2987 
   2988 
   2989 //===----------------------------------------------------------------------===//
   2990 // SSE 1 & 2 - Non-temporal stores
   2991 //===----------------------------------------------------------------------===//
   2992 
let AddedComplexity = 400 in { // Prefer non-temporal versions
// AVX non-temporal aligned stores: VMOVNTPS/VMOVNTPD (FP domain) and
// VMOVNTDQ (integer domain), in 128-bit and 256-bit forms.
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
} // SchedRW

let ExeDomain = SSEPackedInt in {
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins i128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)]>, VEX, VEX_WIG,
                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins i256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
} // ExeDomain
} // Predicates
   3036 
// Legacy SSE non-temporal aligned stores (128-bit only).
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// NOTE(review): this operand class is f128mem while the VEX form
// (VMOVNTDQmr) uses i128mem — presumably historical; confirm before changing.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
   3050 
// GPR non-temporal stores (MOVNTI). Note the unaligned 'nontemporalstore'
// pattern here, unlike the vector forms above which require alignment.
let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]
   3062 
// Select aligned non-temporal stores of the other integer element widths
// onto the v2i64/v4i64-typed MOVNTDQ instructions defined above.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity
   3089 
   3090 //===----------------------------------------------------------------------===//
   3091 // SSE 1 & 2 - Prefetch and memory fence
   3092 //===----------------------------------------------------------------------===//
   3093 
   3094 // Prefetch intrinsic.
// Prefetch intrinsic. The third pattern operand is the locality hint
// (3 = T0 ... 0 = NTA); the fourth distinguishes read prefetches (1).
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}
   3105 
// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasSSE2]>;
}
   3113 
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}
   3120 
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
               PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
               PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
               PS, Requires<[HasMFence]>;
} // SchedRW

// Also select the generic X86MFence node to the mfence instruction.
def : Pat<(X86MFence), (MFENCE)>;
   3134 
   3135 //===----------------------------------------------------------------------===//
   3136 // SSE 1 & 2 - Load/Store XCSR register
   3137 //===----------------------------------------------------------------------===//
   3138 
// Load/store the MXCSR control/status register, VEX and legacy encodings.
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
              TB, Sched<[WriteLDMXCSR]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
              TB, Sched<[WriteSTMXCSR]>;
   3152 
   3153 //===---------------------------------------------------------------------===//
   3154 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
   3155 //===---------------------------------------------------------------------===//
   3156 
let ExeDomain = SSEPackedInt in { // SSE integer instructions

// AVX register-to-register moves; no patterns (selected elsewhere).
let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
// _REV forms use the store-direction opcode (0x7F) with register operands,
// so the disassembler can decode both encodings of the same move.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}
   3193 
// AVX loads and stores. Only the XMM forms carry ISel patterns here; the
// YMM forms have empty pattern lists.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                      VEX, VEX_L, VEX_WIG;
// movdqu forms are plain I<> with an explicit XS prefix and spell out the
// "v" in the mnemonic, rather than going through the VPDI/VSSI wrappers.
def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                   XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                   XS, VEX, VEX_L, VEX_WIG;
}

let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(store (v2i64 VR128:$src), addr:$dst)]>,
                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}
   3233 
// Legacy SSE2 movdqa/movdqu forms (128-bit only). The load/store patterns
// are intentionally commented out in the pattern lists below.
let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}", []>,
                   XS, Requires<[UseSSE2]>;
}

// For Disassembler
// Store-direction (0x7F) register forms, as for the VEX variants above.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVDQArr">;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}", []>,
                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
}
} // SchedRW

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                 XS, Requires<[UseSSE2]>;
}

let mayStore = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                 XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt
   3279 
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;

// Reversed VEX versions with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;

// Reversed legacy-SSE versions with ".s" suffix for GAS compatibility.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
   3306 
// Select aligned/unaligned stores of the other 128-bit integer element
// widths onto the v2i64-typed VMOVDQA/VMOVDQU store instructions.
let Predicates = [HasAVX, NoVLX] in {
  // Additional patterns for other integer sizes.
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}
   3322 
   3323 //===---------------------------------------------------------------------===//
   3324 // SSE2 - Packed Integer Arithmetic Instructions
   3325 //===---------------------------------------------------------------------===//
   3326 
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
///
/// Instantiates a reg-reg form (rr) and a reg-mem form (rm) for a packed
/// integer binary op whose result type differs from its source type
/// (e.g. pmaddwd: v8i16 sources -> v4i32 result).
///   opc        - primary opcode byte
///   OpNode     - SelectionDAG node the instruction implements
///   DstVT/SrcVT- result and source vector types
///   memop_frag - load fragment folded for the second operand of the rm form
///   Is2Addr    - 1 selects the two-operand (SSE) asm string, 0 the
///                three-operand (AVX) asm string
/// Only the rr form is marked commutable; the rm form has a fixed memory
/// operand position.
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
   3352 
// Wrapping adds.
defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SchedWriteVecALU, 1, NoVLX>;
// Saturating adds (signed: X86adds, unsigned: X86addus).
defm PADDSB  : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
// Word multiplies: low half (mul), unsigned high half (mulhu),
// signed high half (mulhs).
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
// Wrapping subtracts (not commutable: Is2Addr slot passes 0 for the
// commutable flag position).
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SchedWriteVecALU, 0, NoVLX>;
// Saturating subtracts.
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
// Min/max (SSE2 only provides unsigned byte and signed word forms).
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
// Rounded averages and 32x32->64 unsigned multiply of even elements.
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                             SchedWriteVecIMul, 1, NoVLX>;
   3405 
// pmaddwd: multiply packed signed words, add adjacent 32-bit products.
// Uses PDI_binop_rm2 because result elements are wider than source elements.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                              loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
                              VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                               VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
                               0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                             memopv2i64, i128mem, SchedWriteVecIMul.XMM>;

// psadbw: sum of absolute byte differences, producing 64-bit element sums.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
                             VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                             loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
                             VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memopv2i64, i128mem, SchedWritePSADBW.XMM>;
   3430 
   3431 //===---------------------------------------------------------------------===//
   3432 // SSE2 - Packed Integer Logical Instructions
   3433 //===---------------------------------------------------------------------===//
   3434 
/// PDI_binop_rmi - Packed-integer shift with three forms:
///   rr - count in an XMM register (OpNode, opcode opc)
///   rm - count loaded from a 128-bit memory operand (OpNode, opcode opc)
///   ri - 8-bit immediate count (OpNode2, opcode opc2; ImmForm selects the
///        opcode-extension reg field of the /r byte)
/// The count operand ($src2) of the rr/rm forms is always 128-bit even when
/// RC is a 256-bit class.
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
       Sched<[schedImm]>;
}
   3466 
/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi for VEX-128 (V#NAME),
/// VEX-256 (V#NAME#Y) and legacy SSE (NAME) encodings. Note the Y variant
/// deliberately uses loadv2i64: the shift-count memory operand is always
/// 128-bit (see the src2 comment in PDI_binop_rmi), only DstVT widens.
multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
                                DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
                                VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
                            memopv2i64>;
}
   3487 
/// PDI_binop_ri - Immediate-only packed-integer op (register destination,
/// u8imm second operand). Used for the whole-register byte shifts
/// (pslldq/psrldq), which have no register- or memory-count forms.
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
       Sched<[sched]>;
}
   3498 
/// PDI_binop_ri_all - VEX-128/VEX-256/SSE instantiations of PDI_binop_ri.
/// Element types are hard-coded to v16i8/v32i8 because the byte-shift
/// patterns operate on the vector as bytes.
multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                               VR256, v32i8, sched.YMM, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
                           sched.XMM>;
}
   3512 
let ExeDomain = SSEPackedInt in {
  // Logical left shifts. Each uses one opcode for the reg/mem-count forms
  // and a second opcode + /reg-field extension (MRMnr) for the imm form.
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  // Logical right shifts.
  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  // Arithmetic right shifts (no 64-bit element form in SSE2/AVX2).
  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  // Whole-register byte shifts, immediate-count only.
  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt
   3546 
   3547 //===---------------------------------------------------------------------===//
   3548 // SSE2 - Packed Integer Comparison Instructions
   3549 //===---------------------------------------------------------------------===//
   3550 
// Equality compares are commutable; greater-than compares are not.
// TruePredicate adds no extra target restriction beyond what PDI_binop_all
// itself applies for the SSE/AVX/AVX2 variants.
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;
   3563 
   3564 //===---------------------------------------------------------------------===//
   3565 // SSE2 - Packed Integer Shuffle Instructions
   3566 //===---------------------------------------------------------------------===//
   3567 
let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - imm8-controlled shuffles (pshufd/pshufhw/pshuflw).
/// These take a single vector source plus an 8-bit immediate, so unlike the
/// binary SSE2 integer ops there is no tied-operand constraint and the
/// memory forms fold the (only) vector source. Instantiates:
///   ri/mi           - legacy SSE (UseSSE2)
///   V#NAME#ri/mi    - VEX 128-bit ([HasAVX, prd])
///   V#NAME#Yri/Ymi  - VEX 256-bit ([HasAVX2, prd])
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
                      VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                        (i8 imm:$src2))))]>, VEX,
                  Sched<[sched.XMM.Folded]>, VEX_WIG;
}

let Predicates = [HasAVX2, prd] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                         (i8 imm:$src2))))]>, VEX, VEX_L,
                   Sched<[sched.YMM.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
               Sched<[sched.XMM]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                        (i8 imm:$src2))))]>,
               Sched<[sched.XMM.Folded]>;
}
}
} // ExeDomain = SSEPackedInt
   3627 
// All three shuffles share opcode 0x70; the mandatory prefix (PD/XS/XD)
// selects which instruction is encoded.
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
                             SchedWriteShuffle, NoVLX>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
   3634 
   3635 //===---------------------------------------------------------------------===//
   3636 // Packed Integer Pack Instructions (SSE & AVX)
   3637 //===---------------------------------------------------------------------===//
   3638 
   3639 let ExeDomain = SSEPackedInt in {
/// sse2_pack - SSE2-encoded pack (narrowing with saturation) instructions.
/// OutVT is the narrower result element type, ArgVT the wider source type;
/// OpNode is X86Packss/X86Packus at the instantiation sites. Emits rr and
/// rm forms; Is2Addr selects the two-operand (SSE) vs three-operand (AVX)
/// asm string.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1),
                                    (bitconvert (ld_frag addr:$src2)))))]>,
               Sched<[sched.Folded, ReadAfterLd]>;
}
   3664 
/// sse4_pack - Same contract as sse2_pack but using the SS48I (SSE4.1
/// 0F 38 map) encoding; used for packusdw, which was added in SSE4.1.
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1),
                                      (bitconvert (ld_frag addr:$src2)))))]>,
                 Sched<[sched.Folded, ReadAfterLd]>;
}
   3689 
// VEX 128-bit pack instructions.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V, VEX_WIG;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                             VEX_4V;
}

// VEX 256-bit pack instructions.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L, VEX_WIG;

  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                              VEX_4V, VEX_L;
}

// Legacy SSE pack instructions (two-address: $src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
   3734 } // ExeDomain = SSEPackedInt
   3735 
   3736 //===---------------------------------------------------------------------===//
   3737 // SSE2 - Packed Integer Unpack Instructions
   3738 //===---------------------------------------------------------------------===//
   3739 
   3740 let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Interleave (punpckl*/punpckh*) instructions. OpNode is
/// X86Unpckl or X86Unpckh at the instantiation sites; emits rr and rm forms
/// with the usual Is2Addr two- vs three-operand asm-string selection.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
                       X86FoldableSchedWrite sched, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs RC:$dst), (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1,
                                  (bitconvert (ld_frag addr:$src2)))))]>,
      Sched<[sched.Folded, ReadAfterLd]>;
}
   3761 
// VEX 128-bit byte/word unpacks (gated on BWI-related predicate).
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
}

// VEX 128-bit dword/qword unpacks (NoVLX only).
let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
                                 VEX_4V, VEX_WIG;
}

// VEX 256-bit byte/word unpacks.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

// VEX 256-bit dword/qword unpacks.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

// Legacy SSE unpacks (two-address: $src1 tied to $dst).
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
}
   3841 } // ExeDomain = SSEPackedInt
   3842 
   3843 //===---------------------------------------------------------------------===//
   3844 // SSE2 - Packed Integer Extract and Insert
   3845 //===---------------------------------------------------------------------===//
   3846 
let ExeDomain = SSEPackedInt in {
// PINSRW (opcode 0xC4): insert a 16-bit value into the word element of an
// XMM register selected by the 8-bit immediate. The source is either the low
// 16 bits of a GPR or a 16-bit memory operand (matched via extloadi16).
// Is2Addr selects the legacy SSE asm string (tied $src1 = $dst) vs. the
// 3-operand VEX one.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
       Sched<[WriteVecInsert]>;
  def rm : Ii8<0xC4, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1,
                       i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))]>,
       Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

// Extract
// PEXTRW (opcode 0xC5): extract the word element selected by the immediate
// into a GPR. The VEX form is only used when BWI is unavailable (NoBWI),
// leaving the EVEX form to AVX512BW targets.
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>,
                PD, VEX, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt
   3893 
   3894 //===---------------------------------------------------------------------===//
   3895 // SSE2 - Packed Mask Creation
   3896 //===---------------------------------------------------------------------===//
   3897 
let ExeDomain = SSEPackedInt in {

// PMOVMSKB (opcode 0xD7): gather the sign bit of every byte element into the
// low bits of a GPR (lowered from the X86movmsk node).
def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

// 256-bit form requires AVX2 (32 byte lanes).
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt
   3920 
   3921 //===---------------------------------------------------------------------===//
   3922 // SSE2 - Conditional Store
   3923 //===---------------------------------------------------------------------===//
   3924 
// MASKMOVDQU (opcode 0xF7): byte-masked store of $src through the implicit
// pointer in (E|R)DI. Separate defs per mode because the implicit use is EDI
// in 32-bit mode and RDI in 64-bit mode.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
           VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
           VEX, VEX_WIG;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;

} // ExeDomain = SSEPackedInt
   3949 
   3950 //===---------------------------------------------------------------------===//
   3951 // SSE2 - Move Doubleword/Quadword
   3952 //===---------------------------------------------------------------------===//
   3953 
   3954 //===---------------------------------------------------------------------===//
   3955 // Move Int Doubleword to Packed Double Int
   3956 //
// MOVD/MOVQ (opcode 0x6E), GPR/memory -> XMM direction: the 32/64-bit scalar
// is placed in element 0 via scalar_to_vector. The load-form MOV64toPQIrm
// variants have no pattern and exist for the disassembler only.
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                        VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecLoad]>;
// GR64 -> FR64 bitcast through the same encoding (codegen-only).
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                      Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                        Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt
   4008 
   4009 //===---------------------------------------------------------------------===//
   4010 // Move Int Doubleword to Single Scalar
   4011 //
// GR32 / i32 memory -> FR32 bitcasts using the movd encoding. Codegen-only:
// these never appear in assembly output under their own names.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        VEX, Sched<[WriteVecMoveFromGpr]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
                        VEX, Sched<[WriteVecLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
                        Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
   4032 
   4033 //===---------------------------------------------------------------------===//
   4034 // Move Packed Doubleword Int to Packed Double Int
   4035 //
// MOVD (opcode 0x7E), XMM -> GPR/memory direction: extract element 0 of a
// v4i32 to a 32-bit register or store it to memory.
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                          (iPTR 0)))]>, VEX,
                         Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                         (ins i32mem:$dst, VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(store (i32 (extractelt (v4i32 VR128:$src),
                                       (iPTR 0))), addr:$dst)]>,
                         VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                        (iPTR 0)))]>,
                   Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (extractelt (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)]>,
                       Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
   4059 
   4060 //===---------------------------------------------------------------------===//
   4061 // Move Packed Doubleword Int first element to Doubleword Int
   4062 //
// MOVQ (opcode 0x7E with REX.W), XMM -> GR64: extract element 0 of a v2i64.
// The memory forms have no pattern and exist for the disassembler only.
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                        (iPTR 0)))]>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                         (iPTR 0)))]>;
} //SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
   4087 
   4088 //===---------------------------------------------------------------------===//
   4089 // Bitcast FR64 <-> GR64
   4090 //
// FR64 <-> GR64 / i64 memory bitcasts through the movq encodings.
// Codegen-only helpers for scalar double <-> integer reinterpretation.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteVecLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
                           VEX, Sched<[WriteVecStore]>;

  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                         Sched<[WriteVecLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
                         Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
   4119 
   4120 //===---------------------------------------------------------------------===//
   4121 // Move Scalar Single to Double Int
   4122 //
// FR32 -> GR32 / i32 memory bitcasts through the movd encoding (codegen-only).
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        VEX, Sched<[WriteVecMoveToGpr]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
                        VEX, Sched<[WriteVecStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
   4141 
// Selection patterns mapping zero-extending moves (X86vzmovl / X86vzload of a
// scalar in element 0) onto the AVX movd/movq instructions defined above.
let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // 256-bit result: the 128-bit instruction is placed in the low subregister
  // via SUBREG_TO_REG.
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
              (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  def : Pat<(v8i32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}
   4172 
// SSE2 counterparts of the zero-extending-move patterns above (128-bit only).
let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}
   4188 
// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
// these aliases. The trailing 0 marks them as parse-only: the printer still
// emits the canonical mnemonic.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
   4201 
   4202 //===---------------------------------------------------------------------===//
   4203 // SSE2 - Move Quadword
   4204 //===---------------------------------------------------------------------===//
   4205 
   4206 //===---------------------------------------------------------------------===//
   4207 // Move Quadword Int to Packed Quadword Int
   4208 //
   4209 
// MOVQ (opcode 0x7E with XS prefix): load 64 bits from memory into element 0
// of an XMM register.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
   4222 
   4223 //===---------------------------------------------------------------------===//
   4224 // Move Packed Quadword Int to Quadword Int
   4225 //
// MOVQ (opcode 0xD6): store element 0 of a v2i64 to 64-bit memory.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW
   4237 
// For disassembler only: register-register forms of the 0xD6 movq encoding.
// No patterns; never selected by codegen.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}
   4246 
// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;

// ".s" suffix forces the store-form (MRMDestReg) encoding of movq.
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
   4256 
// Fold zero-extending 64-bit loads (vzmovl of a load, vzload) into the
// MOVQI2PQIrm load instructions; AVX additionally covers the 256-bit result
// via SUBREG_TO_REG.
let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
  def : Pat<(v4i64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}
   4274 
   4275 //===---------------------------------------------------------------------===//
   4276 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
   4277 // IA32 document. movq xmm1, xmm2 does clear the high bits.
   4278 //
// Register-register movq (XS-prefixed 0x7E): copy the low qword and zero the
// upper half, lowered from X86vzmovl.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

// Reuse the integer movq for the f64 variant of the same zeroing move.
let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}
   4298 
   4299 //===---------------------------------------------------------------------===//
   4300 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
   4301 //===---------------------------------------------------------------------===//
   4302 
/// sse3_replicate_sfp - SSE3 MOVSHDUP/MOVSLDUP class: replicate single-FP
/// elements (OpNode is X86Movshdup or X86Movsldup at the instantiation
/// sites). Produces a register form and a folded-load form.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
                      Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
                      Sched<[sched.Folded]>;
}
   4315 
// MOVSHDUP (0x16) / MOVSLDUP (0x12) instantiations: AVX 128/256-bit forms
// (unaligned loads) and legacy SSE3 forms (aligned memopv4f32).
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
   4334 
// Also select the FP shuffles for the equivalent integer-typed shuffle nodes
// (v4i32/v8i32 movshdup/movsldup).
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
   4364 
   4365 //===---------------------------------------------------------------------===//
   4366 // SSE3 - Replicate Double FP - MOVDDUP
   4367 //===---------------------------------------------------------------------===//
   4368 
// MOVDDUP: broadcast the low double-precision element into both lanes of an
// XMM register.  The memory form only reads 64 bits (f64mem + loadf64 wrapped
// in scalar_to_vector), not a full 128-bit vector load, so it never needs
// alignment.  Opcode 0x12 in the S3DI (F2 0F) encoding space.
multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
                    Sched<[sched.XMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[sched.XMM.Folded]>;
}
   4381 
   4382 // FIXME: Merge with above classes when there are patterns for the ymm version
   4383 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
   4384 def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
   4385                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   4386                     [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
   4387                     Sched<[sched.YMM]>;
   4388 def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
   4389                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   4390                     [(set VR256:$dst,
   4391                       (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
   4392                     Sched<[sched.YMM.Folded]>;
   4393 }
   4394 
// Instantiate MOVDDUP: AVX VEX-encoded xmm/ymm forms (guarded by NoVLX so the
// EVEX versions win when AVX512VL is available), plus the legacy SSE3 form.
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                                      VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                                        VEX, VEX_L, VEX_WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
   4403 
   4404 
   4405 let Predicates = [HasAVX, NoVLX] in {
   4406   def : Pat<(X86Movddup (loadv2f64 addr:$src)),
   4407             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   4408 }
   4409 
let Predicates = [UseSSE3] in {
  // No need for aligned memory as this only loads 64-bits.
  // (loadv2f64 rather than memopv2f64 is deliberate for that reason, even
  // though this is a legacy-SSE pattern.)
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
}
   4415 
   4416 //===---------------------------------------------------------------------===//
   4417 // SSE3 - Move Unaligned Integer
   4418 //===---------------------------------------------------------------------===//
   4419 
// LDDQU: unaligned 128/256-bit integer load that may read beyond the natural
// boundaries for performance.  Only selected through the intrinsics — it is
// never matched from ordinary load nodes.
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates

// Legacy SSE3 encoding of the same intrinsic.
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;
   4435 
   4436 //===---------------------------------------------------------------------===//
   4437 // SSE3 - Arithmetic
   4438 //===---------------------------------------------------------------------===//
   4439 
// ADDSUBPS/ADDSUBPD (opcode 0xD0): alternately subtract/add packed elements.
// Is2Addr selects the two-operand legacy syntax vs. the three-operand VEX
// syntax.  The prefix bytes (XD/PD) are supplied at instantiation, so this
// uses the bare I class rather than S3I/S3DI.
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
   4458 
// Instantiations of ADDSUBPS/PD.  The AVX forms pass Is2Addr=0 for the
// three-operand syntax and use unaligned load fragments; the legacy forms
// are tied (src1 = dst) and require aligned memops.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  XD, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}
   4485 
   4486 //===---------------------------------------------------------------------===//
   4487 // SSE3 Instructions
   4488 //===---------------------------------------------------------------------===//
   4489 
   4490 // Horizontal ops
// Horizontal ops
// Horizontal add/sub in the F2-prefixed (S3DI) encoding space, used for the
// packed-single HADDPS/HSUBPS family.  OpNode is X86fhadd/X86fhsub.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, ReadAfterLd]>;
}
// Identical structure to S3D_Int but in the 66-prefixed (S3I) encoding space,
// used for the packed-double HADDPD/HSUBPD family.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
        Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
}
   4527 
// Instantiations of HADDPS/HSUBPS (S3D_Int, F2 prefix) and HADDPD/HSUBPD
// (S3_Int, 66 prefix): opcodes 0x7C/0x7D, AVX xmm/ymm forms first, then the
// tied legacy SSE3 forms with aligned memops.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}
   4565 
   4566 //===---------------------------------------------------------------------===//
   4567 // SSSE3 - Packed Absolute Instructions
   4568 //===---------------------------------------------------------------------===//
   4569 
   4570 /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
   4571 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
   4572                         SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
   4573   def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
   4574                  (ins VR128:$src),
   4575                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   4576                  [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
   4577                  Sched<[sched.XMM]>;
   4578 
   4579   def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
   4580                  (ins i128mem:$src),
   4581                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   4582                  [(set VR128:$dst,
   4583                    (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
   4584                  Sched<[sched.XMM.Folded]>;
   4585 }
   4586 
   4587 /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
   4588 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
   4589                           SDNode OpNode, X86SchedWriteWidths sched> {
   4590   def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
   4591                   (ins VR256:$src),
   4592                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   4593                   [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
   4594                   Sched<[sched.YMM]>;
   4595 
   4596   def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
   4597                   (ins i256mem:$src),
   4598                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
   4599                   [(set VR256:$dst,
   4600                     (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
   4601                   Sched<[sched.YMM.Folded]>;
   4602 }
   4603 
// PABSB/W/D instantiations.  Note the predicate split: the byte/word forms
// are superseded by EVEX only when both VLX and BWI are available
// (NoVLX_Or_NoBWI), while the dword form needs only NoVLX.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                              loadv2i64>, VEX, VEX_WIG;
  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                              loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                              loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
}

// Legacy SSSE3 encodings (aligned memops).
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memopv2i64>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memopv2i64>;
   4631 
   4632 //===---------------------------------------------------------------------===//
   4633 // SSSE3 - Packed Binary Operator Instructions
   4634 //===---------------------------------------------------------------------===//
   4635 
   4636 /// SS3I_binop_rm - Simple SSSE3 bin op
/// SS3I_binop_rm - Simple SSSE3 bin op selected via an SDNode.  DstVT and
/// OpVT may differ for widening ops (e.g. pmaddubsw: v16i8 in, v8i16 out).
/// Only the register form is commutable; the load must stay in $src2.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (DstVT (OpNode (OpVT RC:$src1),
          (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
   4659 
   4660 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
   4661 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
   4662                              Intrinsic IntId128, X86FoldableSchedWrite sched,
   4663                              PatFrag ld_frag, bit Is2Addr = 1> {
   4664   let isCommutable = 1 in
   4665   def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
   4666        (ins VR128:$src1, VR128:$src2),
   4667        !if(Is2Addr,
   4668          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   4669          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   4670        [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
   4671        Sched<[sched]>;
   4672   def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
   4673        (ins VR128:$src1, i128mem:$src2),
   4674        !if(Is2Addr,
   4675          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   4676          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   4677        [(set VR128:$dst,
   4678          (IntId128 VR128:$src1,
   4679           (bitconvert (ld_frag addr:$src2))))]>,
   4680        Sched<[sched.Folded, ReadAfterLd]>;
   4681 }
   4682 
/// 256-bit (AVX2) intrinsic variant; always three-operand VEX syntax, so no
/// Is2Addr parameter is needed.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[sched.Folded, ReadAfterLd]>;
}
   4699 
// AVX forms of the SSSE3 ops whose EVEX counterparts need both VLX and BWI.
// pshufb and pmaddubsw are not commutable (operand roles differ); pmulhrsw is.
let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                  VR128, loadv2i64, i128mem,
                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, loadv2i64, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                  VR128, loadv2i64, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
   4713 
   4714 let ImmT = NoImm, Predicates = [HasAVX] in {
   4715 let isCommutable = 0 in {
   4716   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
   4717                                   loadv2i64, i128mem,
   4718                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   4719   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
   4720                                   loadv2i64, i128mem,
   4721                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   4722   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
   4723                                   loadv2i64, i128mem,
   4724                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   4725   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
   4726                                   loadv2i64, i128mem,
   4727                                   SchedWritePHAdd.XMM, 0>, VEX_4V;
   4728   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
   4729                                       int_x86_ssse3_psign_b_128,
   4730                                       SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   4731   defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
   4732                                       int_x86_ssse3_psign_w_128,
   4733                                       SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   4734   defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
   4735                                       int_x86_ssse3_psign_d_128,
   4736                                       SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   4737   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
   4738                                       int_x86_ssse3_phadd_sw_128,
   4739                                       SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   4740   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
   4741                                       int_x86_ssse3_phsub_sw_128,
   4742                                       SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
   4743 }
   4744 }
   4745 
// 256-bit AVX2 forms of pshufb/pmaddubsw/pmulhrsw (EVEX versions take over
// when both VLX and BWI are present).
let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                  VR256, loadv4i64, i256mem,
                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, loadv4i64, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                  VR256, loadv4i64, i256mem,
                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
   4759 
   4760 let ImmT = NoImm, Predicates = [HasAVX2] in {
   4761 let isCommutable = 0 in {
   4762   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
   4763                                   VR256, loadv4i64, i256mem,
   4764                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   4765   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
   4766                                   loadv4i64, i256mem,
   4767                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   4768   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
   4769                                   VR256, loadv4i64, i256mem,
   4770                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   4771   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
   4772                                   loadv4i64, i256mem,
   4773                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
   4774   defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
   4775                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
   4776   defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
   4777                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
   4778   defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
   4779                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
   4780   defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
   4781                                        int_x86_avx2_phadd_sw,
   4782                                        SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
   4783   defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
   4784                                        int_x86_avx2_phsub_sw,
   4785                                        SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
   4786 }
   4787 }
   4788 
// None of these have i8 immediate fields.
// Legacy (non-VEX) SSSE3 encodings: tied two-operand syntax, aligned memops.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SchedWriteVecALU.XMM, memopv2i64>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SchedWriteVecALU.XMM, memopv2i64>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SchedWriteVecALU.XMM, memopv2i64>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                                 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SchedWritePHAdd.XMM, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SchedWritePHAdd.XMM, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memopv2i64, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
}
   4821 
   4822 //===---------------------------------------------------------------------===//
   4823 // SSSE3 - Packed Align Instruction Patterns
   4824 //===---------------------------------------------------------------------===//
   4825 
// PALIGNR (opcode 0x0F, SS3AI space): concatenate $src1:$src2 and extract a
// byte-aligned window selected by the immediate.  hasSideEffects = 0 lets
// these be used even when no pattern fires.
multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)),
                                     (i8 imm:$src3))))]>,
      Sched<[sched.Folded, ReadAfterLd]>;
  }
}
   4851 
// PALIGNR instantiations: AVX xmm, AVX2 ymm, and the tied legacy SSSE3 form.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
                               SchedWriteShuffle.XMM>;
   4861 
   4862 //===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization (MONITOR/MWAIT)
   4864 //===---------------------------------------------------------------------===//
   4865 
let SchedRW = [WriteSystem] in {
// MONITOR takes its address in EAX/RAX (mode-dependent), so the intrinsic is
// lowered through a pseudo with a custom inserter that sets up the implicit
// registers before emitting the real MONITORrrr below.
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

// Real encoding: 0F 01 C8, operands implicit in EAX/ECX/EDX.
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3]>;

// MWAIT: 0F 01 C9, hints implicit in ECX/EAX.
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW
   4881 
// Assembler aliases accepting the explicit-register spellings of MWAIT and
// MONITOR; the register widths differ between 32- and 64-bit mode.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
   4889 
   4890 //===----------------------------------------------------------------------===//
   4891 // SSE4.1 - Packed Move with Sign/Zero Extend
   4892 //===----------------------------------------------------------------------===//
   4893 
// One PMOVSX/PMOVZX register+memory pair.  Patterns are intentionally empty
// here; selection is driven by the separate Pat<> definitions below so the
// same instruction defs serve several extension node forms.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}
   4905 
// Expand one extension width into its SSE4.1, AVX (xmm) and AVX2 (ymm) defs.
// `prd` is the anti-AVX512 predicate (NoVLX or NoVLX_Or_NoBWI) that keeps the
// VEX forms from competing with EVEX ones.
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, SchedWriteShuffle.XMM>,
                                     VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, WriteShuffle256>,
                                     VEX, VEX_L, VEX_WIG;
}
   4920 
   4921 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
   4922                           X86MemOperand MemYOp, Predicate prd> {
   4923   defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
   4924                                         MemOp, MemYOp, prd>;
   4925   defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
   4926                                         !strconcat("pmovzx", OpcodeStr),
   4927                                         MemOp, MemYOp, prd>;
   4928 }
   4929 
   4930 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
   4931 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
   4932 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
   4933 
   4934 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
   4935 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
   4936 
   4937 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
   4938 
// AVX2 Patterns
/// SS41I_pmovx_avx2_patterns - Select the 256-bit (YMM) VPMOVSX*/VPMOVZX*
/// instructions for extension DAG nodes and extending vector loads.
/// OpcPrefix is "VPMOVSX" or "VPMOVZX"; ExtTy ("s"/"z") picks the matching
/// extloadvi* PatFrags; ExtOp is the corresponding extension SDNode.
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  // The BW forms require NoVLX_Or_NoBWI (AVX-512 BWI supersedes them);
  // the remaining forms only require NoVLX.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  // Match the combined extending-load PatFrags directly.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  // Fold explicit vector loads (full loads, scalar_to_vector of a narrow
  // load, and the vzmovl/vzload zero-extended forms) feeding ExtOp into the
  // instruction's memory operand.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
   5036 
// SSE4.1/AVX patterns.
/// SS41I_pmovx_patterns - Select the 128-bit (XMM) PMOVSX*/PMOVZX*
/// instructions for in-vector extension nodes and extending loads.
/// Instantiated with the VEX-encoded "VPMOVSX"/"VPMOVZX" prefixes below, and
/// with the legacy "PMOVSX"/"PMOVZX" prefixes under UseSSE41.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  // Register-register patterns.  BW forms require NoVLX_Or_NoBWI (AVX-512
  // BWI supersedes them); the others only require NoVLX.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  // Simple register-memory patterns matching the combined extending-load
  // PatFrags selected by ExtTy.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  // Explicit-load patterns: fold full vector loads, scalar_to_vector of a
  // narrow load (both int and fp scalar forms), and the vzmovl/vzload
  // zero-extended forms into the instruction's memory operand.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  // BQ reads only 16 bits of memory, hence the extloadi32i16 source.
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

// Legacy SSE4.1 encodings, only when AVX is not being used.
let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}
   5147 
   5148 //===----------------------------------------------------------------------===//
   5149 // SSE4.1 - Extract Instructions
   5150 //===----------------------------------------------------------------------===//
   5151 
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  // Store form: only the truncated low byte of the extracted element is
  // written to memory.
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

// NoBWI: AVX-512 BWI provides EVEX-encoded forms that take precedence.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
   5174 
   5175 
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  // Reg-reg form exists only for disassembly; codegen uses the SSE2 PEXTRW
  // encoding for register destinations (hence FoldGenData back to it).
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  // Store form: only the truncated low 16 bits are written to memory.
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

// NoBWI: AVX-512 BWI provides EVEX-encoded forms that take precedence.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
   5198 
   5199 
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

// NoDQI: AVX-512 DQ provides EVEX-encoded forms that take precedence.
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
   5221 
   5222 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
   5223 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
   5224   def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
   5225                  (ins VR128:$src1, u8imm:$src2),
   5226                  !strconcat(OpcodeStr,
   5227                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   5228                  [(set GR64:$dst,
   5229                   (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
   5230                   Sched<[WriteVecExtract]>;
   5231   def mr : SS4AIi8<opc, MRMDestMem, (outs),
   5232                  (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
   5233                  !strconcat(OpcodeStr,
   5234                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
   5235                  [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
   5236                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
   5237 }
   5238 
   5239 let Predicates = [HasAVX, NoDQI] in
   5240   defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
   5241 
   5242 defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
   5243 
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  // The extract is modeled as an i32 extract from the bitcast float vector.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
}
   5267 
// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
// (The instruction patterns above only match the i32-typed store.)
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
   5279 
   5280 //===----------------------------------------------------------------------===//
   5281 // SSE4.1 - Insert Instructions
   5282 //===----------------------------------------------------------------------===//
   5283 
/// SS41I_insert8 - SSE 4.1 insert 8 bits from a GPR or 8-bit memory location
/// into a vector element selected by the immediate.  Is2Addr selects the
/// legacy two-operand assembly syntax vs. the three-operand VEX syntax.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteVecInsert]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

// NoBWI: AVX-512 BWI provides EVEX-encoded forms that take precedence.
let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
   5309 
/// SS41I_insert32 - SSE 4.1 insert 32 bits from a GPR or memory into a
/// v4i32 element selected by the immediate.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

// NoDQI: AVX-512 DQ provides EVEX-encoded forms that take precedence.
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
   5335 
/// SS41I_insert64 - SSE 4.1 insert 64 bits from a GPR or memory into a
/// v2i64 element selected by the immediate.  Shares opcode 0x22 with PINSRD;
/// the W bit (VEX_W/REX_W) selects the 64-bit form.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>;
}

// NoDQI: AVX-512 DQ provides EVEX-encoded forms that take precedence.
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
   5361 
// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  // Memory form loads a single f32 and inserts it via scalar_to_vector.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
   5396 
let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction.  Covers both scalar-load and
  // vector-load broadcast sources.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
   5407 
   5408 //===----------------------------------------------------------------------===//
   5409 // SSE4.1 - Round Instructions
   5410 //===----------------------------------------------------------------------===//
   5411 
/// sse41_fp_unop_p - SSE 4.1 packed FP rounding (ROUNDPS/ROUNDPD family):
/// a register and a memory form, each taking a rounding-control immediate.
multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
                           X86FoldableSchedWrite sched> {
  // Vector operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
                  Sched<[sched]>;

  // Vector operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
   5434 
/// avx_fp_unop_rm - AVX scalar FP rounding (VROUNDSS/VROUNDSD): three-operand
/// VEX forms over FR32/FR64.  No patterns are attached (hasSideEffects = 0);
/// selection happens via the _Int forms or separate patterns.
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
   5467 
/// sse41_fp_unop_s - Legacy SSE 4.1 scalar FP rounding (ROUNDSS/ROUNDSD):
/// two-operand forms over FR32/FR64.  No patterns (hasSideEffects = 0);
/// selection happens via the _Int forms or separate patterns.
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
   5500 
/// sse41_fp_binop_s - Intrinsic (_Int) forms of scalar FP rounding over
/// VR128, with register and folded-load variants for both ss and sd.
/// OpNode takes (src1, src2, rounding-imm); Is2Addr selects the legacy
/// two-operand assembly syntax vs. the three-operand VEX syntax.
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1

let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[sched.Folded, ReadAfterLd]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
   5551 
    5552 // FP round - roundss, roundps, roundsd, roundpd
         // AVX packed forms (vroundps/vroundpd), 128- and 256-bit. Guarded by
         // NoVLX so the EVEX-encoded AVX-512VL forms take priority when present.
    5553 let Predicates = [HasAVX, NoVLX] in {
    5554   let ExeDomain = SSEPackedSingle in {
    5555     // Intrinsic form
    5556     defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
    5557                                      loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
    5558                                    VEX, VEX_WIG;
    5559     defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
    5560                                      loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
    5561                                    VEX, VEX_L, VEX_WIG;
    5562   }
    5563 
    5564   let ExeDomain = SSEPackedDouble in {
    5565     defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
    5566                                      loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
    5567                                    VEX, VEX_WIG;
    5568     defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
    5569                                      loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
    5570                                    VEX, VEX_L, VEX_WIG;
    5571   }
    5572 }
         // AVX scalar forms (vroundss/vroundsd): both the intrinsic (VR128)
         // variants and the FR32/FR64 codegen-only variants. Is2Addr = 0 selects
         // the three-operand VEX asm syntax.
    5573 let Predicates = [HasAVX, NoAVX512] in {
    5574   defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
    5575                                   v4f32, v2f64, X86RndScales, 0>,
    5576                                   VEX_4V, VEX_LIG, VEX_WIG;
    5577   defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
    5578                                 VEX_4V, VEX_LIG, VEX_WIG;
    5579 }
   5580 
         // Select VROUNDSS/VROUNDSD for the generic scalar rounding ISD nodes.
         // Rounding-control immediate encoding (Intel SDM, ROUNDSS/ROUNDSD):
         //   bit 3    = 1: suppress the precision (inexact) exception
         //   bit 2    = 0: rounding mode from imm[1:0], 1: from MXCSR.RC
         //   bits 1:0 = 00 nearest-even, 01 down, 10 up, 11 truncate
         // Hence 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC = current mode with
         // inexact suppressed (nearbyint), 0x4 = current mode, exceptions live
         // (rint). The first operand only supplies the untouched upper elements,
         // so an IMPLICIT_DEF is passed.
    5581 let Predicates = [UseAVX] in {
    5582   def : Pat<(ffloor FR32:$src),
    5583             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
    5584   def : Pat<(f32 (fnearbyint FR32:$src)),
    5585             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
    5586   def : Pat<(f32 (fceil FR32:$src)),
    5587             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
    5588   def : Pat<(f32 (frint FR32:$src)),
    5589             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
    5590   def : Pat<(f32 (ftrunc FR32:$src)),
    5591             (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
    5592 
    5593   def : Pat<(f64 (ffloor FR64:$src)),
    5594             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
    5595   def : Pat<(f64 (fnearbyint FR64:$src)),
    5596             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
    5597   def : Pat<(f64 (fceil FR64:$src)),
    5598             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
    5599   def : Pat<(f64 (frint FR64:$src)),
    5600             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
    5601   def : Pat<(f64 (ftrunc FR64:$src)),
    5602             (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
    5603 }
    5604 
         // Load-folding variants, same immediates as above. Gated on OptForSize:
         // presumably the separate load + register round is preferred for speed
         // and the folded form is only a code-size win -- confirm with the
         // surrounding folding heuristics.
    5605 let Predicates = [UseAVX, OptForSize] in {
    5606   def : Pat<(ffloor (loadf32 addr:$src)),
    5607             (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
    5608   def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
    5609             (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
    5610   def : Pat<(f32 (fceil (loadf32 addr:$src))),
    5611             (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
    5612   def : Pat<(f32 (frint (loadf32 addr:$src))),
    5613             (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
    5614   def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
    5615             (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
    5616 
    5617   def : Pat<(f64 (ffloor (loadf64 addr:$src))),
    5618             (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
    5619   def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
    5620             (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
    5621   def : Pat<(f64 (fceil (loadf64 addr:$src))),
    5622             (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
    5623   def : Pat<(f64 (frint (loadf64 addr:$src))),
    5624             (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
    5625   def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
    5626             (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
    5627 }
   5628 
         // Packed-vector rounding patterns for AVX (128- and 256-bit, register
         // and folded-load forms). Immediates as documented for the scalar
         // patterns above: 0x9 floor, 0xA ceil, 0xB trunc, 0xC nearbyint
         // (current mode, inexact suppressed), 0x4 rint (current mode).
    5629 let Predicates = [HasAVX, NoVLX] in {
    5630   def : Pat<(v4f32 (ffloor VR128:$src)),
    5631             (VROUNDPSr VR128:$src, (i32 0x9))>;
    5632   def : Pat<(v4f32 (fnearbyint VR128:$src)),
    5633             (VROUNDPSr VR128:$src, (i32 0xC))>;
    5634   def : Pat<(v4f32 (fceil VR128:$src)),
    5635             (VROUNDPSr VR128:$src, (i32 0xA))>;
    5636   def : Pat<(v4f32 (frint VR128:$src)),
    5637             (VROUNDPSr VR128:$src, (i32 0x4))>;
    5638   def : Pat<(v4f32 (ftrunc VR128:$src)),
    5639             (VROUNDPSr VR128:$src, (i32 0xB))>;
    5640 
    5641   def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
    5642             (VROUNDPSm addr:$src, (i32 0x9))>;
    5643   def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
    5644             (VROUNDPSm addr:$src, (i32 0xC))>;
    5645   def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
    5646             (VROUNDPSm addr:$src, (i32 0xA))>;
    5647   def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
    5648             (VROUNDPSm addr:$src, (i32 0x4))>;
    5649   def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
    5650             (VROUNDPSm addr:$src, (i32 0xB))>;
    5651 
    5652   def : Pat<(v2f64 (ffloor VR128:$src)),
    5653             (VROUNDPDr VR128:$src, (i32 0x9))>;
    5654   def : Pat<(v2f64 (fnearbyint VR128:$src)),
    5655             (VROUNDPDr VR128:$src, (i32 0xC))>;
    5656   def : Pat<(v2f64 (fceil VR128:$src)),
    5657             (VROUNDPDr VR128:$src, (i32 0xA))>;
    5658   def : Pat<(v2f64 (frint VR128:$src)),
    5659             (VROUNDPDr VR128:$src, (i32 0x4))>;
    5660   def : Pat<(v2f64 (ftrunc VR128:$src)),
    5661             (VROUNDPDr VR128:$src, (i32 0xB))>;
    5662 
    5663   def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
    5664             (VROUNDPDm addr:$src, (i32 0x9))>;
    5665   def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
    5666             (VROUNDPDm addr:$src, (i32 0xC))>;
    5667   def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
    5668             (VROUNDPDm addr:$src, (i32 0xA))>;
    5669   def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
    5670             (VROUNDPDm addr:$src, (i32 0x4))>;
    5671   def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
    5672             (VROUNDPDm addr:$src, (i32 0xB))>;
    5673 
    5674   def : Pat<(v8f32 (ffloor VR256:$src)),
    5675             (VROUNDPSYr VR256:$src, (i32 0x9))>;
    5676   def : Pat<(v8f32 (fnearbyint VR256:$src)),
    5677             (VROUNDPSYr VR256:$src, (i32 0xC))>;
    5678   def : Pat<(v8f32 (fceil VR256:$src)),
    5679             (VROUNDPSYr VR256:$src, (i32 0xA))>;
    5680   def : Pat<(v8f32 (frint VR256:$src)),
    5681             (VROUNDPSYr VR256:$src, (i32 0x4))>;
    5682   def : Pat<(v8f32 (ftrunc VR256:$src)),
    5683             (VROUNDPSYr VR256:$src, (i32 0xB))>;
    5684 
    5685   def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
    5686             (VROUNDPSYm addr:$src, (i32 0x9))>;
    5687   def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
    5688             (VROUNDPSYm addr:$src, (i32 0xC))>;
    5689   def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
    5690             (VROUNDPSYm addr:$src, (i32 0xA))>;
    5691   def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
    5692             (VROUNDPSYm addr:$src, (i32 0x4))>;
    5693   def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
    5694             (VROUNDPSYm addr:$src, (i32 0xB))>;
    5695 
    5696   def : Pat<(v4f64 (ffloor VR256:$src)),
    5697             (VROUNDPDYr VR256:$src, (i32 0x9))>;
    5698   def : Pat<(v4f64 (fnearbyint VR256:$src)),
    5699             (VROUNDPDYr VR256:$src, (i32 0xC))>;
    5700   def : Pat<(v4f64 (fceil VR256:$src)),
    5701             (VROUNDPDYr VR256:$src, (i32 0xA))>;
    5702   def : Pat<(v4f64 (frint VR256:$src)),
    5703             (VROUNDPDYr VR256:$src, (i32 0x4))>;
    5704   def : Pat<(v4f64 (ftrunc VR256:$src)),
    5705             (VROUNDPDYr VR256:$src, (i32 0xB))>;
    5706 
    5707   def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
    5708             (VROUNDPDYm addr:$src, (i32 0x9))>;
    5709   def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
    5710             (VROUNDPDYm addr:$src, (i32 0xC))>;
    5711   def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
    5712             (VROUNDPDYm addr:$src, (i32 0xA))>;
    5713   def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
    5714             (VROUNDPDYm addr:$src, (i32 0x4))>;
    5715   def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
    5716             (VROUNDPDYm addr:$src, (i32 0xB))>;
    5717 }
   5718 
         // Non-VEX SSE4.1 forms: packed roundps/roundpd, scalar roundss/roundsd
         // (FR32/FR64 forms), and the intrinsic VR128 forms, which are
         // two-address ($src1 tied to $dst).
    5719 let ExeDomain = SSEPackedSingle in
    5720 defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
    5721                                 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
    5722 let ExeDomain = SSEPackedDouble in
    5723 defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
    5724                                 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
    5725 
    5726 defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
    5727 
    5728 let Constraints = "$src1 = $dst" in
    5729 defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
    5730                                v4f32, v2f64, X86RndScales>;
   5731 
         // Non-AVX scalar rounding selection. Same immediate encoding as the
         // AVX patterns above: 0x9 floor, 0xA ceil, 0xB trunc, 0xC nearbyint,
         // 0x4 rint. The SSE forms are two-address, so no IMPLICIT_DEF operand.
    5732 let Predicates = [UseSSE41] in {
    5733   def : Pat<(ffloor FR32:$src),
    5734             (ROUNDSSr FR32:$src, (i32 0x9))>;
    5735   def : Pat<(f32 (fnearbyint FR32:$src)),
    5736             (ROUNDSSr FR32:$src, (i32 0xC))>;
    5737   def : Pat<(f32 (fceil FR32:$src)),
    5738             (ROUNDSSr FR32:$src, (i32 0xA))>;
    5739   def : Pat<(f32 (frint FR32:$src)),
    5740             (ROUNDSSr FR32:$src, (i32 0x4))>;
    5741   def : Pat<(f32 (ftrunc FR32:$src)),
    5742             (ROUNDSSr FR32:$src, (i32 0xB))>;
    5743 
    5744   def : Pat<(f64 (ffloor FR64:$src)),
    5745             (ROUNDSDr FR64:$src, (i32 0x9))>;
    5746   def : Pat<(f64 (fnearbyint FR64:$src)),
    5747             (ROUNDSDr FR64:$src, (i32 0xC))>;
    5748   def : Pat<(f64 (fceil FR64:$src)),
    5749             (ROUNDSDr FR64:$src, (i32 0xA))>;
    5750   def : Pat<(f64 (frint FR64:$src)),
    5751             (ROUNDSDr FR64:$src, (i32 0x4))>;
    5752   def : Pat<(f64 (ftrunc FR64:$src)),
    5753             (ROUNDSDr FR64:$src, (i32 0xB))>;
    5754 }
    5755 
         // Load-folding variants; OptForSize-gated like the AVX equivalents.
    5756 let Predicates = [UseSSE41, OptForSize] in {
    5757   def : Pat<(ffloor (loadf32 addr:$src)),
    5758             (ROUNDSSm addr:$src, (i32 0x9))>;
    5759   def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
    5760             (ROUNDSSm addr:$src, (i32 0xC))>;
    5761   def : Pat<(f32 (fceil (loadf32 addr:$src))),
    5762             (ROUNDSSm addr:$src, (i32 0xA))>;
    5763   def : Pat<(f32 (frint (loadf32 addr:$src))),
    5764             (ROUNDSSm addr:$src, (i32 0x4))>;
    5765   def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
    5766             (ROUNDSSm addr:$src, (i32 0xB))>;
    5767 
    5768   def : Pat<(f64 (ffloor (loadf64 addr:$src))),
    5769             (ROUNDSDm addr:$src, (i32 0x9))>;
    5770   def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
    5771             (ROUNDSDm addr:$src, (i32 0xC))>;
    5772   def : Pat<(f64 (fceil (loadf64 addr:$src))),
    5773             (ROUNDSDm addr:$src, (i32 0xA))>;
    5774   def : Pat<(f64 (frint (loadf64 addr:$src))),
    5775             (ROUNDSDm addr:$src, (i32 0x4))>;
    5776   def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
    5777             (ROUNDSDm addr:$src, (i32 0xB))>;
    5778 }
   5779 
         // Non-AVX packed rounding selection (roundps/roundpd), register and
         // memory-operand forms; memory operands use the aligned memop frags.
    5780 let Predicates = [UseSSE41] in {
    5781   def : Pat<(v4f32 (ffloor VR128:$src)),
    5782             (ROUNDPSr VR128:$src, (i32 0x9))>;
    5783   def : Pat<(v4f32 (fnearbyint VR128:$src)),
    5784             (ROUNDPSr VR128:$src, (i32 0xC))>;
    5785   def : Pat<(v4f32 (fceil VR128:$src)),
    5786             (ROUNDPSr VR128:$src, (i32 0xA))>;
    5787   def : Pat<(v4f32 (frint VR128:$src)),
    5788             (ROUNDPSr VR128:$src, (i32 0x4))>;
    5789   def : Pat<(v4f32 (ftrunc VR128:$src)),
    5790             (ROUNDPSr VR128:$src, (i32 0xB))>;
    5791 
    5792   def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
    5793             (ROUNDPSm addr:$src, (i32 0x9))>;
    5794   def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
    5795             (ROUNDPSm addr:$src, (i32 0xC))>;
    5796   def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
    5797             (ROUNDPSm addr:$src, (i32 0xA))>;
    5798   def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
    5799             (ROUNDPSm addr:$src, (i32 0x4))>;
    5800   def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
    5801             (ROUNDPSm addr:$src, (i32 0xB))>;
    5802 
    5803   def : Pat<(v2f64 (ffloor VR128:$src)),
    5804             (ROUNDPDr VR128:$src, (i32 0x9))>;
    5805   def : Pat<(v2f64 (fnearbyint VR128:$src)),
    5806             (ROUNDPDr VR128:$src, (i32 0xC))>;
    5807   def : Pat<(v2f64 (fceil VR128:$src)),
    5808             (ROUNDPDr VR128:$src, (i32 0xA))>;
    5809   def : Pat<(v2f64 (frint VR128:$src)),
    5810             (ROUNDPDr VR128:$src, (i32 0x4))>;
    5811   def : Pat<(v2f64 (ftrunc VR128:$src)),
    5812             (ROUNDPDr VR128:$src, (i32 0xB))>;
    5813 
    5814   def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
    5815             (ROUNDPDm addr:$src, (i32 0x9))>;
    5816   def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
    5817             (ROUNDPDm addr:$src, (i32 0xC))>;
    5818   def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
    5819             (ROUNDPDm addr:$src, (i32 0xA))>;
    5820   def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
    5821             (ROUNDPDm addr:$src, (i32 0x4))>;
    5822   def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
    5823             (ROUNDPDm addr:$src, (i32 0xB))>;
    5824 }
   5825 
         // Fold floor/ceil on the low element of a movss/movsd-merged vector
         // into ROUNDSS/ROUNDSD.
         // NOTE(review): these use immediates 0x01 (round down) / 0x02 (round
         // up) WITHOUT bit 3 (precision-exception suppress), unlike the 0x9/0xA
         // used by the scalar patterns above -- confirm the asymmetry is
         // intentional.
    5826 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
    5827                                       v4f32, 0x01, UseSSE41>;
    5828 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
    5829                                       v4f32, 0x02, UseSSE41>;
    5830 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
    5831                                       v2f64, 0x01, UseSSE41>;
    5832 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
    5833                                       v2f64, 0x02, UseSSE41>;
   5834 
   5835 //===----------------------------------------------------------------------===//
   5836 // SSE4.1 - Packed Bit Test
   5837 //===----------------------------------------------------------------------===//
   5838 
    5839 // ptest instruction we'll lower to this in X86ISelLowering primarily from
    5840 // the intel intrinsic that corresponds to this.
         // AVX vptest: sets ZF/CF from the AND / ANDN of the two sources; the
         // only output is EFLAGS. 128- and 256-bit, register and memory forms.
    5841 let Defs = [EFLAGS], Predicates = [HasAVX] in {
    5842 def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
    5843                 "vptest\t{$src2, $src1|$src1, $src2}",
    5844                 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
    5845                 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
    5846 def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
    5847                 "vptest\t{$src2, $src1|$src1, $src2}",
    5848                 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
    5849                 Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>,
    5850                 VEX, VEX_WIG;
    5851 
    5852 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
    5853                 "vptest\t{$src2, $src1|$src1, $src2}",
    5854                 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
    5855                 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
    5856 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
    5857                 "vptest\t{$src2, $src1|$src1, $src2}",
    5858                 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
    5859                 Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>,
    5860                 VEX, VEX_L, VEX_WIG;
    5861 }
   5862 
         // SSE4.1 (non-VEX) ptest; memory form requires an aligned 128-bit
         // operand (memopv2i64).
    5863 let Defs = [EFLAGS] in {
    5864 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
    5865               "ptest\t{$src2, $src1|$src1, $src2}",
    5866               [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
    5867               Sched<[SchedWriteVecTest.XMM]>;
    5868 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
    5869               "ptest\t{$src2, $src1|$src1, $src2}",
    5870               [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
    5871               Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>;
    5872 }
   5873 
    5874 // The bit test instructions below are AVX only
         /// avx_bittest - Register and folded-load forms of an AVX floating-point
         /// bit-test instruction (vtestps/vtestpd). Only EFLAGS is written, via
         /// the X86testp node. Callers must place this inside a Defs = [EFLAGS]
         /// block (see the instantiations below).
    5875 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
    5876                        X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
    5877                        X86FoldableSchedWrite sched> {
    5878   def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
    5879             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
    5880             [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
    5881             Sched<[sched]>, VEX;
    5882   def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
    5883             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
    5884             [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
    5885             Sched<[sched.Folded, ReadAfterLd]>, VEX;
    5886 }
   5887 
         // vtestps/vtestpd instantiations: 0x0E single / 0x0F double precision,
         // 128-bit (XMM) and 256-bit (YMM, VEX_L) variants.
    5888 let Defs = [EFLAGS], Predicates = [HasAVX] in {
    5889 let ExeDomain = SSEPackedSingle in {
    5890 defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
    5891                             SchedWriteFTest.XMM>;
    5892 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
    5893                             SchedWriteFTest.YMM>, VEX_L;
    5894 }
    5895 let ExeDomain = SSEPackedDouble in {
    5896 defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
    5897                             SchedWriteFTest.XMM>;
    5898 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
    5899                             SchedWriteFTest.YMM>, VEX_L;
    5900 }
    5901 }
   5902 
   5903 //===----------------------------------------------------------------------===//
   5904 // SSE4.1 - Misc Instructions
   5905 //===----------------------------------------------------------------------===//
   5906 
         // POPCNT: population count for 16/32/64-bit GPRs, register and memory
         // source forms, selected from the generic ctpop node. The instruction
         // also writes EFLAGS, modeled by Defs plus (implicit EFLAGS).
    5907 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
    5908   def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
    5909                      "popcnt{w}\t{$src, $dst|$dst, $src}",
    5910                      [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
    5911                      Sched<[WritePOPCNT]>, OpSize16, XS;
    5912   def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
    5913                      "popcnt{w}\t{$src, $dst|$dst, $src}",
    5914                      [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
    5915                       (implicit EFLAGS)]>,
    5916                       Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
    5917 
    5918   def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
    5919                      "popcnt{l}\t{$src, $dst|$dst, $src}",
    5920                      [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
    5921                      Sched<[WritePOPCNT]>, OpSize32, XS;
    5922 
    5923   def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
    5924                      "popcnt{l}\t{$src, $dst|$dst, $src}",
    5925                      [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
    5926                       (implicit EFLAGS)]>,
    5927                       Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
    5928 
    5929   def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
    5930                       "popcnt{q}\t{$src, $dst|$dst, $src}",
    5931                       [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
    5932                       Sched<[WritePOPCNT]>, XS;
    5933   def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
    5934                       "popcnt{q}\t{$src, $dst|$dst, $src}",
    5935                       [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
    5936                        (implicit EFLAGS)]>,
    5937                        Sched<[WritePOPCNT.Folded]>, XS;
    5938 }
   5939 
    5940 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
         // Register and memory-source variants; the memory form bitconverts the
         // ld_frag result (a v2i64 load) to v8i16 before applying OpNode.
    5941 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
    5942                                  SDNode OpNode, PatFrag ld_frag,
    5943                                  X86FoldableSchedWrite Sched> {
    5944   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
    5945                  (ins VR128:$src),
    5946                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
    5947                  [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
    5948                  Sched<[Sched]>;
    5949   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
    5950                   (ins i128mem:$src),
    5951                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
    5952                   [(set VR128:$dst,
    5953                     (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
    5954                  Sched<[Sched.Folded]>;
    5955 }
   5956 
    5957 // PHMIN has the same profile as PSAD, thus we use the same scheduling
    5958 // model, although the naming is misleading.
         // phminposuw / vphminposuw: horizontal unsigned minimum + index over
         // v8i16, selected via the X86phminpos node. AVX form uses unaligned
         // loads (loadv2i64); SSE form requires aligned memops.
    5959 let Predicates = [HasAVX] in
    5960 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
    5961                                          X86phminpos, loadv2i64,
    5962                                          WritePHMINPOS>, VEX, VEX_WIG;
    5963 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
    5964                                          X86phminpos, memopv2i64,
    5965                                          WritePHMINPOS>;
   5966 
    5967 /// SS48I_binop_rm - Simple SSE41 binary operator.
         /// Register-register (commutable) and register-memory forms. The memory
         /// operand is loaded through memop_frag and bitconverted to OpVT.
         /// Is2Addr selects two-operand SSE vs. three-operand AVX asm syntax.
    5968 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
    5969                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
    5970                           X86MemOperand x86memop, X86FoldableSchedWrite sched,
    5971                           bit Is2Addr = 1> {
    5972   let isCommutable = 1 in
    5973   def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
    5974        (ins RC:$src1, RC:$src2),
    5975        !if(Is2Addr,
    5976            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
    5977            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
    5978        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
    5979        Sched<[sched]>;
    5980   def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
    5981        (ins RC:$src1, x86memop:$src2),
    5982        !if(Is2Addr,
    5983            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
    5984            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
    5985        [(set RC:$dst,
    5986          (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
    5987        Sched<[sched.Folded, ReadAfterLd]>;
    5988 }
   5989 
         // AVX/AVX2 integer min/max and pmuldq. Predicates keep the VEX forms
         // from shadowing EVEX equivalents: NoVLX for the dword/qword ops
         // (covered by AVX-512VL), NoVLX_Or_NoBWI for the byte/word ops
         // (covered by AVX-512BW + VL).
    5990 let Predicates = [HasAVX, NoVLX] in {
    5991   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
    5992                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    5993                                   VEX_4V, VEX_WIG;
    5994   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
    5995                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    5996                                   VEX_4V, VEX_WIG;
    5997   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
    5998                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    5999                                   VEX_4V, VEX_WIG;
    6000   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
    6001                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    6002                                   VEX_4V, VEX_WIG;
    6003   defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
    6004                                   loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
    6005                                   VEX_4V, VEX_WIG;
    6006 }
    6007 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    6008   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
    6009                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    6010                                   VEX_4V, VEX_WIG;
    6011   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
    6012                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    6013                                   VEX_4V, VEX_WIG;
    6014   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
    6015                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    6016                                   VEX_4V, VEX_WIG;
    6017   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
    6018                                   loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    6019                                   VEX_4V, VEX_WIG;
    6020 }
    6021 
         // 256-bit (YMM) variants require AVX2.
    6022 let Predicates = [HasAVX2, NoVLX] in {
    6023   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
    6024                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6025                                   VEX_4V, VEX_L, VEX_WIG;
    6026   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
    6027                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6028                                   VEX_4V, VEX_L, VEX_WIG;
    6029   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
    6030                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6031                                   VEX_4V, VEX_L, VEX_WIG;
    6032   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
    6033                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6034                                   VEX_4V, VEX_L, VEX_WIG;
    6035   defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
    6036                                   loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
    6037                                   VEX_4V, VEX_L, VEX_WIG;
    6038 }
    6039 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    6040   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
    6041                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6042                                   VEX_4V, VEX_L, VEX_WIG;
    6043   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
    6044                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6045                                   VEX_4V, VEX_L, VEX_WIG;
    6046   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
    6047                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6048                                   VEX_4V, VEX_L, VEX_WIG;
    6049   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
    6050                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6051                                   VEX_4V, VEX_L, VEX_WIG;
    6052 }
   6053 
         // Non-VEX SSE4.1 min/max and pmuldq: two-address ($src1 tied to $dst),
         // aligned memory operands (memopv2i64).
    6054 let Constraints = "$src1 = $dst" in {
    6055   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
    6056                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6057   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
    6058                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6059   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
    6060                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6061   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
    6062                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6063   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
    6064                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6065   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
    6066                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6067   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
    6068                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6069   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
    6070                                  memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6071   defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
    6072                                  memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
    6073 }
   6074 
         // pmulld / pcmpeqq in VEX (AVX/AVX2) and legacy SSE4.1 encodings.
         // vpmulld is NoVLX-gated (EVEX form exists under AVX-512VL); vpcmpeqq
         // only needs HasAVX since AVX-512 compares produce mask registers.
    6075 let Predicates = [HasAVX, NoVLX] in
    6076   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
    6077                                  loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
    6078                                  VEX_4V, VEX_WIG;
    6079 let Predicates = [HasAVX] in
    6080   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
    6081                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
    6082                                  VEX_4V, VEX_WIG;
    6083 
    6084 let Predicates = [HasAVX2, NoVLX] in
    6085   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
    6086                                   loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
    6087                                   VEX_4V, VEX_L, VEX_WIG;
    6088 let Predicates = [HasAVX2] in
    6089   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
    6090                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
    6091                                   VEX_4V, VEX_L, VEX_WIG;
    6092 
         // Legacy two-address SSE4.1 encodings.
    6093 let Constraints = "$src1 = $dst" in {
    6094   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
    6095                                 memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
    6096   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
    6097                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
    6098 }
   6099 
   6100 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
        /// Emits two instructions matched against an intrinsic (IntId):
        ///   rri - reg/reg/imm8
        ///   rmi - reg/mem/imm8, with the load bitconverted before use
        /// Is2Addr selects the destructive 2-operand asm string vs. the
        /// 3-operand (AVX-style) asm string.
   6101 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
   6102                  Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
   6103                  X86MemOperand x86memop, bit Is2Addr,
   6104                  X86FoldableSchedWrite sched> {
   6105   let isCommutable = 1 in
   6106   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
   6107         (ins RC:$src1, RC:$src2, u8imm:$src3),
   6108         !if(Is2Addr,
   6109             !strconcat(OpcodeStr,
   6110                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
   6111             !strconcat(OpcodeStr,
   6112                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
   6113         [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
   6114         Sched<[sched]>;
        // Memory form: folds the load and uses the folded scheduling class.
   6115   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
   6116         (ins RC:$src1, x86memop:$src2, u8imm:$src3),
   6117         !if(Is2Addr,
   6118             !strconcat(OpcodeStr,
   6119                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
   6120             !strconcat(OpcodeStr,
   6121                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
   6122         [(set RC:$dst,
   6123           (IntId RC:$src1,
   6124            (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
   6125         Sched<[sched.Folded, ReadAfterLd]>;
   6126 }
   6127 
   6128 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
        /// Identical in structure to SS41I_binop_rmi_int, but matches an SDNode
        /// (OpNode) with an explicit result type OpVT instead of an intrinsic.
   6129 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
   6130                            ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
   6131                            X86MemOperand x86memop, bit Is2Addr,
   6132                            X86FoldableSchedWrite sched> {
   6133   let isCommutable = 1 in
   6134   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
   6135         (ins RC:$src1, RC:$src2, u8imm:$src3),
   6136         !if(Is2Addr,
   6137             !strconcat(OpcodeStr,
   6138                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
   6139             !strconcat(OpcodeStr,
   6140                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
   6141         [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
   6142         Sched<[sched]>;
        // Memory form: folds the load and uses the folded scheduling class.
   6143   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
   6144         (ins RC:$src1, x86memop:$src2, u8imm:$src3),
   6145         !if(Is2Addr,
   6146             !strconcat(OpcodeStr,
   6147                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
   6148             !strconcat(OpcodeStr,
   6149                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
   6150         [(set RC:$dst,
   6151           (OpVT (OpNode RC:$src1,
   6152                  (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
   6153         Sched<[sched.Folded, ReadAfterLd]>;
   6154 }
   6155 
        // Immediate transforms used when commuting blend instructions (see the
        // commuteXForm pattern in SS41I_blend_rmi below): swapping the two
        // vector sources requires inverting the selection mask, i.e. XOR-ing
        // the immediate with all-ones over the low 2/4/8 bits.
   6156 def BlendCommuteImm2 : SDNodeXForm<imm, [{
   6157   uint8_t Imm = N->getZExtValue() & 0x03;
   6158   return getI8Imm(Imm ^ 0x03, SDLoc(N));
   6159 }]>;
   6160 
   6161 def BlendCommuteImm4 : SDNodeXForm<imm, [{
   6162   uint8_t Imm = N->getZExtValue() & 0x0f;
   6163   return getI8Imm(Imm ^ 0x0f, SDLoc(N));
   6164 }]>;
   6165 
   6166 def BlendCommuteImm8 : SDNodeXForm<imm, [{
   6167   uint8_t Imm = N->getZExtValue() & 0xff;
   6168   return getI8Imm(Imm ^ 0xff, SDLoc(N));
   6169 }]>;
   6170 
        // MPSADBW / DPPS / DPPD — AVX forms (VEX-encoded, non-destructive).
        // MPSADBW is explicitly not commutable: the immediate selects different
        // source blocks for each operand.
   6171 let Predicates = [HasAVX] in {
   6172   let isCommutable = 0 in {
   6173     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
   6174                                         VR128, loadv2i64, i128mem, 0,
   6175                                         SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
   6176   }
   6177 
   6178   let ExeDomain = SSEPackedSingle in
   6179   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
   6180                                    VR128, loadv4f32, f128mem, 0,
   6181                                    SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
   6182   let ExeDomain = SSEPackedDouble in
   6183   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
   6184                                    VR128, loadv2f64, f128mem, 0,
   6185                                    SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
        // NOTE(review): VDPPSY uses i256mem while the 128-bit FP forms above use
        // f128mem — confirm the integer memory operand type is intentional here.
   6186   let ExeDomain = SSEPackedSingle in
   6187   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
   6188                                     VR256, loadv8f32, i256mem, 0,
   6189                                     SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
   6190 }
   6191 
        // AVX2 256-bit MPSADBW.
   6192 let Predicates = [HasAVX2] in {
   6193   let isCommutable = 0 in {
   6194   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
   6195                                   VR256, loadv4i64, i256mem, 0,
   6196                                   SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
   6197   }
   6198 }
   6199 
        // Legacy SSE4.1 destructive forms.
   6200 let Constraints = "$src1 = $dst" in {
   6201   let isCommutable = 0 in {
   6202   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
   6203                                      VR128, memopv2i64, i128mem, 1,
   6204                                      SchedWriteMPSAD.XMM>;
   6205   }
   6206 
   6207   let ExeDomain = SSEPackedSingle in
   6208   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
   6209                                   VR128, memopv4f32, f128mem, 1,
   6210                                   SchedWriteDPPS.XMM>;
   6211   let ExeDomain = SSEPackedDouble in
   6212   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
   6213                                   VR128, memopv2f64, f128mem, 1,
   6214                                   SchedWriteDPPD.XMM>;
   6215 }
   6216 
   6217 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
        /// Like SS41I_binop_rmi but additionally parameterized by the execution
        /// Domain and by commuteXForm (one of the BlendCommuteImm* transforms),
        /// which rewrites the immediate so the instruction can still be used
        /// when the folded load appears as the FIRST source of OpNode.
   6218 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
   6219                            ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
   6220                            X86MemOperand x86memop, bit Is2Addr, Domain d,
   6221                            X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
   6222 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
   6223   let isCommutable = 1 in
   6224   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
   6225         (ins RC:$src1, RC:$src2, u8imm:$src3),
   6226         !if(Is2Addr,
   6227             !strconcat(OpcodeStr,
   6228                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
   6229             !strconcat(OpcodeStr,
   6230                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
   6231         [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
   6232         Sched<[sched]>;
   6233   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
   6234         (ins RC:$src1, x86memop:$src2, u8imm:$src3),
   6235         !if(Is2Addr,
   6236             !strconcat(OpcodeStr,
   6237                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
   6238             !strconcat(OpcodeStr,
   6239                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
   6240         [(set RC:$dst,
   6241           (OpVT (OpNode RC:$src1,
   6242                  (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
   6243         Sched<[sched.Folded, ReadAfterLd]>;
   6244 }
   6245 
   6246   // Pattern to commute if load is in first source.
        // The operands are swapped into the rmi form and the immediate is
        // inverted by commuteXForm to preserve the original selection.
   6247   def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
   6248                           RC:$src1, imm:$src3)),
   6249             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
   6250                                             (commuteXForm imm:$src3))>;
   6251 }
   6252 
        // Immediate blends. The BlendCommuteImm* width passed to each
        // instantiation matches the number of selectable elements of the type.
        // AVX/AVX2 forms are non-destructive (Is2Addr = 0).
   6253 let Predicates = [HasAVX] in {
   6254   defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
   6255                                   VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
   6256                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
   6257                                   VEX_4V, VEX_WIG;
   6258   defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
   6259                                    VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
   6260                                    SchedWriteFBlend.YMM, BlendCommuteImm8>,
   6261                                    VEX_4V, VEX_L, VEX_WIG;
   6262   defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
   6263                                   VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
   6264                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
   6265                                   VEX_4V, VEX_WIG;
   6266   defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
   6267                                    VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
   6268                                    SchedWriteFBlend.YMM, BlendCommuteImm4>,
   6269                                    VEX_4V, VEX_L, VEX_WIG;
   6270   defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
   6271                                   VR128, loadv2i64, i128mem, 0, SSEPackedInt,
   6272                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
   6273                                   VEX_4V, VEX_WIG;
   6274 }
   6275 
   6276 let Predicates = [HasAVX2] in {
   6277   defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
   6278                                    VR256, loadv4i64, i256mem, 0, SSEPackedInt,
   6279                                    SchedWriteBlend.YMM, BlendCommuteImm8>,
   6280                                    VEX_4V, VEX_L, VEX_WIG;
   6281 }
   6282 
        // Legacy SSE4.1 destructive blends (Is2Addr = 1).
   6283 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
   6284                                VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
   6285                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
   6286 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
   6287                                VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
   6288                                SchedWriteFBlend.XMM, BlendCommuteImm2>;
   6289 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
   6290                                VR128, memopv2i64, i128mem, 1, SSEPackedInt,
   6291                                SchedWriteBlend.XMM, BlendCommuteImm8>;
   6292 
   6293 // For insertion into the zero index (low half) of a 256-bit vector, it is
   6294 // more efficient to generate a blend with immediate instead of an insert*128.
        // The 128-bit source is widened via INSERT_SUBREG into an IMPLICIT_DEF
        // 256-bit value; the blend immediate (0x3 for v4f64, 0xf for v8f32)
        // takes the low-half elements from that widened source.
   6295 let Predicates = [HasAVX] in {
   6296 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
   6297           (VBLENDPDYrri VR256:$src1,
   6298                         (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
   6299                                        VR128:$src2, sub_xmm), 0x3)>;
   6300 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
   6301           (VBLENDPSYrri VR256:$src1,
   6302                         (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
   6303                                        VR128:$src2, sub_xmm), 0xf)>;
   6304 }
   6305 
   6306 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
        /// Matches IntId($src1, $src2, $src3) where all three sources are
        /// explicit operands; used below for the AVX variable blends, whose
        /// last operand is the selection mask. Uses the Ii8Reg format
        /// (presumably the $src3 register is encoded in the imm8 field --
        /// confirm against the format definition).
   6307 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
   6308                                     RegisterClass RC, X86MemOperand x86memop,
   6309                                     PatFrag mem_frag, Intrinsic IntId,
   6310                                     X86FoldableSchedWrite sched> {
   6311   def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
   6312                   (ins RC:$src1, RC:$src2, RC:$src3),
   6313                   !strconcat(OpcodeStr,
   6314                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
   6315                   [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
   6316                   SSEPackedInt>, TAPD, VEX_4V,
   6317                 Sched<[sched]>;
   6318 
        // Memory form: $src2 is folded; the scheduling list spells out read
        // stages for each memory sub-operand and for $src3.
   6319   def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
   6320                   (ins RC:$src1, x86memop:$src2, RC:$src3),
   6321                   !strconcat(OpcodeStr,
   6322                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
   6323                   [(set RC:$dst,
   6324                         (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
   6325                                RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
   6326                 Sched<[sched.Folded, ReadAfterLd,
   6327                        // x86memop:$src2
   6328                        ReadDefault, ReadDefault, ReadDefault, ReadDefault,
   6329                        ReadDefault,
   6330                        // RC::$src3
   6331                        ReadAfterLd]>;
   6332 }
   6333 
        // AVX variable blends: the mask is an ordinary XMM/YMM register operand,
        // unlike the legacy SSE4.1 forms below which implicitly use XMM0.
   6334 let Predicates = [HasAVX] in {
   6335 let ExeDomain = SSEPackedDouble in {
   6336 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
   6337                                            loadv2f64, int_x86_sse41_blendvpd,
   6338                                            SchedWriteFVarBlend.XMM>;
   6339 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
   6340                                   loadv4f64, int_x86_avx_blendv_pd_256,
   6341                                   SchedWriteFVarBlend.YMM>, VEX_L;
   6342 } // ExeDomain = SSEPackedDouble
   6343 let ExeDomain = SSEPackedSingle in {
   6344 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
   6345                                            loadv4f32, int_x86_sse41_blendvps,
   6346                                            SchedWriteFVarBlend.XMM>;
   6347 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
   6348                                   loadv8f32, int_x86_avx_blendv_ps_256,
   6349                                   SchedWriteFVarBlend.YMM>, VEX_L;
   6350 } // ExeDomain = SSEPackedSingle
   6351 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
   6352                                            loadv2i64, int_x86_sse41_pblendvb,
   6353                                            SchedWriteVarBlend.XMM>;
   6354 }
   6355 
        // 256-bit byte blend requires AVX2.
   6356 let Predicates = [HasAVX2] in {
   6357 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
   6358                                       loadv4i64, int_x86_avx2_pblendvb,
   6359                                       SchedWriteVarBlend.YMM>, VEX_L;
   6360 }
   6361 
        // Lower vselect through the AVX variable blends. Note the operand
        // reordering: the pattern's $src2 becomes the instruction's first
        // operand, $src1 the second, and the mask is passed last.
   6362 let Predicates = [HasAVX] in {
   6363   def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
   6364                             (v16i8 VR128:$src2))),
   6365             (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   6366   def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
   6367                             (v4i32 VR128:$src2))),
   6368             (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   6369   def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
   6370                             (v4f32 VR128:$src2))),
   6371             (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   6372   def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
   6373                             (v2i64 VR128:$src2))),
   6374             (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   6375   def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
   6376                             (v2f64 VR128:$src2))),
   6377             (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   6378   def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
   6379                             (v8i32 VR256:$src2))),
   6380             (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   6381   def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
   6382                             (v8f32 VR256:$src2))),
   6383             (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   6384   def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
   6385                             (v4i64 VR256:$src2))),
   6386             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   6387   def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
   6388                             (v4f64 VR256:$src2))),
   6389             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   6390 }
   6391 
   6392 let Predicates = [HasAVX2] in {
   6393   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
   6394                             (v32i8 VR256:$src2))),
   6395             (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   6396 }
   6397 
   6398 // When optimizing for speed, lower movss/movsd-style operations to blends:
   6399 // blends have better throughput on SandyBridge and Haswell. The movs[s/d]
   6400 // encodings are 1-2 bytes shorter, hence these patterns require OptForSpeed.
   6401 let Predicates = [HasAVX, OptForSpeed] in {
        // Zero-extend the low element: blend the source with an all-zero vector.
   6402   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
   6403             (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   6404   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
   6405             (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
   6406 
   6407   def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
   6408             (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
   6409   def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
   6410             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
        // Load in the first source: use the inverted immediate (0xe = ~0x1).
   6411   def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
   6412             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
   6413 
   6414   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
   6415             (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   6416   def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
   6417             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   6418   def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
   6419             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
   6420 
   6421   // Move low f32 and clear high bits.
   6422   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
   6423             (SUBREG_TO_REG (i32 0),
   6424              (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
   6425                           (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
   6426                           (i8 1))), sub_xmm)>;
   6427   def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
   6428             (SUBREG_TO_REG (i32 0),
   6429              (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
   6430                           (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
   6431                           (i8 3))), sub_xmm)>;
   6432 
   6433   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
   6434             (SUBREG_TO_REG (i32 0),
   6435              (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
   6436                           (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
   6437                           (i8 1))), sub_xmm)>;
   6438   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
   6439             (SUBREG_TO_REG (i32 0),
   6440              (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
   6441                           (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
   6442                           (i8 0xf))), sub_xmm)>;
   6443 }
   6444 
   6445 // When optimizing for speed, lower movss/movsd-style operations to blends:
   6446 // blends have better throughput on SandyBridge and Haswell. The movs[s/d]
   6447 // encodings are 1-2 bytes shorter, hence these patterns require OptForSpeed.
   6448 let Predicates = [UseSSE41, OptForSpeed] in {
   6449   // With SSE41 we can use blends for these patterns.
   6450   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
   6451             (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   6452   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
   6453             (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
   6454 
   6455   def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
   6456             (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
   6457   def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
   6458             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
        // Load in the first source: use the inverted immediate (0xe = ~0x1).
   6459   def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
   6460             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
   6461 
   6462   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
   6463             (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   6464   def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
   6465             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   6466   def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
   6467             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
   6468 }
   6469 
   6470 
   6471 /// SS41I_ternary_int - SSE 4.1 ternary operator
        /// Legacy variable-blend forms: the third source is the implicit XMM0
        /// register (Uses = [XMM0]) -- it appears in the asm string but not in
        /// the explicit operand list. Destructive: $src1 is tied to $dst.
   6472 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   6473   multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
   6474                                X86MemOperand x86memop, Intrinsic IntId,
   6475                                X86FoldableSchedWrite sched> {
   6476     def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
   6477                     (ins VR128:$src1, VR128:$src2),
   6478                     !strconcat(OpcodeStr,
   6479                      "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
   6480                     [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
   6481                     Sched<[sched]>;
   6482 
   6483     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
   6484                     (ins VR128:$src1, x86memop:$src2),
   6485                     !strconcat(OpcodeStr,
   6486                      "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
   6487                     [(set VR128:$dst,
   6488                       (IntId VR128:$src1,
   6489                        (bitconvert (mem_frag addr:$src2)), XMM0))]>,
   6490                     Sched<[sched.Folded, ReadAfterLd]>;
   6491   }
   6492 }
   6493 
        // Legacy SSE4.1 variable blends (implicit-XMM0 mask; see
        // SS41I_ternary_int above).
   6494 let ExeDomain = SSEPackedDouble in
   6495 defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
   6496                                   int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
   6497 let ExeDomain = SSEPackedSingle in
   6498 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
   6499                                   int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
   6500 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
   6501                                   int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
   6502 
   6503 // Aliases with the implicit xmm0 argument
        // These accept the two-operand spelling (mask omitted) in assembly;
        // the trailing 0 disables them for printing.
   6504 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
   6505                 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
   6506 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
   6507                 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
   6508 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
   6509                 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
   6510 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
   6511                 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
   6512 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
   6513                 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
   6514 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
   6515                 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
   6516 
        // Lower vselect through the legacy blends when the mask is already in
        // XMM0. As with the AVX patterns above, the pattern's $src2 becomes
        // the instruction's first operand.
   6517 let Predicates = [UseSSE41] in {
   6518   def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
   6519                             (v16i8 VR128:$src2))),
   6520             (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
   6521   def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
   6522                             (v4i32 VR128:$src2))),
   6523             (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
   6524   def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
   6525                             (v4f32 VR128:$src2))),
   6526             (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
   6527   def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
   6528                             (v2i64 VR128:$src2))),
   6529             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
   6530   def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
   6531                             (v2f64 VR128:$src2))),
   6532             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
   6533 }
   6534 
   6535 let AddedComplexity = 400 in { // Prefer non-temporal versions
   6536 
        // MOVNTDQA non-temporal aligned loads. The instruction defs carry no
        // patterns; selection happens through the alignednontemporalload
        // patterns below.
   6537 let Predicates = [HasAVX, NoVLX] in
   6538 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
   6539                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
   6540                         Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
   6541 let Predicates = [HasAVX2, NoVLX] in
   6542 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
   6543                          "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
   6544                          Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
   6545 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
   6546                        "movntdqa\t{$src, $dst|$dst, $src}", []>,
   6547                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
   6548 
   6549 let Predicates = [HasAVX2, NoVLX] in {
   6550   def : Pat<(v8f32 (alignednontemporalload addr:$src)),
   6551             (VMOVNTDQAYrm addr:$src)>;
   6552   def : Pat<(v4f64 (alignednontemporalload addr:$src)),
   6553             (VMOVNTDQAYrm addr:$src)>;
   6554   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
   6555             (VMOVNTDQAYrm addr:$src)>;
   6556 }
   6557 
   6558 let Predicates = [HasAVX, NoVLX] in {
   6559   def : Pat<(v4f32 (alignednontemporalload addr:$src)),
   6560             (VMOVNTDQArm addr:$src)>;
   6561   def : Pat<(v2f64 (alignednontemporalload addr:$src)),
   6562             (VMOVNTDQArm addr:$src)>;
   6563   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
   6564             (VMOVNTDQArm addr:$src)>;
   6565 }
   6566 
   6567 let Predicates = [UseSSE41] in {
   6568   def : Pat<(v4f32 (alignednontemporalload addr:$src)),
   6569             (MOVNTDQArm addr:$src)>;
   6570   def : Pat<(v2f64 (alignednontemporalload addr:$src)),
   6571             (MOVNTDQArm addr:$src)>;
   6572   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
   6573             (MOVNTDQArm addr:$src)>;
   6574 }
   6575 
   6576 } // AddedComplexity
   6577 
   6578 //===----------------------------------------------------------------------===//
   6579 // SSE4.2 - Compare Instructions
   6580 //===----------------------------------------------------------------------===//
   6581 
   6582 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
        /// rr: reg/reg form; rm: reg/mem form. Unlike the SSE4.1 multiclasses
        /// above, the memory pattern applies memop_frag directly (no
        /// bitconvert). Is2Addr defaults to the destructive 2-operand form.
   6583 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   6584                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
   6585                           X86MemOperand x86memop, X86FoldableSchedWrite sched,
   6586                           bit Is2Addr = 1> {
   6587   def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
   6588        (ins RC:$src1, RC:$src2),
   6589        !if(Is2Addr,
   6590            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   6591            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   6592        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
   6593        Sched<[sched]>;
   6594   def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
   6595        (ins RC:$src1, x86memop:$src2),
   6596        !if(Is2Addr,
   6597            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
   6598            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
   6599        [(set RC:$dst,
   6600          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
   6601        Sched<[sched.Folded, ReadAfterLd]>;
   6602 }
   6603 
        // PCMPGTQ: AVX/AVX2 non-destructive forms and the legacy destructive
        // SSE4.2 form (which takes the default Is2Addr = 1).
   6604 let Predicates = [HasAVX] in
   6605   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
   6606                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
   6607                                  VEX_4V, VEX_WIG;
   6608 
   6609 let Predicates = [HasAVX2] in
   6610   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
   6611                                   loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
   6612                                   VEX_4V, VEX_L, VEX_WIG;
   6613 
   6614 let Constraints = "$src1 = $dst" in
   6615   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
   6616                                 memopv2i64, i128mem, SchedWriteVecALU.XMM>;
   6617 
   6618 //===----------------------------------------------------------------------===//
   6619 // SSE4.2 - String/text Processing Instructions
   6620 //===----------------------------------------------------------------------===//
   6621 
        // PCMPISTRM: implicit-length string compare producing a mask. No
        // explicit outs; the results live in the implicit registers supplied by
        // the "let Defs = [XMM0, EFLAGS]" at the instantiation below. Patterns
        // are empty; mayLoad marks the memory form.
   6622 multiclass pcmpistrm_SS42AI<string asm> {
   6623   def rr : SS42AI<0x62, MRMSrcReg, (outs),
   6624     (ins VR128:$src1, VR128:$src2, u8imm:$src3),
   6625     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
   6626     []>, Sched<[WritePCmpIStrM]>;
   6627   let mayLoad = 1 in
   6628   def rm :SS42AI<0x62, MRMSrcMem, (outs),
   6629     (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
   6630     !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
   6631     []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>;
   6632 }
   6633 
   6634 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
   6635   let Predicates = [HasAVX] in
   6636   defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
   6637   defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
   6638 }
   6639 
        // PCMPESTRM: explicit-length string compare producing a mask. Operand
        // names skip $src2/$src4 — apparently reserved for the implicit
        // EAX/EDX operands (see Uses below); confirm against the encoder.
   6640 multiclass SS42AI_pcmpestrm<string asm> {
   6641   def rr : SS42AI<0x60, MRMSrcReg, (outs),
   6642     (ins VR128:$src1, VR128:$src3, u8imm:$src5),
   6643     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
   6644     []>, Sched<[WritePCmpEStrM]>;
   6645   let mayLoad = 1 in
   6646   def rm : SS42AI<0x60, MRMSrcMem, (outs),
   6647     (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
   6648     !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
   6649     []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>;
   6650 }
   6651 
        // Results in the implicit XMM0/EFLAGS; EAX/EDX carry the explicit
        // string lengths.
   6652 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
   6653   let Predicates = [HasAVX] in
   6654   defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
   6655   defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
   6656 }
   6657 
// Packed Compare Implicit Length Strings, Return Index (opcode 0x63).
// Defined with empty patterns; selection is done elsewhere.
multiclass SS42AI_pcmpistri<string asm> {
  // Register-register form.
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  // Register-memory form (folded load).
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>;
}

// The result index is implicitly produced in ECX and EFLAGS are set.
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
   6675 
// Packed Compare Explicit Length Strings, Return Index (opcode 0x61).
// Defined with empty patterns; selection is done elsewhere.
multiclass SS42AI_pcmpestri<string asm> {
  // Register-register form.
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  // Register-memory form (folded load).
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>;
}

// Explicit-length forms read the string lengths from EAX/EDX; the result
// index is implicitly produced in ECX and EFLAGS are set.
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
   6693 
   6694 //===----------------------------------------------------------------------===//
   6695 // SSE4.2 - CRC Instructions
   6696 //===----------------------------------------------------------------------===//
   6697 
   6698 // No CRC instructions have AVX equivalents
   6699 
   6700 // crc intrinsic instruction
   6701 // This set of instructions are only rm, the only difference is the size
   6702 // of r and m.
// CRC32 register-register form: accumulates $src2 into the running CRC in
// $src1 (tied to $dst by the Constraints below).
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
         Sched<[WriteCRC32]>;

// CRC32 register-memory form: same as above with the second operand loaded
// from memory.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
         Sched<[WriteCRC32.Folded, ReadAfterLd]>;
   6716 
// CRC32 instructions, all destructive ($src1 tied to $dst). Opcode 0xF0 is
// the 8-bit source form; 0xF1 handles 16/32/64-bit sources distinguished by
// operand-size/REX.W prefixes.
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit destination with an 8-bit source: no matching intrinsic, so these
  // use null_frag (no pattern) and are assembler/disassembler-only here.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
   6742 
   6743 //===----------------------------------------------------------------------===//
   6744 // SHA-NI Instructions
   6745 //===----------------------------------------------------------------------===//
   6746 
   6747 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// Two-operand SHA instruction pattern. When UsesXMM0 is set, the intrinsic
// takes an extra implicit operand in XMM0 and the asm string spells it out
// explicitly (AT&T syntax shows "%xmm0").
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  // Register-register form.
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8, Sched<[sched]>;

  // Register-memory form: second operand loaded and bitcast to v4i32.
  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
             Sched<[sched.Folded, ReadAfterLd]>;
}
   6772 
// SHA-NI instruction definitions. All are destructive ($src1 tied to $dst)
// and gated on the HasSHA predicate.
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // sha1rnds4 carries an immediate round-function selector ($src3).
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA,
                         Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA,
                         Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

  // sha256rnds2 takes an implicit round-key operand in XMM0 (UsesXMM0 = 1).
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}
   6806 
// Assembler aliases that accept sha256rnds2 written without the explicit
// %xmm0 operand (the 0 marks them as not used for printing).
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
   6812 
   6813 //===----------------------------------------------------------------------===//
   6814 // AES-NI Instructions
   6815 //===----------------------------------------------------------------------===//
   6816 
// Two-operand AES instruction pattern. Is2Addr selects the destructive SSE
// asm form vs. the three-operand VEX form; RC/MemOp allow the 256-bit VAES
// variants to reuse the same definitions.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  // The asm string is set once here so both rr and rm forms share it.
  let AsmString = OpcodeStr##
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
                   Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
                   Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>;
  }
}
   6834 
// Perform One Round of an AES Encryption/Decryption Flow
// 128-bit VEX-encoded forms.
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
}

// 256-bit VAES forms.
let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
}

// Legacy SSE forms: destructive two-address encoding (Is2Addr = 1).
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc, memopv2i64, 1>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast, memopv2i64, 1>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec, memopv2i64, 1>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast, memopv2i64, 1>;
}
   6872 
// Perform the AES InvMixColumn Transformation (opcode 0xDB).
// Unary: single source, no tied operands.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX, VEX_WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
// Legacy SSE forms (memory form uses aligned memop rather than load).
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMC.Folded]>;
   6897 
// AES Round Key Generation Assist (opcode 0xDF): unary with an immediate
// round-constant operand ($src2).
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
// Legacy SSE forms (memory form uses aligned memop rather than load).
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGen.Folded]>;
   6925 
   6926 //===----------------------------------------------------------------------===//
   6927 // PCLMUL Instructions
   6928 //===----------------------------------------------------------------------===//
   6929 
// Immediate transform to help with commuting.
// Swaps the two nibbles of the 8-bit immediate: each nibble selects which
// quadword of one source to multiply, so exchanging them matches exchanging
// the two source operands.
def PCLMULCommuteImm : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;
   6935 
// SSE carry-less Multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  // Destructive two-address encoding ($src1 tied to $dst).
  let Constraints = "$src1 = $dst" in {
    let isCommutable = 1 in
    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
                Sched<[WriteCLMul]>;

    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
                  imm:$src3))]>,
              Sched<[WriteCLMul.Folded, ReadAfterLd]>;
  } // Constraints = "$src1 = $dst"

  // Commute a load in the first operand by swapping the sources and
  // nibble-swapping the immediate (see PCLMULCommuteImm above).
  def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
                                (i8 imm:$src3)),
            (PCLMULQDQrm VR128:$src1, addr:$src2,
                          (PCLMULCommuteImm imm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]

// SSE aliases
// pclmul{h,l}q{h,l}qdq mnemonics encode the quadword selection in the
// immediate: bit 4 from LO, bit 0 from HI.
foreach HI = ["hq","lq"] in
foreach LO = ["hq","lq"] in {
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrr VR128:$dst, VR128:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
}
   6972 
// AVX carry-less Multiplication instructions
// Parameterized over register class / memory operand so the same definitions
// serve the 128-bit and 256-bit (VPCLMULQDQ) forms.
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  let isCommutable = 1 in
  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
              (IntId RC:$src1, RC:$src2, imm:$src3))]>,
            Sched<[WriteCLMul]>;

  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
               (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
            Sched<[WriteCLMul.Folded, ReadAfterLd]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
                                           (PCLMULCommuteImm imm:$src3))>;
}

// 128-bit VEX-encoded form.
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;

// 256-bit form, gated on the VPCLMULQDQ feature.
let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
   7005 
// Assembler aliases for the vpclmul{h,l}q{h,l}qdq mnemonics: the quadword
// selection is folded into the immediate (bit 4 from Lo, bit 0 from Hi).
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}

// Instantiate the aliases for all four hq/lq combinations.
multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
}

// AVX aliases
defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
   7027 
   7028 //===----------------------------------------------------------------------===//
   7029 // SSE4A Instructions
   7030 //===----------------------------------------------------------------------===//
   7031 
let Predicates = [HasSSE4A] in {

let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
// EXTRQ: extract a bit field. The immediate form (0x78) takes explicit
// length/index bytes; the register form (0x79) reads them from $mask.
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>,
                 PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>,
              PD, Sched<[SchedWriteVecALU.XMM]>;

// INSERTQ: insert a bit field; same immediate/register split as EXTRQ.
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                      imm:$len, imm:$idx))]>,
                   XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>,
                 XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt

// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", []>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW

// Select the scalar non-temporal stores for FR32/FR64 values by copying
// them into the VR128 class the instructions expect.
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A
   7082 
   7083 //===----------------------------------------------------------------------===//
   7084 // AVX Instructions
   7085 //===----------------------------------------------------------------------===//
   7086 
   7087 //===----------------------------------------------------------------------===//
   7088 // VBROADCAST - Load from memory and broadcast to all elements of the
   7089 //              destination operand
   7090 //
// Memory form: load one scalar and broadcast it to all elements of $dst.
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX;

// AVX2 adds register forms
// Register form: broadcast the low element of an XMM register.
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;
   7106 
// vbroadcastss from memory: 128-bit (v4f32) and 256-bit (v8f32) forms.
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, loadf32,
                                         SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, loadf32,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
// vbroadcastsd from memory only exists in the 256-bit form.
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                        v4f64, loadf64,
                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;

// AVX2 register-source broadcast forms.
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;

// Match broadcasts of a scalar load that was wrapped in scalar_to_vector.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (VBROADCASTSSrm addr:$src)>;
  def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (VBROADCASTSSYrm addr:$src)>;
  def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (VBROADCASTSDYrm addr:$src)>;
}
   7137 }
   7138 
   7139 //===----------------------------------------------------------------------===//
   7140 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
   7141 //                  halves of a 256-bit vector.
   7142 //
// vbroadcasti128 (AVX2): load 128 bits and replicate into both halves of a
// YMM register. Defined without a pattern; matched via the Pats below.
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteShuffleLd]>, VEX, VEX_L;

// vbroadcastf128 (AVX): floating-point counterpart, also pattern-less here.
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

// Integer subvector broadcasts select the AVX2 instruction.
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
          (VBROADCASTI128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
          (VBROADCASTI128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
          (VBROADCASTI128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
          (VBROADCASTI128 addr:$src)>;
}

// Floating-point subvector broadcasts.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
}

// Without AVX2, integer subvector broadcasts fall back to vbroadcastf128.
let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
          (VBROADCASTF128 addr:$src)>;
}
   7184 
   7185 //===----------------------------------------------------------------------===//
   7186 // VINSERTF128 - Insert packed floating-point values
   7187 //
// vinsertf128: insert a 128-bit value into half of a YMM register, selected
// by the immediate. Defined without patterns; matched via vinsert_lowering.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}

// To create a 256-bit all ones value, we should produce VCMPTRUEPS
// with YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}

// Lower vinsert128_insert nodes to the rr/rm forms of the given instruction,
// converting the insertion position to the instruction immediate.
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
                            PatFrag memop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                   (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (bitconvert (memop_frag addr:$src2))),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
}

// Floating-point inserts always use VINSERTF128.
let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
}

// Without AVX2, integer inserts also fall back to VINSERTF128.
let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
}
   7231 
//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
// The instruction definitions carry no ISel patterns ([]); selection is done
// through the vextract_lowering patterns below.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
// Extract-and-store-to-memory form.
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}
   7246 
// vextract_lowering - Emit selection patterns that lower a 128-bit subvector
// extraction from a 256-bit vector (vextract128_extract) to the instruction
// named InstrStr: a register form ("rr") and a store-to-memory form ("mr").
// The extraction index is converted to the instruction immediate via
// EXTRACT_get_vextract128_imm.
multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
   7257 
// AVX1 patterns
// Floating-point extracts select VEXTRACTF128 when AVX-512VL is unavailable.
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

// Integer extracts use VEXTRACTF128 only on AVX1-only targets.
let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
}
   7270 
//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
// avx_movmask_rm - Define the four forms of a masked move: 128-bit load
// ("rm"), 256-bit load ("Yrm"), 128-bit store ("mr") and 256-bit store
// ("Ymr").  Loads use opcode opc_rm, stores opc_mr; each form selects the
// corresponding intrinsic.  $src1 is the mask register operand.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  // 128-bit masked load.
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[WriteFMaskedLoad]>;
  // 256-bit masked load.
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
  // 128-bit masked store: $src1 is the mask, $src2 the data.
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[WriteFMaskedStore]>;
  // 256-bit masked store.
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
}
   7298 
// vmaskmovps: load opcode 0x2C, store opcode 0x2E.
let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
// vmaskmovpd: load opcode 0x2D, store opcode 0x2F.
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;
   7311 
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

// avx_permil - Define both flavours of VPERMILPS/VPERMILPD:
//   rr/rm (opcode opc_rm):  variable permute with the control vector in a
//                           register or memory (X86VPermilpv).
//   ri/mi (opcode opc_rmi): immediate-controlled permute (X86VPermilpi);
//                           the mi form folds a load of the data operand.
// Note the variable forms are VEX_4V (two register sources) while the
// immediate forms are plain VEX.
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag i_frag,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
               Sched<[varsched]>;
    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop_i:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
               Sched<[varsched.Folded, ReadAfterLd]>;

    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
             Sched<[sched]>;
    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
             Sched<[sched.Folded]>;
  }// Predicates = [HasAVX, NoVLX]
}
   7348 
// Instantiate VPERMILPS (opcodes 0x0C variable / 0x04 immediate) and
// VPERMILPD (0x0D / 0x05) for both 128-bit and 256-bit register classes.
let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
   7365 
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
// Commutable: swapped operands are handled via the Perm2XCommuteImm
// immediate transform defined below.
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256Ld, ReadAfterLd]>;
}
   7385 
// Immediate transform to help with commuting.
// XORing the immediate with 0x22 flips the source-select bit of each 128-bit
// lane selector, compensating for swapping the two vector operands.
def Perm2XCommuteImm : SDNodeXForm<imm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
   7390 
let Predicates = [HasAVX] in {
// Pattern with load in other operand.  Commute the operands so the load can
// be folded into the rm form, rewriting the immediate with Perm2XCommuteImm.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}

// On AVX1-only targets, the integer (v4i64) form of the permute is also
// selected to the floating-point VPERM2F128 instructions.
let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}
   7409 
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instruction do not affect the YMM16-YMM31.
//

let SchedRW = [WriteSystem] in {
// The Defs list deliberately covers only YMM0-YMM15, matching the note above.
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
                  Requires<[HasAVX]>, VEX_WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
                     Requires<[HasAVX]>, VEX_WIG;
} // Defs
} // SchedRW
   7429 
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

// f16c_ph2ps - vcvtph2ps: convert packed half-precision values to single
// precision (X86cvtph2ps).  The source is always a 128-bit register or
// memory operand; RC selects the XMM or YMM destination.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
             T8PD, VEX, Sched<[sched]>;
  // Load-folding form; matches a v2i64 load bitcast to v8i16.
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
                                          (loadv2i64 addr:$src))))]>,
             T8PD, VEX, Sched<[sched.Folded]>;
}
   7447 
// f16c_ps2ph - vcvtps2ph: convert packed single-precision values to half
// precision (X86cvtps2ph).  $src2 is the rounding-control immediate.  The
// store ("mr") form carries no pattern; it is matched by the explicit store
// patterns below.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
               TAPD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX, Sched<[MR]>;
}
   7461 
let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (VCVTPH2PSrm addr:$src)>;

  // Fold a store of the low 64 bits of a vcvtps2ph result (extracted either
  // as f64 or i64 at index 0) into the store form of the instruction.
  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
  // The 256-bit source form stores a full 128-bit (v8i16) result.
  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
}
   7490 
// Patterns for  matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C, NoVLX] in {
  // Use MXCSR.RC for rounding instead of explicitly specifying the default
  // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
  // configurations we support (the default). However, falling back to MXCSR is
  // more consistent with other instructions, which are always controlled by it.
  // It's encoded as 0b100 (the immediate 4 passed to VCVTPS2PHrr below).
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;

  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
              (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;

  // Round-trip f32 -> f16 -> f32 collapses to a convert-down/convert-up pair
  // kept entirely in vector registers.
  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
             (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
}
   7510 
//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate.
/// Defines the register ("rri") and load-folding ("rmi") forms, plus a
/// pattern that commutes the operands (rewriting the immediate through
/// commuteXForm) when the load appears in the first source.
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
        Sched<[sched.Folded, ReadAfterLd]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
                          RC:$src1, imm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm imm:$src3))>;
}
   7542 
// vpblendd (opcode 0x02): dword blends for 128-bit and 256-bit vectors.
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
                                BlendCommuteImm8>, VEX_L;
   7549 
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// Immediate 0xf selects the four low dwords from the second operand; the
// 128-bit source is first widened to 256 bits via INSERT_SUBREG.
let Predicates = [HasAVX2] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VPBLENDDYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
}

// AVX1-only targets lack VPBLENDD; use the float blend VBLENDPSY instead.
let Predicates = [HasAVX1Only] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
}
   7589 
//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
//               destination operand
//
// avx2_broadcast - Define register and memory broadcast forms for both the
// XMM (rr/rm) and YMM (Yrr/Yrm) destinations, plus an alias pattern that
// handles a broadcast from a YMM source by extracting its low 128 bits.
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag ld_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                   Sched<[WriteShuffle256]>, VEX, VEX_L;
    // NOTE(review): Yrm is scheduled as SchedWriteShuffle.XMM.Folded while
    // Yrr uses WriteShuffle256 -- confirm this asymmetry is intentional.
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
  }
}
   7626 
// Byte/word broadcasts are gated on NoVLX_Or_NoBWI (the BWI forms take over
// otherwise); dword/qword broadcasts are gated on NoVLX.
defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
                                    v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                    v2i64, v4i64, NoVLX>;
   7635 
// Fold various scalar-load shapes into the memory forms of VPBROADCAST*.
let Predicates = [HasAVX2, NoVLX] in {
  // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
            (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
            (VPBROADCASTQYrm addr:$src)>;

  // Scalar loads inserted via scalar_to_vector.
  def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VPBROADCASTDrm addr:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VPBROADCASTDYrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VPBROADCASTQYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
}
   7666 
let Predicates = [HasAVX2, NoVLX] in {
  // Provide aliases for broadcast from the same register class that
  // automatically does the extract.
  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))>;
}

let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}
   7688 
// Broadcast from a GPR: the value is moved into an XMM register first
// (GR8/GR16 are inserted into an i32 super-register before the copy), then
// the register form of VPBROADCAST* is used.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR8:$src, sub_8bit)),
                         VR128)))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)),
                          VR128)))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR16:$src, sub_16bit)),
                         VR128)))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR16:$src, sub_16bit)),
                          VR128)))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
}
   7722 
// AVX1 broadcast patterns
// Without AVX2, integer broadcasts from memory reuse the float broadcast
// instructions (same bit pattern, different domain).
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;
}

  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128bit broadcasts:
  // v2f64 broadcasts are a simple vmovddup of the (low) element.
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}

// Register-source broadcasts on AVX1-only targets: splat within 128 bits
// (vpermilps/vpshufd/vmovddup), then build the 256-bit value with
// vinsertf128 where needed.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
  // 0x44 replicates dwords {1,0} -> splats the low qword as two i64 copies.
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}
   7776 
   7777 //===----------------------------------------------------------------------===//
   7778 // VPERM - Permute instructions
   7779 //
   7780 
/// avx2_perm - AVX2 variable cross-lane permute (vpermd/vpermps), 256-bit
/// only. $src1 is the permute control, $src2 the data (register or memory),
/// matching the X86VPermv node's operand order. Guarded by NoVLX so the
/// EVEX-encoded AVX-512VL forms take priority when available.
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    // Register-register form.
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX_4V, VEX_L;
    // Register-memory form: the data operand is loaded and bitconverted
    // from mem_frag's type to OpVT.
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (bitconvert (mem_frag addr:$src2)))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
  }
}
   7802 
// Integer and single-precision variable permutes share the same multiclass;
// VPERMPS is placed in the FP single domain to avoid domain-crossing stalls.
defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
                        i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
                        f256mem>;
   7808 
/// avx2_perm_imm - AVX2 immediate-controlled 64-bit-element permute
/// (vpermq/vpermpd), 256-bit only: the 8-bit immediate selects the qword
/// arrangement via the X86VPermi node. Guarded by NoVLX so the EVEX forms
/// win when AVX-512VL is available.
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    // Register source + immediate.
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    // Memory source + immediate.
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 imm:$src2))))]>,
                       Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
  }
}
   7830 
// Immediate qword permutes; VEX_W selects the 64-bit-element encoding, and
// VPERMPD is placed in the FP double domain.
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;
   7836 
   7837 //===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
   7839 //
// vperm2i128: select any two 128-bit halves of the two sources via $src3.
// Marked commutable — the sources may be swapped if the immediate is
// adjusted accordingly (see the Perm2XCommuteImm pattern below).
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
// Memory form. NOTE(review): the operand is declared f256mem although the
// pattern loads v4i64 — presumably kept in sync with the VPERM2F128 twin;
// the operand type only affects assembly parsing/printing here.
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
   7853 
// Fold a load appearing as the FIRST operand: commute the sources so the
// load lands in the foldable $src2 slot, rewriting the immediate with
// Perm2XCommuteImm to preserve the selected halves.
let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
   7858 
   7859 
   7860 //===----------------------------------------------------------------------===//
   7861 // VINSERTI128 - Insert packed integer values
   7862 //
// vinserti128: insert a 128-bit value into the half of a ymm selected by
// $src3. No ISel patterns here (hasSideEffects = 0, empty pattern lists);
// lowering is supplied by the vinsert_lowering defms below.
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
   7874 
// Subvector-insert lowering patterns for every 128-bit integer element
// type; all memory forms load via loadv2i64 and bitconvert.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
}
   7881 
   7882 //===----------------------------------------------------------------------===//
   7883 // VEXTRACTI128 - Extract packed integer values
   7884 //
// vextracti128: extract the 128-bit half of a ymm selected by $src2 into an
// xmm register or to memory. Empty pattern lists; lowering comes from the
// vextract_lowering defms below.
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
   7894 
// Subvector-extract lowering patterns for every 128-bit integer element
// type.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
   7901 
   7902 //===----------------------------------------------------------------------===//
   7903 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
   7904 //
/// avx2_pmovmask - vpmaskmovd/q conditional loads (opcode 0x8c) and stores
/// (opcode 0x8e) in 128- and 256-bit widths, matched directly against the
/// corresponding x86 intrinsics. For loads, $src1 is the element mask; for
/// stores, $src1 is the mask and $src2 the data.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  // 128-bit masked load.
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[WriteVecMaskedLoad]>;
  // 256-bit masked load.
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
  // 128-bit masked store.
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[WriteVecMaskedStore]>;
  // 256-bit masked store.
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}
   7929 
// Dword and qword variants of the masked move; VEX_W selects the 64-bit
// element form.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
   7940 
/// maskmov_lowering - map the generic masked load/store DAG nodes onto a
/// VMASKMOV/VPMASKMOV instruction pair (InstrStr#"rm"/"mr"). Pass-through
/// masked loads with undef or all-zero fall-back values become a plain
/// masked load (elements with a clear mask bit are already zeroed by the
/// instruction); any other pass-through value requires a blend afterwards.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
    // masked store
    def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
    // masked load with undef pass-through
    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    // masked load with all-zero pass-through — also just the masked load
    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
                              (VT (bitconvert (ZeroVT immAllZerosV))))),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    // masked load with a general pass-through: load, then variable-blend
    // the loaded elements over $src0 under the same mask
    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
             (!cast<Instruction>(BlendStr#"rr")
                 RC:$src0,
                 (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
                 RC:$mask)>;
}
// FP masked load/store lowering via VMASKMOVPS/PD (AVX and up).
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
let Predicates = [HasAVX1Only] in {
  // AVX1 has no integer masked load/store; reuse the ps/pd instructions for
  // i32/i64 elements.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
// AVX2 adds the native integer forms VPMASKMOVD/Q.
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
   7977 
   7978 //===----------------------------------------------------------------------===//
   7979 // SubVector Broadcasts
   7980 // Provide fallback in case the load node that is used in the patterns above
   7981 // is used by additional users, which prevents the pattern selection.
   7982 
// Register-source fallback for X86SubVBroadcast: place the xmm in the low
// half of an undef 256-bit register via INSERT_SUBREG and insert the same
// xmm into the high half (index 1). Integer types use VINSERTI128 on AVX2,
// FP types VINSERTF128 on AVX; with AVX1 only, integer types fall back to
// VINSERTF128 as well.
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2f64 VR128:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4f32 VR128:$src), 1)>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}
   8021 
   8022 //===----------------------------------------------------------------------===//
   8023 // Variable Bit Shifts
   8024 //
/// avx2_var_shift - AVX2 per-element variable shifts (vpsllv/vpsrlv/vpsrav)
/// in 128- and 256-bit widths, with register and memory forms of the
/// per-element shift-count operand ($src2).
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit register-register.
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  // 128-bit with loaded shift counts.
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>;
  // 256-bit register-register.
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  // 256-bit with loaded shift counts.
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>;
}
   8054 
// Variable shift instantiations; VEX_W selects the 64-bit-element forms.
// There is no vpsravq in AVX2, so only the dword arithmetic shift exists.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;

  // Extra patterns mapping the X86-specific X86vsrav node (used in addition
  // to the generic sra) onto VPSRAVD.
  def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86vsrav VR128:$src1,
                    (bitconvert (loadv2i64 addr:$src2)))),
            (VPSRAVDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1,
                    (bitconvert (loadv4i64 addr:$src2)))),
            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
}
   8073 
   8074 //===----------------------------------------------------------------------===//
   8075 // VGATHER - GATHER Operations
   8076 
   8077 // FIXME: Improve scheduling of gather instructions.
// FIXME: Improve scheduling of gather instructions.
/// avx2_gather - AVX2 gather: each instruction produces both the gathered
/// data ($dst) and the written-back mask ($mask_wb); the memory operand is
/// a vector-indexed address matched by vectoraddr. The "128-bit" form is
/// always VR128; the larger form's register class (RC256) is a parameter
/// because qword-index/dword-data gathers produce only an xmm result.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                  (GatherNode128 VR128:$src1, VR128:$mask,
                                vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                  (GatherNode256 RC256:$src1, RC256:$mask,
                                vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}
   8100 
// Gather instantiations. Both outputs are earlyclobber (the hardware may
// write them before all sources are consumed) and are tied to the incoming
// pass-through value ($src1 = $dst) and mask ($mask = $mask_wb).
let Predicates = [UseAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                        mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                        mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                        mgatherv8i32, VR256, vx128mem, vy256mem>;
    // qword-index/dword-data: the wide form still only fills an xmm (VR128).
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                        mgatherv4i64, VR128, vx64mem, vy128mem>;

    // FP gathers reuse the same multiclass with integer mask types (MTx/MTy).
    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                          mgatherv4i32, VR256, vx128mem, vx256mem,
                          v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                          mgatherv4i64, VR256, vx128mem, vy256mem,
                          v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                          mgatherv8i32, VR256, vx128mem, vy256mem,
                          v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                          mgatherv4i64, VR128, vx64mem, vy128mem,
                          v4i32, v4i32>;
    }
  }
}
   8133 
   8134 //===----------------------------------------------------------------------===//
   8135 // Extra selection patterns for f128, f128mem
   8136 
   8137 // movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(alignedstore (f128 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;
def : Pat<(store (f128 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>;

// f128 loads likewise use movaps (aligned) / movups (unaligned).
def : Pat<(alignedloadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>;
def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>;

// Bitwise logic on f128 values: lowered to the packed-single instructions,
// with COPY_TO_REGCLASS used to move between the f128 view and VR128.
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                   (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))),
          (COPY_TO_REGCLASS
           (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2),
           VR128)>;

def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
          (COPY_TO_REGCLASS
           (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128),
                    (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>;
   8178 
   8179 //===----------------------------------------------------------------------===//
   8180 // GFNI instructions
   8181 //===----------------------------------------------------------------------===//
   8182 
/// GF2P8MULB_rm - GF(2^8) byte multiply. Is2Addr selects between the legacy
/// two-operand SSE assembly string and the three-operand VEX string; the
/// string is set via AsmString since the def bodies pass "".
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    // GF(2^8) multiply is commutative, so the register form can be swapped.
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                 (bitconvert (MemOpFrag addr:$src2)))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD;
  }
}
   8201 
/// GF2P8AFFINE_rmi - GF(2^8) affine transform with an 8-bit immediate
/// ($src3). As with GF2P8MULB_rm, Is2Addr picks the legacy vs. VEX assembly
/// string, supplied through AsmString.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
  // Register source + immediate.
  def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, RC:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
              SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
  // Memory source + immediate.
  def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
              (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                    (bitconvert (MemOpFrag addr:$src2)),
                              imm:$src3)))], SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>;
  }
}
   8220 
/// GF2P8AFFINE_common - instantiate one affine opcode in all three
/// encodings: legacy SSE two-address 128-bit, VEX 128-bit, and VEX 256-bit.
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  // Legacy SSE form: destructive two-address encoding.
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, loadv2i64, i128mem, 1>;
  // VEX forms, prefixed "v"; VEX_W is required by the GFNI affine encoding.
  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}
   8233 
   8234 // GF2P8MULB
// GF2P8MULB instantiations: legacy SSE two-address form plus VEX 128/256.
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
                                    i128mem, 1>;
let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
                                   i128mem>, VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
                                   i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB — the matrix operand is not
// interchangeable with the data operand, hence isCommutable = 0.
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
   8252 
   8253